# Task 4: Machine Learning & Statistical Modeling

## Objectives:
1. For each zipcode, fit a linear regression model that predicts total claims
2. Develop an ML model that predicts optimal premium values
3. Report on important features

In [1]:
# Import libraries
# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn: splitting/validation, preprocessing, regressors and metrics.
# NOTE(review): none of these are used in the cells shown right after this one —
# presumably they are needed by the modeling cells further down; confirm before pruning.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
# Installs a blanket 'ignore' filter, so every warning category is silenced from here on.
# NOTE(review): this also hides pandas/scikit-learn deprecation and convergence
# warnings; consider restricting the filter to specific categories.
warnings.filterwarnings('ignore')

# Simple confirmation that the cell ran to completion.
print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
# Cell 2: Load and prepare data for machine learning
#
# Produces two frames:
#   df    - the file as loaded, with the key monetary columns coerced to numeric
#   df_ml - the rows of df whose TotalPremium is strictly positive (independent copy)
print("Loading and preparing data for machine learning...")

# Tunable inputs, named once so the code and the printed labels cannot drift apart.
DATA_PATH = '../data/raw/insurance_data.txt'    # pipe-delimited text file
NUMERIC_COLUMNS = ['TotalPremium', 'TotalClaims', 'SumInsured']
MIN_POLICIES_PER_ZIPCODE = 50                   # threshold used in the zipcode summary below

# Load the data. low_memory=False parses the file in one pass so that column
# dtypes are inferred consistently instead of chunk by chunk.
df = pd.read_csv(DATA_PATH, delimiter='|', low_memory=False)

# Convert key columns to numeric; values that cannot be parsed become NaN
# (errors='coerce') rather than raising.
for col in NUMERIC_COLUMNS:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep only rows with a strictly positive premium. Besides zero premiums this
# also drops negative premiums and rows whose premium is NaN (NaN > 0 is False).
df_ml = df[df['TotalPremium'] > 0].copy()

# Basic info
n_rows, n_cols = df_ml.shape
print(f"Dataset shape: {df_ml.shape}")
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")
print()

# Check target variables (Series.mean skips NaN by default)
print("Target variables:")
for col in NUMERIC_COLUMNS:
    print(f"{col} mean: R{df_ml[col].mean():.2f}")
print()

# Check unique zipcodes; skipped entirely when the file has no PostalCode column
if 'PostalCode' in df_ml.columns:
    policies_per_zipcode = df_ml['PostalCode'].value_counts()
    n_large_zipcodes = (policies_per_zipcode >= MIN_POLICIES_PER_ZIPCODE).sum()
    print(f"Unique zipcodes: {df_ml['PostalCode'].nunique():,}")
    print(f"Zipcodes with at least {MIN_POLICIES_PER_ZIPCODE} policies: {n_large_zipcodes:,}")

Loading and preparing data for machine learning...
Dataset shape: (618176, 52)
Rows: 618,176
Columns: 52

Target variables:
TotalPremium mean: R100.20
TotalClaims mean: R100.41
SumInsured mean: R609826.86

Unique zipcodes: 858
Zipcodes with at least 50 policies: 746


In [5]:
# Cell 3: Prepare features for modeling
#
# Lists the candidate feature columns, keeps those actually present in df_ml,
# and reports how many values each of them is missing.
print("=" * 60, "PREPARING FEATURES FOR MACHINE LEARNING", "=" * 60, sep="\n")

# Candidate features, grouped by theme.
potential_features = [
    # Policy/Client features
    'SumInsured',
    'ExcessSelected',  # Might be categorical
    'CoverType',
    'Product',
    'TermFrequency',

    # Client demographics
    'Province',
    'PostalCode',
    'Gender',
    'MaritalStatus',
    'AccountType',

    # Vehicle features
    'VehicleType',
    'make',
    'Model',
    'RegistrationYear',
    'Cylinders',
    'cubiccapacity',
    'bodytype',
    'NumberOfDoors',

    # Risk features
    'AlarmImmobiliser',
    'TrackingDevice',
    'NewVehicle',
]

# Keep only the candidates that exist as columns, preserving the order above.
columns_present = set(df_ml.columns)
available_features = [name for name in potential_features if name in columns_present]

print(f"Available features: {len(available_features)}")
print("Features available for modeling:")
for position, name in enumerate(available_features, start=1):
    print(f"  {position:2d}. {name}")
print()

# Null count per available feature; only features with at least one null are
# kept in missing_counts (insertion order follows available_features).
null_totals = {name: df_ml[name].isnull().sum() for name in available_features}
missing_counts = {name: count for name, count in null_totals.items() if count > 0}
n_rows = len(df_ml)

print("Missing values in selected features:")
if missing_counts:
    for name, count in missing_counts.items():
        # Absolute count plus share of all rows in df_ml.
        print(f"  {name:25} {count:8,} ({count/n_rows*100:.1f}%)")
else:
    print("  No missing values in selected features!")

PREPARING FEATURES FOR MACHINE LEARNING
Available features: 21
Features available for modeling:
   1. SumInsured
   2. ExcessSelected
   3. CoverType
   4. Product
   5. TermFrequency
   6. Province
   7. PostalCode
   8. Gender
   9. MaritalStatus
  10. AccountType
  11. VehicleType
  12. make
  13. Model
  14. RegistrationYear
  15. Cylinders
  16. cubiccapacity
  17. bodytype
  18. NumberOfDoors
  19. AlarmImmobiliser
  20. TrackingDevice
  21. NewVehicle

Missing values in selected features:
  Gender                       4,621 (0.7%)
  MaritalStatus                5,071 (0.8%)
  AccountType                 30,734 (5.0%)
  VehicleType                    218 (0.0%)
  make                           218 (0.0%)
  Model                          218 (0.0%)
  Cylinders                      218 (0.0%)
  cubiccapacity                  218 (0.0%)
  bodytype                       218 (0.0%)
  NumberOfDoors                  218 (0.0%)
  NewVehicle                  60,634 (9.8%)


In [6]:
# Cell 4: Create final modeling dataset
#
# Copies df_ml into df_model, fixes the feature list and the two target column
# names, then prints the dtype (and, for object columns, the number of distinct
# values) of every selected feature.
print("=" * 60, "CREATING FINAL MODELING DATASET", "=" * 60, sep="\n")

# Work on an independent copy so later changes do not touch df_ml.
df_model = df_ml.copy()

# Target variables
target_claims, target_premium = 'TotalClaims', 'TotalPremium'

# 1. Select features (start with a manageable set)
selected_features = [
    'SumInsured',           # Insured amount (author's premise: higher sum insured = higher risk)
    'Province',             # Geographic risk
    'VehicleType',          # Type of vehicle
    'CoverType',            # Type of coverage
    'TermFrequency',        # Payment frequency
    'AlarmImmobiliser',     # Security feature
]

# Zipcode is added for grouping, but only when the column exists.
selected_features += ['PostalCode'] if 'PostalCode' in df_model.columns else []

print(
    f"Selected features: {selected_features}",
    f"Target 1: {target_claims} (for claims prediction)",
    f"Target 2: {target_premium} (for premium prediction)",
    "",
    sep="\n",
)

# Check data types
# NOTE(review): the distinct-value count is only computed for object columns, so
# PostalCode (int64 in the recorded output) shows 'N/A' even though it is used
# as a grouping key — confirm whether it should be treated as categorical.
print("Data types of selected features:")
for name in selected_features:
    if name not in df_model.columns:
        continue
    column = df_model[name]
    kind = column.dtype
    if kind == 'object':
        n_unique = column.nunique()
    else:
        n_unique = 'N/A'
    print(f"  {name:20} {str(kind):10} Unique: {n_unique}")

CREATING FINAL MODELING DATASET
Selected features: ['SumInsured', 'Province', 'VehicleType', 'CoverType', 'TermFrequency', 'AlarmImmobiliser', 'PostalCode']
Target 1: TotalClaims (for claims prediction)
Target 2: TotalPremium (for premium prediction)

Data types of selected features:
  SumInsured           float64    Unique: N/A
  Province             object     Unique: 9
  VehicleType          object     Unique: 5
  CoverType            object     Unique: 21
  TermFrequency        object     Unique: 2
  AlarmImmobiliser     object     Unique: 2
  PostalCode           int64      Unique: N/A
