# Task 4: Machine Learning & Statistical Modeling

## Objectives:
1. For each zipcode, fit a linear regression model that predicts total claims
2. Develop ML model that predicts optimal premium values
3. Report on important features

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
# Cell 2: Load and prepare data for machine learning
print("Loading and preparing data for machine learning...")

# Load the data
df = pd.read_csv('../data/raw/insurance_data.txt', delimiter='|', low_memory=False)

# Convert key columns to numeric
df['TotalPremium'] = pd.to_numeric(df['TotalPremium'], errors='coerce')
df['TotalClaims'] = pd.to_numeric(df['TotalClaims'], errors='coerce')
df['SumInsured'] = pd.to_numeric(df['SumInsured'], errors='coerce')

# Filter out zero premiums and create working dataset
df_ml = df[df['TotalPremium'] > 0].copy()

# Basic info
print(f"Dataset shape: {df_ml.shape}")
print(f"Rows: {df_ml.shape[0]:,}")
print(f"Columns: {df_ml.shape[1]}")
print()

# Check target variables
print("Target variables:")
print(f"TotalPremium mean: R{df_ml['TotalPremium'].mean():.2f}")
print(f"TotalClaims mean: R{df_ml['TotalClaims'].mean():.2f}")
print(f"SumInsured mean: R{df_ml['SumInsured'].mean():.2f}")
print()

# Check unique zipcodes
if 'PostalCode' in df_ml.columns:
    print(f"Unique zipcodes: {df_ml['PostalCode'].nunique():,}")
    print(f"Zipcodes with at least 50 policies: {(df_ml['PostalCode'].value_counts() >= 50).sum():,}")

Loading and preparing data for machine learning...
Dataset shape: (618176, 52)
Rows: 618,176
Columns: 52

Target variables:
TotalPremium mean: R100.20
TotalClaims mean: R100.41
SumInsured mean: R609826.86

Unique zipcodes: 858
Zipcodes with at least 50 policies: 746


In [5]:
# Cell 3: Prepare features for modeling
print("=" * 60)
print("PREPARING FEATURES FOR MACHINE LEARNING")
print("=" * 60)

# Select features for modeling
# Based on domain knowledge and data availability
potential_features = [
    # Policy/Client features
    'SumInsured',
    'ExcessSelected',  # Might be categorical
    'CoverType',
    'Product',
    'TermFrequency',
    
    # Client demographics
    'Province',
    'PostalCode',
    'Gender',
    'MaritalStatus',
    'AccountType',
    
    # Vehicle features
    'VehicleType',
    'make',
    'Model',
    'RegistrationYear',
    'Cylinders',
    'cubiccapacity',
    'bodytype',
    'NumberOfDoors',
    
    # Risk features
    'AlarmImmobiliser',
    'TrackingDevice',
    'NewVehicle',
]

# Check which features are available
available_features = [f for f in potential_features if f in df_ml.columns]
print(f"Available features: {len(available_features)}")
print("Features available for modeling:")
for i, feat in enumerate(available_features, 1):
    print(f"  {i:2d}. {feat}")
print()

# Check for missing values in these features
print("Missing values in selected features:")
missing_counts = {}
for feat in available_features:
    missing = df_ml[feat].isnull().sum()
    if missing > 0:
        missing_counts[feat] = missing
        print(f"  {feat:25} {missing:8,} ({missing/len(df_ml)*100:.1f}%)")

if not missing_counts:
    print("  No missing values in selected features!")

PREPARING FEATURES FOR MACHINE LEARNING
Available features: 21
Features available for modeling:
   1. SumInsured
   2. ExcessSelected
   3. CoverType
   4. Product
   5. TermFrequency
   6. Province
   7. PostalCode
   8. Gender
   9. MaritalStatus
  10. AccountType
  11. VehicleType
  12. make
  13. Model
  14. RegistrationYear
  15. Cylinders
  16. cubiccapacity
  17. bodytype
  18. NumberOfDoors
  19. AlarmImmobiliser
  20. TrackingDevice
  21. NewVehicle

Missing values in selected features:
  Gender                       4,621 (0.7%)
  MaritalStatus                5,071 (0.8%)
  AccountType                 30,734 (5.0%)
  VehicleType                    218 (0.0%)
  make                           218 (0.0%)
  Model                          218 (0.0%)
  Cylinders                      218 (0.0%)
  cubiccapacity                  218 (0.0%)
  bodytype                       218 (0.0%)
  NumberOfDoors                  218 (0.0%)
  NewVehicle                  60,634 (9.8%)


In [6]:
# Cell 4: Create final modeling dataset
print("=" * 60)
print("CREATING FINAL MODELING DATASET")
print("=" * 60)

# Create a copy for modeling
df_model = df_ml.copy()

# 1. Select features (start with a manageable set)
selected_features = [
    'SumInsured',           # Important: higher sum insured = higher risk
    'Province',             # Geographic risk
    'VehicleType',          # Type of vehicle
    'CoverType',            # Type of coverage
    'TermFrequency',        # Payment frequency
    'AlarmImmobiliser',     # Security feature
]

# Also include zipcode for grouping
if 'PostalCode' in df_model.columns:
    selected_features.append('PostalCode')

# Target variables
target_claims = 'TotalClaims'
target_premium = 'TotalPremium'

print(f"Selected features: {selected_features}")
print(f"Target 1: {target_claims} (for claims prediction)")
print(f"Target 2: {target_premium} (for premium prediction)")
print()

# Check data types
print("Data types of selected features:")
for feat in selected_features:
    if feat in df_model.columns:
        dtype = df_model[feat].dtype
        unique = df_model[feat].nunique() if dtype == 'object' else 'N/A'
        print(f"  {feat:20} {str(dtype):10} Unique: {unique}")

CREATING FINAL MODELING DATASET
Selected features: ['SumInsured', 'Province', 'VehicleType', 'CoverType', 'TermFrequency', 'AlarmImmobiliser', 'PostalCode']
Target 1: TotalClaims (for claims prediction)
Target 2: TotalPremium (for premium prediction)

Data types of selected features:
  SumInsured           float64    Unique: N/A
  Province             object     Unique: 9
  VehicleType          object     Unique: 5
  CoverType            object     Unique: 21
  TermFrequency        object     Unique: 2
  AlarmImmobiliser     object     Unique: 2
  PostalCode           int64      Unique: N/A


In [7]:
# Cell 5: Simple Linear Regression for TotalClaims
print("=" * 60)
print("OBJECTIVE 1: LINEAR REGRESSION FOR TOTAL CLAIMS")
print("=" * 60)

# For linear regression, we need to handle categorical variables
# Let's start with just numerical features for simplicity
print("Starting with numerical features only...")

# Select numerical features for initial model
numerical_features = ['SumInsured']  # Start simple

# Check correlation with TotalClaims
print("\n1. Correlation analysis:")
for feat in numerical_features:
    if feat in df_model.columns:
        corr = df_model[feat].corr(df_model['TotalClaims'])
        print(f"   {feat:20} Correlation with Claims: {corr:.4f}")

# Prepare data
X = df_model[numerical_features].fillna(df_model[numerical_features].mean())
y = df_model['TotalClaims']

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\n2. Data split:")
print(f"   Training samples: {X_train.shape[0]:,}")
print(f"   Test samples: {X_test.shape[0]:,}")

# Train linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_pred = lr_model.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n3. Model Performance:")
print(f"   Mean Squared Error (MSE): {mse:,.2f}")
print(f"   Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"   Mean Absolute Error (MAE): {mae:,.2f}")
print(f"   R-squared (R²): {r2:.4f}")

print(f"\n4. Model Coefficients:")
for i, feat in enumerate(numerical_features):
    print(f"   {feat}: {lr_model.coef_[i]:.6f}")
print(f"   Intercept: {lr_model.intercept_:.2f}")

print(f"\n5. Interpretation:")
print(f"   • R² = {r2:.4f} means the model explains {r2*100:.1f}% of variance in claims")
print(f"   • RMSE = R{rmse:.2f} means average prediction error is R{rmse:.2f}")
if r2 < 0.1:
    print(f"   • WARNING: Very low R². Need more/better features.")

OBJECTIVE 1: LINEAR REGRESSION FOR TOTAL CLAIMS
Starting with numerical features only...

1. Correlation analysis:
   SumInsured           Correlation with Claims: -0.0063

2. Data split:
   Training samples: 494,540
   Test samples: 123,636

3. Model Performance:
   Mean Squared Error (MSE): 6,522,068.32
   Root Mean Squared Error (RMSE): 2,553.83
   Mean Absolute Error (MAE): 193.70
   R-squared (R²): 0.0001

4. Model Coefficients:
   SumInsured: -0.000012
   Intercept: 109.51

5. Interpretation:
   • R² = 0.0001 means the model explains 0.0% of variance in claims
   • RMSE = R2553.83 means average prediction error is R2553.83


In [8]:
# Cell 6: Adding Categorical Features with One-Hot Encoding
print("=" * 60)
print("ADDING CATEGORICAL FEATURES WITH ONE-HOT ENCODING")
print("=" * 60)

# Select categorical features to add
categorical_features = ['Province', 'VehicleType', 'CoverType']

# Create new feature set
all_features = numerical_features + categorical_features

print(f"Features for enhanced model: {all_features}")
print()

# Prepare data with one-hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate numerical and categorical columns
numerical_cols = numerical_features
categorical_cols = categorical_features

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ])

# Create pipeline with preprocessing and linear regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split data
X = df_model[all_features]
y = df_model['TotalClaims']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
print("Training enhanced linear regression model...")
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nEnhanced Model Performance:")
print(f"   R-squared (R²): {r2:.4f} (explains {r2*100:.1f}% of variance)")
print(f"   RMSE: R{rmse:.2f}")
print(f"   MAE: R{mae:.2f}")

# Compare with simple model
print(f"\nComparison with simple model:")
print(f"   Simple model R²: {r2:.4f}")
print(f"   Enhanced model R²: {r2:.4f}")
print(f"   Improvement: {(r2 - r2):.4f}")

# Get feature importance (for linear model, coefficients)
try:
    # Extract feature names after one-hot encoding
    cat_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
    cat_feature_names = cat_encoder.get_feature_names_out(categorical_cols)
    all_feature_names = numerical_cols + list(cat_feature_names)
    
    # Get coefficients
    coefficients = pipeline.named_steps['regressor'].coef_
    
    print(f"\nTop 10 most important features (by absolute coefficient):")
    feat_importance = pd.DataFrame({
        'Feature': all_feature_names,
        'Coefficient': coefficients
    })
    feat_importance['Abs_Coefficient'] = np.abs(feat_importance['Coefficient'])
    feat_importance = feat_importance.sort_values('Abs_Coefficient', ascending=False).head(10)
    
    for _, row in feat_importance.iterrows():
        print(f"   {row['Feature'][:40]:40} Coef: {row['Coefficient']:8.2f}")
        
except Exception as e:
    print(f"Could not extract feature importance: {e}")

ADDING CATEGORICAL FEATURES WITH ONE-HOT ENCODING
Features for enhanced model: ['SumInsured', 'Province', 'VehicleType', 'CoverType']

Training enhanced linear regression model...

Enhanced Model Performance:
   R-squared (R²): 0.0059 (explains 0.6% of variance)
   RMSE: R2546.34
   MAE: R209.93

Comparison with simple model:
   Simple model R²: 0.0059
   Enhanced model R²: 0.0059
   Improvement: 0.0000

Top 10 most important features (by absolute coefficient):
   CoverType_Standalone passenger liability Coef: -8900.04
   CoverType_Passenger Liability            Coef: -8895.80
   CoverType_Third Party Only               Coef: -1828.63
   CoverType_Own Damage                     Coef:  1642.52
   VehicleType_nan                          Coef:  1460.32
   CoverType_Income Protector               Coef:  1219.76
   CoverType_Trailer                        Coef:  1196.95
   CoverType_Windscreen                     Coef:  1188.67
   CoverType_Roadside Assistance            Coef:  1166.06
   