# Statistical Modeling: Claim Severity and Premium Prediction

This notebook builds predictive models for:
1. **Claim Severity Prediction**: Predict TotalClaims for policies with claims > 0
2. **Premium Prediction**: Predict premium values (`TotalPremium`) as a basis for premium optimization

## Models to Implement
- Linear Regression
- Decision Trees
- Random Forests
- XGBoost

## Evaluation Metrics
- RMSE (Root Mean Squared Error)
- R² (Coefficient of Determination)
- MAE (Mean Absolute Error)
- MAPE (Mean Absolute Percentage Error)

## Model Interpretability
- Feature Importance Analysis
- SHAP (SHapley Additive exPlanations)
- LIME (Local Interpretable Model-agnostic Explanations) — not yet applied in the cells below


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings

# Suppress every warning for the rest of the session (no category or module
# filter is given). NOTE(review): this also hides deprecation and convergence
# warnings raised by the modeling libraries.
warnings.filterwarnings('ignore')

# Make the project-local `src` package importable: Path() is the current
# working directory, so this appends that directory's parent to sys.path.
# Assumes the kernel is started from a direct subdirectory of the project
# root — TODO confirm.
sys.path.append(str(Path().resolve().parent))

from src.data.load_data import load_insurance_data
from src.utils.config import REPORTS_DIR, MODELS_DIR
from src.modeling.data_preparation import (
    prepare_claim_severity_data,
    prepare_premium_prediction_data
)
from src.modeling.models import (
    train_linear_regression,
    train_decision_tree,
    train_random_forest,
    train_xgboost,
    evaluate_model,
    compare_models
)
from src.modeling.interpretability import (
    get_feature_importance,
    plot_feature_importance,
    explain_with_shap,
    plot_shap_summary
)

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

# Create directories
FIGURES_DIR = REPORTS_DIR / "figures"
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the insurance dataset and report its dimensions
df = load_insurance_data()
n_rows, n_cols = df.shape
print(f"Dataset loaded: {n_rows:,} rows, {n_cols} columns")

# Claim / no-claim split, counted from boolean masks on TotalClaims.
# Rows that are neither > 0 nor == 0 (e.g. missing values) land in neither count.
total_claims = df['TotalClaims']
n_with_claims = int((total_claims > 0).sum())
n_without_claims = int((total_claims == 0).sum())
print(f"\nPolicies with claims: {n_with_claims:,}")
print(f"Policies without claims: {n_without_claims:,}")


## Model 1: Claim Severity Prediction

Predict TotalClaims for policies that have claims > 0.


In [None]:
# Build the train/test matrices for the claim severity model.
# The helper returns six values: train/test features, train/test targets,
# the feature names, and the preprocessor it used.
(
    X_train_sev,
    X_test_sev,
    y_train_sev,
    y_test_sev,
    feature_names_sev,
    preprocessor_sev,
) = prepare_claim_severity_data(df, target_col='TotalClaims', test_size=0.2)

n_train_sev, n_features_sev = X_train_sev.shape[0], X_train_sev.shape[1]
print(f"Training set: {n_train_sev:,} samples, {n_features_sev} features")
print(f"Test set: {X_test_sev.shape[0]:,} samples")

# Summary statistics of the training target
print("\nTarget (TotalClaims) statistics:")
for stat_label, stat_value in (
    ("Mean", y_train_sev.mean()),
    ("Median", y_train_sev.median()),
    ("Std", y_train_sev.std()),
):
    print(f"  {stat_label}: {stat_value:.2f} ZAR")


In [None]:
# Train all models for claim severity prediction
#
# Every candidate goes through the same train -> store -> evaluate -> report
# sequence, so the candidates are described as data and handled by one loop
# rather than by four copy-pasted stanzas.
# Mapping: display name -> (training function, extra keyword arguments).
trainers_sev = {
    'Linear Regression': (train_linear_regression, {}),
    'Decision Tree': (train_decision_tree, {'max_depth': 10}),
    'Random Forest': (train_random_forest, {'n_estimators': 100}),
    'XGBoost': (train_xgboost, {'n_estimators': 100}),
}

models_sev = {}        # display name -> (fitted model, training metrics)
test_metrics_sev = {}  # display name -> metrics returned by evaluate_model

print("Training models for Claim Severity Prediction...")
print("="*80)

for step_sev, (label_sev, (train_fn_sev, train_kwargs_sev)) in enumerate(trainers_sev.items(), start=1):
    print(f"\n{step_sev}. Training {label_sev}...")
    # Each training helper returns a (model, training-metrics) pair.
    fitted_sev, fit_metrics_sev = train_fn_sev(X_train_sev, y_train_sev, **train_kwargs_sev)
    models_sev[label_sev] = (fitted_sev, fit_metrics_sev)
    # Score on the held-out split; only RMSE and R² are reported here.
    held_out_sev = evaluate_model(fitted_sev, X_test_sev, y_test_sev)
    test_metrics_sev[label_sev] = held_out_sev
    print(f"   Test RMSE: {held_out_sev['rmse']:.2f}, Test R²: {held_out_sev['r2']:.4f}")

# Re-bind the original per-model variable names so any cell that still refers
# to them keeps working.
model_lr, train_metrics_lr = models_sev['Linear Regression']
model_dt, train_metrics_dt = models_sev['Decision Tree']
model_rf, train_metrics_rf = models_sev['Random Forest']
model_xgb, train_metrics_xgb = models_sev['XGBoost']
test_metrics_lr = test_metrics_sev['Linear Regression']
test_metrics_dt = test_metrics_sev['Decision Tree']
test_metrics_rf = test_metrics_sev['Random Forest']
test_metrics_xgb = test_metrics_sev['XGBoost']

print("\n" + "="*80)
print("All models trained successfully!")


In [None]:
# Build the comparison table for the claim severity models
comparison_sev = compare_models(models_sev, X_test_sev, y_test_sev)
print("\nModel Comparison - Claim Severity Prediction:")
print("=" * 80)
print(comparison_sev.to_string(index=False))

# The best model is the row with the highest test R²; locate that row once
# and read every reported value from it.
best_idx_sev = comparison_sev['Test_R2'].idxmax()
best_model_sev = comparison_sev.loc[best_idx_sev, 'Model']
best_r2_sev = comparison_sev.loc[best_idx_sev, 'Test_R2']
best_rmse_sev = comparison_sev.loc[best_idx_sev, 'Test_RMSE']
print(f"\nBest Model (by R²): {best_model_sev}")
print(f"  Test R²: {best_r2_sev:.4f}")
print(f"  Test RMSE: {best_rmse_sev:.2f} ZAR")


## Model 2: Premium Prediction

Predict premium values, using `TotalPremium` as the target.


In [None]:
# Build the train/test matrices for the premium model.
# The helper returns six values: train/test features, train/test targets,
# the feature names, and the preprocessor it used.
(
    X_train_prem,
    X_test_prem,
    y_train_prem,
    y_test_prem,
    feature_names_prem,
    preprocessor_prem,
) = prepare_premium_prediction_data(df, target_col='TotalPremium', test_size=0.2)

n_train_prem, n_features_prem = X_train_prem.shape[0], X_train_prem.shape[1]
print(f"Training set: {n_train_prem:,} samples, {n_features_prem} features")
print(f"Test set: {X_test_prem.shape[0]:,} samples")

# Summary statistics of the training target
print("\nTarget (TotalPremium) statistics:")
for stat_label, stat_value in (
    ("Mean", y_train_prem.mean()),
    ("Median", y_train_prem.median()),
    ("Std", y_train_prem.std()),
):
    print(f"  {stat_label}: {stat_value:.2f} ZAR")


In [None]:
# Train all models for premium prediction
#
# Every candidate goes through the same train -> store -> evaluate -> report
# sequence, so the candidates are described as data and handled by one loop
# rather than by four copy-pasted stanzas.
# Mapping: display name -> (training function, extra keyword arguments).
trainers_prem = {
    'Linear Regression': (train_linear_regression, {}),
    'Decision Tree': (train_decision_tree, {'max_depth': 10}),
    'Random Forest': (train_random_forest, {'n_estimators': 100}),
    'XGBoost': (train_xgboost, {'n_estimators': 100}),
}

models_prem = {}        # display name -> (fitted model, training metrics)
test_metrics_prem = {}  # display name -> metrics returned by evaluate_model

print("Training models for Premium Prediction...")
print("="*80)

for step_prem, (label_prem, (train_fn_prem, train_kwargs_prem)) in enumerate(trainers_prem.items(), start=1):
    print(f"\n{step_prem}. Training {label_prem}...")
    # Each training helper returns a (model, training-metrics) pair.
    fitted_prem, fit_metrics_prem = train_fn_prem(X_train_prem, y_train_prem, **train_kwargs_prem)
    models_prem[label_prem] = (fitted_prem, fit_metrics_prem)
    # Score on the held-out split; only RMSE and R² are reported here.
    held_out_prem = evaluate_model(fitted_prem, X_test_prem, y_test_prem)
    test_metrics_prem[label_prem] = held_out_prem
    print(f"   Test RMSE: {held_out_prem['rmse']:.2f}, Test R²: {held_out_prem['r2']:.4f}")

# Re-bind the original per-model variable names so any cell that still refers
# to them keeps working.
model_lr_prem, train_metrics_lr_prem = models_prem['Linear Regression']
model_dt_prem, train_metrics_dt_prem = models_prem['Decision Tree']
model_rf_prem, train_metrics_rf_prem = models_prem['Random Forest']
model_xgb_prem, train_metrics_xgb_prem = models_prem['XGBoost']
test_metrics_lr_prem = test_metrics_prem['Linear Regression']
test_metrics_dt_prem = test_metrics_prem['Decision Tree']
test_metrics_rf_prem = test_metrics_prem['Random Forest']
test_metrics_xgb_prem = test_metrics_prem['XGBoost']

print("\n" + "="*80)
print("All models trained successfully!")


In [None]:
# Build the comparison table for the premium models
comparison_prem = compare_models(models_prem, X_test_prem, y_test_prem)
print("\nModel Comparison - Premium Prediction:")
print("=" * 80)
print(comparison_prem.to_string(index=False))

# The best model is the row with the highest test R²; locate that row once
# and read every reported value from it.
best_idx_prem = comparison_prem['Test_R2'].idxmax()
best_model_prem = comparison_prem.loc[best_idx_prem, 'Model']
best_r2_prem = comparison_prem.loc[best_idx_prem, 'Test_R2']
best_rmse_prem = comparison_prem.loc[best_idx_prem, 'Test_RMSE']
print(f"\nBest Model (by R²): {best_model_prem}")
print(f"  Test R²: {best_r2_prem:.4f}")
print(f"  Test RMSE: {best_rmse_prem:.2f} ZAR")


## Model Interpretability: Feature Importance Analysis

Analyze which features are most influential in predictions.


In [None]:
# Feature importance for the best claim severity model.
# models_sev maps each display name to a (model, training-metrics) tuple, so
# index 0 is the fitted estimator.
best_sev_model_name = best_model_sev
best_sev_model = models_sev[best_sev_model_name][0]

importance_sev = get_feature_importance(best_sev_model, feature_names_sev)

top_features_sev = importance_sev.head(10)
print(f"\nTop 10 Features for {best_sev_model_name} (Claim Severity):")
print("=" * 80)
print(top_features_sev.to_string(index=False))

# Plot the same top 10 and save the chart with the other report figures
importance_title_sev = f"Top 10 Feature Importance - {best_sev_model_name} (Claim Severity)"
importance_path_sev = FIGURES_DIR / '10_feature_importance_claim_severity.png'
plot_feature_importance(
    importance_sev,
    top_n=10,
    title=importance_title_sev,
    save_path=importance_path_sev,
)


In [None]:
# Feature importance for the best premium prediction model.
# models_prem maps each display name to a (model, training-metrics) tuple, so
# index 0 is the fitted estimator.
best_prem_model_name = best_model_prem
best_prem_model = models_prem[best_prem_model_name][0]

importance_prem = get_feature_importance(best_prem_model, feature_names_prem)

top_features_prem = importance_prem.head(10)
print(f"\nTop 10 Features for {best_prem_model_name} (Premium Prediction):")
print("=" * 80)
print(top_features_prem.to_string(index=False))

# Plot the same top 10 and save the chart with the other report figures
importance_title_prem = f"Top 10 Feature Importance - {best_prem_model_name} (Premium Prediction)"
importance_path_prem = FIGURES_DIR / '11_feature_importance_premium.png'
plot_feature_importance(
    importance_prem,
    top_n=10,
    title=importance_title_prem,
    save_path=importance_path_prem,
)


## SHAP Analysis (Best Claim Severity Model)

Use SHAP to understand how individual features influence predictions.


In [None]:
# SHAP analysis for the best claim severity model
print(f"Generating SHAP explanations for {best_sev_model_name} (Claim Severity)...")

# Explain only the first 100 entries of the test set to keep the run short
shap_sample_sev = X_test_sev[:100]
shap_result_sev = explain_with_shap(
    best_sev_model,
    shap_sample_sev,
    feature_names_sev,
    max_evals=100,
)

# A falsy result is treated as "no explanation produced".
# NOTE(review): this relies on the truthiness of whatever explain_with_shap
# returns, and the message assumes a missing `shap` install is the only cause
# — confirm against the helper.
if not shap_result_sev:
    print("SHAP analysis not available. Install SHAP: pip install shap")
else:
    shap_plot_path_sev = FIGURES_DIR / '12_shap_summary_claim_severity.png'
    plot_shap_summary(shap_result_sev, save_path=shap_plot_path_sev)
    print("\nSHAP analysis completed!")


## Summary and Business Recommendations

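This section summarizes the best model and the most influential features for each prediction task, based on the model performance and feature importance analysis above.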


In [None]:
# Final recap: best model, its test scores, and its five leading features
# for each of the two prediction tasks.
print("\n" + "=" * 80)
print("MODELING SUMMARY")
print("=" * 80)

# --- Claim severity ---
summary_row_sev = comparison_sev.loc[comparison_sev['Model'] == best_sev_model_name]
summary_r2_sev = summary_row_sev['Test_R2'].values[0]
summary_rmse_sev = summary_row_sev['Test_RMSE'].values[0]

print("\n1. CLAIM SEVERITY PREDICTION:")
print(f"   Best Model: {best_sev_model_name}")
print(f"   Test R²: {summary_r2_sev:.4f}")
print(f"   Test RMSE: {summary_rmse_sev:.2f} ZAR")
print("\n   Top 5 Features:")
top5_sev = importance_sev.head(5)
for feature_label, feature_score in zip(top5_sev['feature'], top5_sev['importance']):
    print(f"     - {feature_label}: {feature_score:.4f}")

# --- Premium prediction ---
summary_row_prem = comparison_prem.loc[comparison_prem['Model'] == best_prem_model_name]
summary_r2_prem = summary_row_prem['Test_R2'].values[0]
summary_rmse_prem = summary_row_prem['Test_RMSE'].values[0]

print("\n2. PREMIUM PREDICTION:")
print(f"   Best Model: {best_prem_model_name}")
print(f"   Test R²: {summary_r2_prem:.4f}")
print(f"   Test RMSE: {summary_rmse_prem:.2f} ZAR")
print("\n   Top 5 Features:")
top5_prem = importance_prem.head(5)
for feature_label, feature_score in zip(top5_prem['feature'], top5_prem['importance']):
    print(f"     - {feature_label}: {feature_score:.4f}")

print("\n" + "=" * 80)
print("Business Recommendations will be documented in the final report.")
print("=" * 80)
