In [50]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path

project_root = Path.cwd().parent 
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Added to path: {project_root}")

%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/insurance-risk-analysis-week3
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
from src import DataManager, DataPreparer, ClaimClassifier, ClaimSeverityRegressor
from scripts.constants import Columns, MODEL_TYPES
from sklearn.model_selection import train_test_split
import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt

In [52]:
dm = DataManager()
clean_df = dm.load_csv(load_clean=True)

Basic Data Info:

<class 'pandas.core.frame.DataFrame'>
Index: 846034 entries, 0 to 1000097
Data columns (total 54 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   UnderwrittenCoverID          846034 non-null  int64         
 1   PolicyID                     846034 non-null  int64         
 2   TransactionMonth             846034 non-null  datetime64[ns]
 3   IsVATRegistered              846034 non-null  bool          
 4   Citizenship                  846034 non-null  object        
 5   LegalType                    846034 non-null  object        
 6   Title                        846034 non-null  object        
 7   Language                     846034 non-null  object        
 8   Bank                         846034 non-null  object        
 9   AccountType                  846034 non-null  object        
 10  MaritalStatus                846034 non-null  object        
 11  Gender      

In [53]:
data_preparer = DataPreparer()

### 1. Claim Severity Prediction

In [54]:
# Step 1 Data Preparation

X_lin, y_lin, preprocessor_lin, severity_df = (
    data_preparer.prepare_for_linear_regression(
        clean_df, target_col=Columns.TotalClaims.value
    )
)

Skewness of TotalClaims: 3.8463379940832976
Transformed TotalClaims to Log: TotalClaims_log


In [55]:
severity_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2437 entries, 284 to 999701
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Province                  2437 non-null   object 
 1   Gender                    2437 non-null   object 
 2   make                      2437 non-null   object 
 3   Model                     2437 non-null   object 
 4   RegistrationYear          2437 non-null   int64  
 5   Cylinders                 2437 non-null   float64
 6   cubiccapacity             2437 non-null   float64
 7   kilowatts                 2437 non-null   float64
 8   NumberOfDoors             2437 non-null   float64
 9   CustomValueEstimate       2437 non-null   float64
 10  SumInsured                2437 non-null   float64
 11  CalculatedPremiumPerTerm  2437 non-null   float64
 12  TotalPremium              2437 non-null   float64
 13  TotalClaims               2437 non-null   float64
 14  TotalClai

In [56]:
# Step 2: Train/Test Data Split

X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_lin, y_lin, test_size=0.3, random_state=42
)

A. Linear Regression Training

In [None]:
# Step 3: Prepare data for LR and initiate Model
regressor_lr = ClaimSeverityRegressor(model_type=MODEL_TYPES.LINEAR_REGRESSION.value)

# Step 4: Train & Evaluate
regressor_lr.train(X_train_l, y_train_l, preprocessor_lin, log_transformed=True)
metrics_lr, preds_lr = regressor_lr.evaluate(X_test_l, y_test_l)



Training linear Regressor...
Training Complete.

Model Performance (linear):
RMSE: 52511.6508
MAE: 17229.9061
R2: -0.9264
SHAP explanation skipped for Linear Model (Focus on Coefficients instead).




B. Tree Based (Random Forest / XGBoost)

XGBoost

In [58]:
# Step 1: Data Preparation

X_tree, y_tree, preprocessor_tree = data_preparer.prepare_for_tree_model(
    clean_df, target_col=Columns.TotalClaims.value
)

In [59]:
# Step 2: Train/Test Data Split
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(
    X_tree, y_tree, test_size=0.3, random_state=42
)

In [60]:
# Step 3: Train & Evaluate
regressor_xgb = ClaimSeverityRegressor(model_type=MODEL_TYPES.XGBOOST.value)
regressor_xgb.train(X_train_tree, y_train_tree, preprocessor_tree, log_transformed=False)
metrics_xgb, preds_xgb = regressor_xgb.evaluate(X_test_tree, y_test_tree)

Training xgboost Regressor...
Training Complete.

Model Performance (xgboost):
RMSE: 34238.4774
MAE: 15744.3568
R2: 0.1811




Random Forest

In [61]:
# Step 1: Train & Evaluate (Use same train test split as xgboost)
regressor_rf = ClaimSeverityRegressor(model_type=MODEL_TYPES.RANDOM_FOREST.value)
regressor_rf.train(
    X_train_tree, y_train_tree, preprocessor_tree, log_transformed=False
)
metrics_rf, preds_rf = regressor_rf.evaluate(X_test_tree, y_test_tree)


Training random_forest Regressor...
Training Complete.

Model Performance (random_forest):
RMSE: 31571.0008
MAE: 15055.2488
R2: 0.3037




### 2. Claim Probability Prediction

In [62]:
# Step 1: Data Preparation

X_class, y_class, preprocessor_class = data_preparer.prepare_for_classification(
    clean_df, classifier_col=Columns.TotalClaims.value
)

In [63]:
# 2. Split Data
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.3, random_state=42, stratify=y_class
)


A. Random Forest

In [104]:
classifier_rf = ClaimClassifier(model_type=MODEL_TYPES.RANDOM_FOREST.value)
classifier_rf.train(X_train_class, y_train_class, preprocessor_class)
classifier_metrics_rf, y_proba_rf = classifier_rf.evaluate(X_test_class, y_test_class)


Training random_forest Classifier...
Training Complete.





Model Performance:
Accuracy: 0.9270
Precision: 0.0160
Recall: 0.4022
F1 Score: 0.0308
ROC-AUC: 0.6852


XGBOOST

In [78]:
classifier_xg = ClaimClassifier(model_type=MODEL_TYPES.XGBOOST.value)
classifier_xg.train(X_train_class, y_train_class, preprocessor_class)
classifier_metrics_xg, y_proba_xg = classifier_xg.evaluate(X_test_class, y_test_class)

Training xgboost Classifier...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Complete.





Model Performance:
Accuracy: 0.9613
Precision: 0.0270
Recall: 0.3543
F1 Score: 0.0501
ROC-AUC: 0.9242


### Selecting Best Models



In [79]:
claim_severity_models_metrics_list = [metrics_lr, metrics_xgb, metrics_rf]
claim_probability_models_metrics_list = [classifier_metrics_rf, classifier_metrics_xg]

claim_severity_metrics_df = pd.DataFrame(claim_severity_models_metrics_list)
claim_severity_metrics_df["Model Type"] = [
    MODEL_TYPES.LINEAR_REGRESSION.value,
    MODEL_TYPES.XGBOOST.value,
    MODEL_TYPES.RANDOM_FOREST.value,
]

claim_probability_metrics_df = pd.DataFrame(claim_probability_models_metrics_list)
claim_probability_metrics_df["Model Type"] = [
    MODEL_TYPES.XGBOOST.value,
    MODEL_TYPES.RANDOM_FOREST.value,
]

print("Claim Severity Models Score:")
print(tabulate(claim_severity_metrics_df, headers="keys", tablefmt="grid"))

print("\nClaim Probability Models Score:")
print(tabulate(claim_probability_metrics_df, headers="keys", tablefmt="grid"))

Claim Severity Models Score:
+----+---------+---------+-----------+---------------+
|    |    RMSE |     MAE |        R2 | Model Type    |
|  0 | 52511.7 | 17229.9 | -0.926357 | linear        |
+----+---------+---------+-----------+---------------+
|  1 | 34238.5 | 15744.4 |  0.181056 | xgboost       |
+----+---------+---------+-----------+---------------+
|  2 | 31571   | 15055.2 |  0.303691 | random_forest |
+----+---------+---------+-----------+---------------+

Claim Probability Models Score:
+----+------------+-------------+----------+------------+-----------+---------------+
|    |   Accuracy |   Precision |   Recall |   F1 Score |   ROC-AUC | Model Type    |
|  0 |   0.92704  |   0.016     | 0.402189 |  0.0307757 |  0.685245 | xgboost       |
+----+------------+-------------+----------+------------+-----------+---------------+
|  1 |   0.961326 |   0.0269707 | 0.354309 |  0.0501258 |  0.924207 | random_forest |
+----+------------+-------------+----------+------------+-----------

### Model Comparision 

1. Claim Severity Models (Regression)

Linear regression performed very poorly (negative R²). That means a straight-line model is worse than predicting the average claim amount (target is strongly skewed and relationships are nonlinear)

Both tree ensembles (XGBoost, Random Forest) are far better. Random Forest has the lowest RMSE and highest R² (≈0.30). Practically, RF gives the smallest average errors and explains the most variance among the three.

Random Forest is the best candidate among these three.



2. Claim Probability Models (Classification)

Accuracy is meaningless here because positives are extremely rare (only ~0.24% of rows have a Total Claimed value > 0)


XGBoost has slightly higher recall (0.402) whole Random Forest has higher precision and much higher ROC-AUC (0.924 vs 0.685). AUC = 0.924 means RF ranks customers by risk very well; XGBoost’s low AUC (0.685) indicates poor ranking even though its recall is high under 0.02 threshold.

F1 is small for both, but RF’s F1 is higher than XGBoost’s (0.05 vs 0.031), indicating a better precision/recall balance for RF at the used threshold.

### Decision

Claim severity: Random Forest (best RMSE, best R² among tested)

Claim probability: Random Forest (best ROC-AUC, better F1/precision balance)

### Influential Features

In [90]:
# Claim Probability Prediction Features

claim_probability_feats_rf = classifier_rf.get_influential_features()
print(tabulate(claim_probability_feats_rf.head(15), headers="keys", tablefmt="grid"))

claim_probability_feats_xg = classifier_xg.get_influential_features()
print(tabulate(claim_probability_feats_xg.head(15), headers="keys", tablefmt="grid"))


Top Influential Features (random_forest):
+-----+---------------------------------+--------------+
|     | Feature                         |   Importance |
|   8 | TotalPremium                    |   0.328387   |
+-----+---------------------------------+--------------+
|   7 | CalculatedPremiumPerTerm        |   0.282725   |
+-----+---------------------------------+--------------+
|   6 | SumInsured                      |   0.159243   |
+-----+---------------------------------+--------------+
|   0 | RegistrationYear                |   0.0572668  |
+-----+---------------------------------+--------------+
|   5 | CustomValueEstimate             |   0.0543761  |
+-----+---------------------------------+--------------+
|   2 | cubiccapacity                   |   0.0100074  |
+-----+---------------------------------+--------------+
|   3 | kilowatts                       |   0.00899873 |
+-----+---------------------------------+--------------+
|  10 | Province_Gauteng                |   0

In [94]:
# Claim Severity Prediction Influential Features

claim_severity_feats_lr = regressor_lr.get_influential_features()
print(tabulate(claim_severity_feats_lr.head(15), headers="keys", tablefmt="grid"))

claim_severity_feats_xg = regressor_xgb.get_influential_features()
print(tabulate(claim_severity_feats_xg.head(15), headers="keys", tablefmt="grid"))

claim_severity_feats_rf = regressor_rf.get_influential_features()
print(tabulate(claim_severity_feats_rf.head(15), headers="keys", tablefmt="grid"))



Top Influential Features (linear Regressor):
+-----+----------------------------------------+------------------+
|     | Feature                                |   Coef Magnitude |
|  62 | cat__Model_C200 BE CLASSIC A/T         |          3.23107 |
+-----+----------------------------------------+------------------+
|  84 | cat__Model_DUCATO 2.3 JTD F/C C/C      |          3.15437 |
+-----+----------------------------------------+------------------+
|  87 | cat__Model_E 200K                      |          2.75918 |
+-----+----------------------------------------+------------------+
| 122 | cat__Model_SPRINTER 308 CDI F/C P/V    |          2.30601 |
+-----+----------------------------------------+------------------+
|  94 | cat__Model_HiACE 2200 F/C P/V          |          1.92129 |
+-----+----------------------------------------+------------------+
|  70 | cat__Model_COROLLA 1.3 IMPACT          |          1.82677 |
+-----+----------------------------------------+------------------+
| 