In [15]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path

project_root = Path.cwd().parent 
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Added to path: {project_root}")

%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/insurance-risk-analysis-week3
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
from src import DataManager, DataPreparer, ClaimClassifier, ClaimSeverityRegressor
from scripts.constants import Columns
from sklearn.model_selection import train_test_split

In [17]:
# Load the dataset through the project's DataManager helper.
# In the recorded run this call prints a "Basic Data Info" summary and yields a
# DataFrame with 846,034 rows and 54 columns.
# NOTE(review): `load_clean=True` presumably selects the already-cleaned CSV
# rather than the raw file — confirm in DataManager.load_csv.
dm = DataManager()
clean_df = dm.load_csv(load_clean=True)

Basic Data Info:

<class 'pandas.core.frame.DataFrame'>
Index: 846034 entries, 0 to 1000097
Data columns (total 54 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   UnderwrittenCoverID          846034 non-null  int64         
 1   PolicyID                     846034 non-null  int64         
 2   TransactionMonth             846034 non-null  datetime64[ns]
 3   IsVATRegistered              846034 non-null  bool          
 4   Citizenship                  846034 non-null  object        
 5   LegalType                    846034 non-null  object        
 6   Title                        846034 non-null  object        
 7   Language                     846034 non-null  object        
 8   Bank                         846034 non-null  object        
 9   AccountType                  846034 non-null  object        
 10  MaritalStatus                846034 non-null  object        
 11  Gender      

In [18]:
# Single DataPreparer instance reused by the regression, tree-model and
# classification preparation cells below.
data_preparer = DataPreparer()

### 1. Claim Severity Prediction

In [19]:
# Step 1: Data preparation for the linear severity model.
# The call returns four objects, unpacked here as features, target, preprocessor
# and the prepared DataFrame (roles inferred from the variable names — confirm
# against DataPreparer.prepare_for_linear_regression).
# In the recorded run the preparer reports a TotalClaims skewness of ~3.85 and
# log-transforms the target into `TotalClaims_log`.
# NOTE(review): severity_df has 2,437 rows versus 846,034 in clean_df, so the
# preparer presumably keeps only policies with a claim — confirm.

X_lin, y_lin, preprocessor_lin, severity_df = (
    data_preparer.prepare_for_linear_regression(
        clean_df, target_col=Columns.TotalClaims.value
    )
)

Skewness of TotalClaims: 3.8463379940832976
Transformed TotalClaims to Log: TotalClaims_log


In [20]:
# Inspect column dtypes and non-null counts of the prepared severity frame.
severity_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2437 entries, 284 to 999701
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Province                  2437 non-null   object 
 1   Gender                    2437 non-null   object 
 2   make                      2437 non-null   object 
 3   Model                     2437 non-null   object 
 4   RegistrationYear          2437 non-null   int64  
 5   Cylinders                 2437 non-null   float64
 6   cubiccapacity             2437 non-null   float64
 7   kilowatts                 2437 non-null   float64
 8   NumberOfDoors             2437 non-null   float64
 9   CustomValueEstimate       2437 non-null   float64
 10  SumInsured                2437 non-null   float64
 11  CalculatedPremiumPerTerm  2437 non-null   float64
 12  TotalPremium              2437 non-null   float64
 13  TotalClaims               2437 non-null   float64
 14  TotalClai

In [21]:
# Step 2: Train/Test Data Split
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.

X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_lin, y_lin, test_size=0.3, random_state=42
)

A. Linear Regression Training

In [22]:
# Step 3: Instantiate the linear severity regressor
regressor_lr = ClaimSeverityRegressor(model_type="linear")

# Step 4: Train & Evaluate
# log_transformed=True is passed because the preparation step log-transformed the
# target; presumably the regressor uses it to map predictions back to the
# original scale before scoring — confirm in ClaimSeverityRegressor.
# NOTE(review): the recorded run reports R2 = -0.9264 on the test split (worse
# than predicting the mean). Worth checking that predictions and y_test_l are
# compared on the same scale.
regressor_lr.train(X_train_l, y_train_l, preprocessor_lin, log_transformed=True)
metrics_lr, preds_lr = regressor_lr.evaluate(X_test_l, y_test_l)

Training linear Regressor...
Training Complete.

Model Performance (linear):
RMSE: 52511.6508
MAE: 17229.9061
R2: -0.9264




B. Tree-Based Models (Random Forest / XGBoost)

XGBoost

In [23]:
# Step 1: Data Preparation
# Build a separate feature set and preprocessor for the tree-based model from the
# same cleaned frame and the same TotalClaims target column. Unlike the linear
# preparation, this call returns three objects (no prepared DataFrame).

X_tree, y_tree, preprocessor_tree = data_preparer.prepare_for_tree_model(
    clean_df, target_col=Columns.TotalClaims.value
)

In [24]:
# Step 2: Train/Test Data Split
# Same 70/30 split and seed as used for the linear model.
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(
    X_tree, y_tree, test_size=0.3, random_state=42
)

In [25]:
# Step 3: Train & Evaluate
# The XGBoost variant is trained with log_transformed=False, i.e. without the
# log-target flag used for the linear model.
# Recorded run: RMSE 34238.48, MAE 15744.36, R2 0.1811.
regressor_xgb = ClaimSeverityRegressor(model_type="xgboost")
regressor_xgb.train(X_train_tree, y_train_tree, preprocessor_tree, log_transformed=False)
metrics_xgb, preds_xgb = regressor_xgb.evaluate(X_test_tree, y_test_tree)



Training xgboost Regressor...
Training Complete.

Model Performance (xgboost):
RMSE: 34238.4774
MAE: 15744.3568
R2: 0.1811




### 2. Claim Probability Prediction

In [26]:
# 1. Prepare Data
# NOTE(review): TotalClaims is passed as the target column; presumably
# prepare_for_classification derives a binary claim / no-claim label from it —
# confirm in DataPreparer.
X, y, preprocessor = data_preparer.prepare_for_classification(
    clean_df, target_col=Columns.TotalClaims.value
)

# 2. Split Data
# stratify=y keeps the class proportions of y the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 3. Train & Evaluate
# NOTE(review): the recorded run shows accuracy 0.9402 but precision 0.0177 and
# F1 0.0337, which suggests a heavily imbalanced target; accuracy alone is not a
# meaningful quality measure here.
classifier = ClaimClassifier()
classifier.train(X_train, y_train, preprocessor)
metrics = classifier.evaluate(X_test, y_test)


Training Random Forest Classifier...
Training Complete.





Model Performance:
Accuracy: 0.9402
Precision: 0.0177
Recall: 0.3625
F1 Score: 0.0337
ROC-AUC: 0.6814
