In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt


# Load the cleaned dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/clean_data.csv')

# Derived features:
#   VehicleAge - years between a fixed reference year and the registration year.
#   has_claim  - 1 when the row's TotalClaims is strictly positive, else 0.
current_year = 2025  # fixed reference year; changing it shifts every VehicleAge value
data['VehicleAge'] = current_year - data['RegistrationYear']
data['has_claim'] = (data['TotalClaims'] > 0).astype(int)

# One-hot encode the listed categorical columns, dropping the first level of each
# so the dummy columns are not perfectly collinear.
categorical_cols = ['VehicleType', 'Province', 'Gender']
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Fill remaining NaNs with per-column medians.
# numeric_only=True is required on pandas >= 2.0: without it, median() raises a
# TypeError as soon as any non-numeric column is still present in the frame.
# Columns without a numeric median are left untouched by fillna.
# NOTE(review): the medians are computed on the full frame, before any
# train/test split below - confirm this is acceptable for the evaluation.
data_encoded = data_encoded.fillna(data_encoded.median(numeric_only=True))

# Severity modelling only looks at rows whose TotalClaims is strictly positive.
severity_data = data_encoded.loc[data_encoded['TotalClaims'].gt(0)].copy()

# Target is the claim amount; the features are every other column except the
# claim indicator derived from that same target.
y_severity = severity_data['TotalClaims']
X_severity = severity_data.drop(columns=['TotalClaims', 'has_claim'])

# 70/30 hold-out split with a fixed seed so the partition is reproducible.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_severity,
    y_severity,
    test_size=0.3,
    random_state=42,
)


# Fit three severity regressors on the same training split and score the same
# hold-out split. Each estimator's fit() returns the fitted estimator, so
# construction and fitting are chained.

# Linear baseline.
lr = LinearRegression().fit(X_train_s, y_train_s)
y_pred_lr = lr.predict(X_test_s)

# Random forest with 200 trees and a fixed seed.
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
).fit(X_train_s, y_train_s)
y_pred_rf = rf.predict(X_test_s)

# Gradient-boosted trees with 200 boosting rounds and a fixed seed.
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    random_state=42,
).fit(X_train_s, y_train_s)
y_pred_xgb = xgb_model.predict(X_test_s)

def evaluate_model(y_true, y_pred, model_name):
    """Print the RMSE and R^2 of one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.
        model_name: Label used as the prefix of the printed line.

    Returns:
        None. The metrics are written to stdout, formatted to four decimals.
    """
    # Take the square root of the MSE explicitly instead of passing
    # squared=False: that keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6, whereas sqrt(MSE) works on every release.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} RMSE: {rmse:.4f}, R2: {r2:.4f}")

# Report hold-out metrics for each severity model, in the order they were fitted.
for _model_label, _test_predictions in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
):
    evaluate_model(y_test_s, _test_predictions, _model_label)


# Claim-occurrence model: same feature columns as the severity model, but fitted
# on every row, with the binary has_claim flag as the target.
X_class = data_encoded.drop(columns=['TotalClaims', 'has_claim'])
y_class = data_encoded['has_claim']

# 70/30 hold-out split with a fixed seed.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_class, y_class, test_size=0.3, random_state=42)

# use_label_encoder is no longer passed: the option was deprecated in
# XGBoost 1.6 and removed in 2.x, where it only produces an
# "unused parameter" warning. The target is already integer 0/1.
clf = xgb.XGBClassifier(eval_metric='logloss', n_estimators=200, random_state=42)
clf.fit(X_train_c, y_train_c)

# Probability of the positive class (has_claim == 1) for each hold-out row.
pred_prob = clf.predict_proba(X_test_c)[:,1]

# Expected claim cost = P(claim) * predicted severity given a claim.
# NOTE(review): rf was fitted on a separate split of the claim-only rows, so
# some rows of X_test_c may also have been in rf's training data - confirm
# whether that overlap matters for how these premiums are used.
predicted_claim_severity = rf.predict(X_test_c)
risk_based_premium = pred_prob * predicted_claim_severity
print(risk_based_premium[:10])


In [None]:
import shap

# Explain the random-forest model `rf`, using the severity training features as
# the masker (background data) for the explainer.
explainer = shap.Explainer(model=rf, masker=X_train_s)

# Compute SHAP values for the severity hold-out rows and draw the summary plot
# against those same feature values.
shap_values = explainer(X_test_s)
shap.summary_plot(shap_values, features=X_test_s)