In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, classification_report, roc_auc_score
import shap
import matplotlib.pyplot as plt
import seaborn as sns
# One-time SHAP notebook setup, run right after the imports.
# NOTE(review): per the SHAP docs this loads the JavaScript used by its
# interactive plots; the cells visible here only call summary_plot, so
# confirm whether it is still needed.
shap.initjs()

In [None]:
# Load and prepare data
# Read the cleaned table, add the two derived columns, then discard rows that
# are missing any of the premium / claims values.
df = (
    pd.read_csv('../data/processed/cleaned.csv')
    .assign(
        # True where the recorded claim amount is strictly positive.
        HasClaim=lambda frame: frame['TotalClaims'] > 0,
        # Age measured against the fixed reference year 2025.
        VehicleAge=lambda frame: 2025 - frame['RegistrationYear'],
    )
    .dropna(subset=['TotalPremium', 'CalculatedPremiumPerTerm', 'TotalClaims'])
)

# Predictor columns used by the modelling cells.
features = ['VehicleAge', 'SumInsured']

In [None]:
# Claim Severity Prediction (Regression)
# Restrict to rows flagged HasClaim, so the regressor is fitted on claim
# amounts only for policies that actually had a claim.
severity_df = df.loc[df['HasClaim']].copy()
X_s, y_s = severity_df[features], severity_df['TotalClaims']

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_s,
    y_s,
    test_size=0.2,
    random_state=42,
)

# Fit the gradient-boosted regressor and score it on the held-out rows.
model_s = XGBRegressor(random_state=42).fit(X_train_s, y_train_s)
y_pred_s = model_s.predict(X_test_s)

rmse = np.sqrt(mean_squared_error(y_true=y_test_s, y_pred=y_pred_s))
r2 = r2_score(y_true=y_test_s, y_pred=y_pred_s)
print(f"Claim Severity Model -> RMSE: {rmse:.2f}, R²: {r2:.2f}")

In [None]:
# Claim Probability Classification
# Binary classifier for the HasClaim flag, using the shared feature columns.
X_c = df[features]
y_c = df['HasClaim']

# Stratify on the target so the train and test partitions keep the same share
# of claims; an unstratified split of a skewed binary target can shift that
# share and, in the extreme, leave the test set with a single class (which
# makes ROC AUC undefined). Stratification needs at least two rows per class,
# so fall back to the plain random split otherwise.
stratify_c = y_c if y_c.value_counts().min() >= 2 else None
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c,
    y_c,
    test_size=0.2,
    random_state=42,
    stratify=stratify_c,
)

model_c = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model_c.fit(X_train_c, y_train_c)

# Hard labels feed the classification report; the positive-class probability
# (column 1 of predict_proba) feeds the threshold-free ROC AUC.
y_pred_c = model_c.predict(X_test_c)
y_proba_c = model_c.predict_proba(X_test_c)[:, 1]

print("Classification Report:\n", classification_report(y_test_c, y_pred_c))
print("ROC AUC:", roc_auc_score(y_test_c, y_proba_c))

In [None]:
# Risk-Based Premium Optimization
# Score the classifier's test rows with both models: claim probability from
# model_c and claim size from model_s.
prob_claim = model_c.predict_proba(X_test_c)[:, 1]
severity_pred = model_s.predict(X_test_c)

# premium = P(claim) * predicted severity + flat 1000 + 0.2 * predicted severity
# NOTE(review): the 0.2 term multiplies the predicted severity itself, not the
# expected loss (prob_claim * severity_pred) — confirm this is the intended margin.
premium = prob_claim * severity_pred + 1000 + 0.2 * severity_pred

# Put the modelled premium next to the premium recorded for the same rows
# (looked up in df by the test rows' index labels).
df_result = X_test_c.assign(
    PredictedPremium=premium,
    ActualPremium=df.loc[X_test_c.index, 'CalculatedPremiumPerTerm'].values,
)

# Overlaid histograms (with KDE curves) of the two premium columns.
premium_ax = sns.histplot(df_result[['PredictedPremium', 'ActualPremium']], kde=True)
premium_ax.set_title("Predicted vs Actual Premium Distribution")
plt.show()

In [None]:
# SHAP Interpretation
# Severity regressor: its test split is given to the explainer constructor and
# is also the set of rows being explained and plotted.
explainer_s = shap.Explainer(model_s, masker=X_test_s)
shap_values_s = explainer_s(X_test_s)
shap.summary_plot(shap_values_s, features=X_test_s)

# Claim classifier: same three steps on the classifier's own test split.
explainer_c = shap.Explainer(model_c, masker=X_test_c)
shap_values_c = explainer_c(X_test_c)
shap.summary_plot(shap_values_c, features=X_test_c)