In [12]:
# cell 1
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# import xgboost as xgb
# import shap

# Use seaborn's 'whitegrid' style as the plotting default for this notebook.
sns.set(style='whitegrid')

# reuse to_numeric_safe from previous notebook (or re-define)
def to_numeric_safe(series):
    """Coerce a Series to numeric, parsing European-formatted number strings.

    Text values are parsed with "." as the thousands separator and "," as the
    decimal separator (e.g. "1.234,56" -> 1234.56). Surrounding and embedded
    spaces are removed, and common null markers ('nan', 'None', '', ...)
    become NaN. Anything that still cannot be parsed is coerced to NaN.

    Parameters
    ----------
    series : pd.Series
        Column to convert. May already be numeric.

    Returns
    -------
    pd.Series
        Numeric Series with the same index; unparseable entries are NaN.
    """
    # Already-numeric data must bypass the string clean-up: str(1234.56) is
    # "1234.56", and stripping "." as a thousands separator would silently
    # turn it into 123456. Booleans keep going through the text path so their
    # handling is unchanged.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors='coerce')

    s = series.astype(str).str.strip()
    # Null markers produced by astype(str) or present in the raw text.
    s = s.replace(['nan', 'NaN', 'None', 'NONE', 'none', ''], np.nan)
    s = s.str.replace(".", "", regex=False)   # drop thousands separators
    s = s.str.replace(",", ".", regex=False)  # decimal comma -> decimal point
    s = s.str.replace(" ", "", regex=False)   # drop embedded spaces
    return pd.to_numeric(s, errors='coerce')


In [8]:
# cell 2
# Load the processed insurance dataset.
df = pd.read_parquet("../data/processed/insurance_data.parquet")

# Resolve the claims column: 'TotalClaims' wins when present, otherwise fall
# back to the first column whose name contains "claim" (case-insensitive).
claim_cols = [name for name in df.columns if 'claim' in name.lower()]
if 'TotalClaims' not in df.columns and not claim_cols:
    raise RuntimeError("No claim column found. Add TotalClaims or similar.")
claim_col = 'TotalClaims' if 'TotalClaims' in df.columns else claim_cols[0]

# Coerce the monetary columns to numeric, premium first, then claims.
for money_col in ('TotalPremium', claim_col):
    df[money_col] = to_numeric_safe(df[money_col])

# Derived targets:
#   HasClaim      - True where the (null-as-zero) claim amount is positive
#   ClaimSeverity - claim amount on rows with a claim, NaN elsewhere
#   Margin        - premium minus (null-as-zero) claim amount
df['HasClaim'] = df[claim_col].fillna(0) > 0
df['ClaimSeverity'] = df.loc[df['HasClaim'], claim_col]
df['Margin'] = df['TotalPremium'] - df[claim_col].fillna(0)


def _flag_yes(value):
    """Return 1 when the value's lowercase string form is yes-like, else 0."""
    return 1 if str(value).lower() in ['yes', 'y', 'true', '1'] else 0


# Engineered features: vehicle age at transaction time and a new-vehicle flag
# (missing NewVehicle values count as 'No').
df['VehicleAge'] = df['TransactionMonth'].dt.year - df['RegistrationYear']
df['IsNewVehicle'] = df['NewVehicle'].fillna('No').map(_flag_yes)

# Candidate model inputs (example subset). 'Kilometers' is a placeholder:
# replace it with an actual column if one is present.
candidate_features = [
    'VehicleType', 'make', 'Model', 'Province', 'Gender',
    'VehicleAge', 'IsNewVehicle', 'Kilometers',
]
# Keep only the candidates that exist in the loaded frame.
feature_cols = [name for name in candidate_features if name in df.columns]
print("Using features:", feature_cols)


Using features: ['VehicleType', 'make', 'Model', 'Province', 'Gender', 'VehicleAge', 'IsNewVehicle']


In [14]:
# cell 3
# Partition the selected features by dtype kind: bool / int / uint / float /
# complex columns are numeric, everything else is treated as categorical.
_is_numeric = {name: df[name].dtype.kind in 'biufc' for name in feature_cols}
num_feats = [name for name in feature_cols if _is_numeric[name]]
cat_feats = [name for name in feature_cols if not _is_numeric[name]]

# Numeric branch: median-impute, then standardise.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature group through its branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_feats),
    ('cat', cat_pipe, cat_feats),
])


In [15]:
# cell 4
# Claim-occurrence model: predict whether a policy has any claim.
X = df[feature_cols].copy()
y = df['HasClaim'].astype(int)

# Stratify on the target so both splits keep the same share of claims.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# clone() gives this pipeline its own unfitted copy of the preprocessor.
# With caching disabled (the default) Pipeline.fit fits its steps in place, so
# passing the shared `preprocessor` object would let every later pipeline that
# reuses it refit this classifier's encoder on different data, leaving the
# forest with a feature layout that no longer matches its preprocessing step.
clf_pipe = Pipeline([
    ('prep', clone(preprocessor)),
    ('clf', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42))
])

clf_pipe.fit(X_train, y_train)
y_pred = clf_pipe.predict(X_test)
y_proba = clf_pipe.predict_proba(X_test)[:, 1]  # probability of the positive class

# NOTE(review): when claims are rare, accuracy is dominated by the majority
# class; read it together with ROC AUC / precision / recall.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
# zero_division=0 reports 0.0 explicitly (instead of emitting a warning) when
# the model predicts no positive samples at all.
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))


Accuracy: 0.9972102789721028
ROC AUC: 0.6053279962885108
Precision: 0.0
Recall: 0.0
F1: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# cell 5
# Claim-severity model: trained only on policies that actually had a claim.
df_claims = df[df['HasClaim']].copy()
Xc = df_claims[feature_cols]
yc = df_claims['ClaimSeverity'].fillna(0)

Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, random_state=42)

# clone() gives this pipeline its own unfitted copy of the preprocessor.
# With caching disabled (the default) Pipeline.fit fits its steps in place, so
# reusing the shared `preprocessor` object would refit the encoder held by
# every other pipeline built from it.
reg_pipe = Pipeline([
    ('prep', clone(preprocessor)),
    ('reg', RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42))
])

reg_pipe.fit(Xc_train, yc_train)
yc_pred = reg_pipe.predict(Xc_test)

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated and removed in newer scikit-learn releases, while this form works
# on every version.
rmse = float(np.sqrt(mean_squared_error(yc_test, yc_pred)))
r2 = r2_score(yc_test, yc_pred)
print("Severity RMSE:", rmse)
print("Severity R2:", r2)


In [6]:
# cell 6
# `import xgboost as xgb` is commented out in the notebook's import cell, so
# bring it into scope here; otherwise `xgb` is undefined on a fresh kernel.
import xgboost as xgb

# Premium model: predict TotalPremium from the selected features.
# TotalPremium was coerced with errors='coerce', so unparseable values are NaN;
# rows without a known target cannot be used for fitting or scoring.
premium_known = df['TotalPremium'].notna()
X_p = df.loc[premium_known, feature_cols]
y_p = df.loc[premium_known, 'TotalPremium']

Xp_train, Xp_test, yp_train, yp_test = train_test_split(X_p, y_p, test_size=0.2, random_state=42)

# clone() gives this pipeline its own unfitted copy of the preprocessor.
# With caching disabled (the default) Pipeline.fit fits its steps in place, so
# reusing the shared `preprocessor` object would refit the encoder held by
# every other pipeline built from it.
premium_pipe = Pipeline([
    ('prep', clone(preprocessor)),
    ('reg', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, n_jobs=-1, random_state=42))
])

premium_pipe.fit(Xp_train, yp_train)
yp_pred = premium_pipe.predict(Xp_test)

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated and removed in newer scikit-learn releases.
print("Premium RMSE:", float(np.sqrt(mean_squared_error(yp_test, yp_pred))))
print("Premium R2:", r2_score(yp_test, yp_pred))


In [7]:
# cell 7
# You can wrap the training blocks in functions and run different models (LinearRegression, RF, XGBoost)
# Save best models
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(clf_pipe, "models/claim_prob_rf.pkl")
joblib.dump(reg_pipe, "models/claim_severity_rf.pkl")
joblib.dump(premium_pipe, "models/premium_xgb.pkl")


Province - frequency: REJECT H0 (p=5.926e-19) — statistically significant difference.
Province - severity: FAIL TO REJECT H0 (p=0.05348) — no evidence of difference.
TopZIPs - frequency: REJECT H0 (p=2.603e-14) — statistically significant difference.
Gender - frequency: REJECT H0 (p=0.02657) — statistically significant difference.


In [None]:
# cell 8
# SHAP explanation of the claim-severity random forest.
# `import shap` is commented out in the notebook's import cell, so bring it
# into scope here; otherwise `shap` is undefined on a fresh kernel.
import shap

# The forest was trained on the preprocessor's output, so explain it in that
# transformed feature space.
prep = reg_pipe.named_steps['prep']
# Cap the sample size at the number of available rows: DataFrame.sample raises
# when asked for more rows than exist (without replacement).
X_sample = Xc_train.sample(min(200, len(Xc_train)), random_state=1)
X_trans = prep.transform(X_sample)
# The one-hot branch can make the transformer return a scipy sparse matrix;
# densify it (the sample is small) so SHAP and the plot receive a plain array.
if hasattr(X_trans, "toarray"):
    X_trans = X_trans.toarray()

# Column names in transformer output order: numeric features first, then the
# expanded one-hot names.
num_names = list(num_feats)
cat_names = list(prep.named_transformers_['cat'].named_steps['ohe'].get_feature_names_out(cat_feats)) if cat_feats else []
feature_names = num_names + cat_names

explainer = shap.TreeExplainer(reg_pipe.named_steps['reg'])
shap_values = explainer.shap_values(X_trans)
shap.summary_plot(shap_values, X_trans, feature_names=feature_names, show=True)


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the classifier's hard predictions on the held-out split.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Claim', 'Has Claim'])
disp.plot(cmap='Blues')
disp.ax_.set_title("Confusion Matrix — Claim Occurrence")
plt.show()

In [None]:
# %% [code]
from sklearn.metrics import RocCurveDisplay

# ROC curve from the predicted claim probabilities on the held-out split.
fig, ax = plt.subplots()
RocCurveDisplay.from_predictions(y_test, y_proba, ax=ax)
ax.set_title("ROC Curve — Claim Occurrence")
plt.show()

In [None]:
# %% [code]
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve from the predicted claim probabilities.
fig, ax = plt.subplots()
PrecisionRecallDisplay.from_predictions(y_test, y_proba, ax=ax)
ax.set_title("Precision–Recall Curve — Claim Occurrence")
plt.show()


In [None]:
# %% [code]
# Residuals (actual minus predicted) against predictions, severity model.
residuals = yc_test - yc_pred

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=yc_pred, y=residuals, alpha=0.3, ax=ax)
ax.axhline(0, color='red')  # zero-error reference line
ax.set_xlabel("Predicted Severity")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot — Claim Severity")
plt.show()


In [None]:
# %% [code]
# Actual vs. predicted severity with the y = x reference diagonal.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=yc_test, y=yc_pred, alpha=0.3, ax=ax)
diag_ends = [yc_test.min(), yc_test.max()]
ax.plot(diag_ends, diag_ends, color='red')
ax.set_xlabel("Actual Severity")
ax.set_ylabel("Predicted Severity")
ax.set_title("Actual vs Predicted — Claim Severity")
plt.show()


In [None]:
# %% [code]
# Residuals (actual minus predicted) against predictions, premium model.
residuals_p = yp_test - yp_pred

fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=yp_pred, y=residuals_p, alpha=0.3, ax=ax)
ax.axhline(0, color='red')  # zero-error reference line
ax.set_xlabel("Predicted Premium")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot — Premium Prediction")
plt.show()


In [None]:
# %% [code]
# Actual vs. predicted premium with the y = x reference diagonal.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=yp_test, y=yp_pred, alpha=0.3, ax=ax)
diag_ends = [yp_test.min(), yp_test.max()]
ax.plot(diag_ends, diag_ends, color='red')
ax.set_xlabel("Actual Premium")
ax.set_ylabel("Predicted Premium")
ax.set_title("Actual vs Predicted — Premium Regression")
plt.show()


In [None]:
# Pair the fitted classifier's importances with the output column names of
# the preprocessing step from the same pipeline, so both describe the same
# transformed feature space.
model = clf_pipe.named_steps['clf']
prep = clf_pipe.named_steps["prep"]
importances = model.feature_importances_
feature_names = prep.get_feature_names_out()

# Sanity check: the two counts must agree for the pairing below to work.
print("Model feature count:", len(importances))
print("Name count:", len(feature_names))

# Rank by importance and keep the 20 largest.
fi = pd.Series(importances, index=feature_names).sort_values(ascending=False).iloc[:20]

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=fi.values, y=fi.index, ax=ax)
ax.set_title("Top 20 Feature Importances — Claim Classification")
plt.show()
