In [1]:
# -----------------------------
# 1. Imports
# -----------------------------
import pandas as pd
import numpy as np
from datetime import datetime
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import shap
import matplotlib.pyplot as plt

In [2]:
# -----------------------------
# 2. Load dataset
# -----------------------------
data = pd.read_csv(r'D:\MUFG-Hackathon\Backend\Dataset\synthetic_insurance_50k.csv')

In [3]:
# -----------------------------
# 3. Feature Engineering
# -----------------------------
data['Policy Start Date'] = pd.to_datetime(data['Policy Start Date'], dayfirst=True)
data['Policy End Date'] = pd.to_datetime(data['Policy End Date'], dayfirst=True)
data['Policy Duration'] = (data['Policy End Date'] - data['Policy Start Date']).dt.days
data['Claim Ratio'] = data['Claim Amount (AUD)'] / data['Annual Premium (AUD)']
data['Premium per Day'] = data['Annual Premium (AUD)'] / data['Policy Duration']

bins = [20,30,40,50,60,70,80]
labels = ['21-30','31-40','41-50','51-60','61-70','71-80']
data['Age Group'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

data = data.drop(columns=['Policy Start Date','Policy End Date'])

cat_features = ['State','Insurance Type','Claim Status','Payment Frequency','Age Group']
for col in cat_features:
    data[col] = data[col].astype(str).fillna('Unknown')

In [4]:
# -----------------------------
# 4. Prepare Features and Labels
# -----------------------------
X = data.drop(columns=['Risk Score','Product Tier'])
y_reg = data['Risk Score']            # For regression
y_clf = data['Product Tier']          # For classification

# Encode categorical columns
X = pd.get_dummies(X, drop_first=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42
)

In [6]:
# -----------------------------
# 5. Grid Search CV for Regression (Risk Score)
# -----------------------------
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, CatBoostClassifier
import numpy as np

param_grid_reg = {
    'iterations': [300, 500],
    'depth': [6, 8],
    'learning_rate': [0.05, 0.1],
    'l2_leaf_reg': [3, 5, 7]
}

reg_model = CatBoostRegressor(loss_function='RMSE', verbose=0, random_state=42)

grid_reg = GridSearchCV(estimator=reg_model,
                        param_grid=param_grid_reg,
                        cv=KFold(n_splits=5, shuffle=True, random_state=42),
                        scoring='neg_root_mean_squared_error',
                        n_jobs=-1)

grid_reg.fit(X_train, y_reg_train)

print("Best Params (Regression):", grid_reg.best_params_)
best_reg_model = grid_reg.best_estimator_

# Evaluate on test set
y_reg_pred = best_reg_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_reg_test, y_reg_pred))
print("Test RMSE:", rmse)

# Cross-validation RMSE
cv_scores = cross_val_score(best_reg_model, X, y_reg,
                            cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
print("Cross-Validation RMSE scores:", -cv_scores)
print("Mean CV RMSE:", -cv_scores.mean())

Best Params (Regression): {'depth': 6, 'iterations': 300, 'l2_leaf_reg': 7, 'learning_rate': 0.05}
Test RMSE: 0.06955698431888786
Cross-Validation RMSE scores: [0.06960009 0.06905668 0.06914562 0.07010732 0.06955056]
Mean CV RMSE: 0.06949205390925293


In [7]:
# -----------------------------
# 6. Grid Search CV for Classification (Product Tier)
# -----------------------------
from sklearn.metrics import accuracy_score, classification_report

param_grid_clf = {
    'iterations': [300, 500],
    'depth': [6, 8],
    'learning_rate': [0.05, 0.1],
    'l2_leaf_reg': [3, 5, 7]
}

clf_model = CatBoostClassifier(loss_function='MultiClass', verbose=0, random_state=42)

grid_clf = GridSearchCV(estimator=clf_model,
                        param_grid=param_grid_clf,
                        cv=KFold(n_splits=5, shuffle=True, random_state=42),
                        scoring='accuracy',
                        n_jobs=-1)

grid_clf.fit(X_train, y_clf_train)

print("\nBest Params (Classification):", grid_clf.best_params_)
best_clf_model = grid_clf.best_estimator_

# Evaluate on test set
y_clf_pred = best_clf_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_clf_test, y_clf_pred))
print("\nClassification Report:\n", classification_report(y_clf_test, y_clf_pred))

# Cross-validation Accuracy
cv_scores_clf = cross_val_score(best_clf_model, X, y_clf,
                                cv=5, scoring='accuracy', n_jobs=-1)
print("Cross-Validation Accuracy scores:", cv_scores_clf)
print("Mean CV Accuracy:", cv_scores_clf.mean())


Best Params (Classification): {'depth': 6, 'iterations': 300, 'l2_leaf_reg': 3, 'learning_rate': 0.05}
Test Accuracy: 0.6768

Classification Report:
               precision    recall  f1-score   support

       Basic       0.51      0.74      0.60      2461
        Gold       0.90      0.78      0.84      2479
     Premium       0.79      0.84      0.81      2537
    Standard       0.55      0.35      0.43      2523

    accuracy                           0.68     10000
   macro avg       0.69      0.68      0.67     10000
weighted avg       0.69      0.68      0.67     10000

Cross-Validation Accuracy scores: [0.6769 0.6784 0.6777 0.6811 0.6778]
Mean CV Accuracy: 0.67838


In [8]:
# -----------------------------
# Map Regression Predictions to Tiers (based on dataset means)
# -----------------------------

# Mean risk scores per tier from dataset
tier_means = {
    "Basic": 0.313,
    "Standard": 0.313,   # practically identical mean with Basic
    "Premium": 0.511,
    "Gold": 0.695
}

def risk_to_tier(score):
    # Find tier whose mean is closest to predicted score
    closest_tier = min(tier_means, key=lambda t: abs(tier_means[t] - score))
    return closest_tier

# Use regression model predictions (already trained: best_reg_model)
y_reg_pred_scores = best_reg_model.predict(X_test)

# Convert regression predictions to tiers
y_reg_as_tier = [risk_to_tier(s) for s in y_reg_pred_scores]

# Evaluate vs true labels
print("Tier Prediction Accuracy (via Regression Mapping):",
      accuracy_score(y_clf_test, y_reg_as_tier))

print("\nClassification Report:\n",
      classification_report(y_clf_test, y_reg_as_tier))


Tier Prediction Accuracy (via Regression Mapping): 0.5422

Classification Report:
               precision    recall  f1-score   support

       Basic       0.46      0.58      0.52      2461
        Gold       0.76      0.94      0.84      2479
     Premium       0.43      0.65      0.52      2537
    Standard       0.00      0.00      0.00      2523

    accuracy                           0.54     10000
   macro avg       0.41      0.54      0.47     10000
weighted avg       0.41      0.54      0.47     10000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
