In [25]:
# -----------------------------
# 1. Imports
# -----------------------------
import os
from datetime import datetime

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [26]:
# -----------------------------
# 2. Load dataset
# -----------------------------
# Location of the training CSV. Set the INSURANCE_DATA_PATH environment variable
# to run the notebook on another machine; when it is unset, the original
# author's local path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'INSURANCE_DATA_PATH',
    r'D:\MUFG-Hackathon\Backend\Notebooks\synthetic_insurance_50k.csv',
)
data = pd.read_csv(DATA_PATH)

In [27]:
# -----------------------------
# 3. Feature Engineering
# -----------------------------
# Parse both policy dates (day-first strings) into datetimes.
for date_col in ('Policy Start Date', 'Policy End Date'):
    data[date_col] = pd.to_datetime(data[date_col], dayfirst=True)

# Policy length in whole days.
policy_span = data['Policy End Date'].sub(data['Policy Start Date'])
data['Policy Duration'] = policy_span.dt.days

# Ratio features. No guard against a zero premium or a zero-day policy is applied
# here, so such rows produce inf/NaN values in these columns.
data['Claim Ratio'] = data['Claim Amount (AUD)'].div(data['Annual Premium (AUD)'])
data['Premium per Day'] = data['Annual Premium (AUD)'].div(data['Policy Duration'])

# Age buckets. With right=False the intervals are [20, 30), [30, 40), ... so ages
# below 20 or from 80 upwards fall outside every bucket and become missing.
# NOTE(review): the labels read as inclusive upper bounds ('21-30'), but an age of
# 30 is placed in '31-40' and an age of 20 in '21-30' -- confirm which is intended.
# recommend_tier() uses the same bins, so change both together.
bins = [20, 30, 40, 50, 60, 70, 80]
labels = ['21-30', '31-40', '41-50', '51-60', '61-70', '71-80']
data['Age Group'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

# The raw date columns are no longer needed once the duration is derived.
data = data.drop(columns=['Policy Start Date', 'Policy End Date'])

# Columns handed to CatBoost as categorical features.
cat_features = ['State', 'Insurance Type', 'Claim Status', 'Payment Frequency', 'Age Group']

# Cast every categorical column to string, then fill missing values.
# NOTE(review): on pandas versions where astype(str) turns NaN into the text 'nan',
# the fillna('Unknown') that follows has nothing left to fill. recommend_tier()
# encodes its input the same way, so any change must be made in both places.
data = data.assign(
    **{name: data[name].astype(str).fillna('Unknown') for name in cat_features}
)

In [28]:
# -----------------------------
# 4. Predict Risk Score (Regression)
# -----------------------------
# Features for the regressor: everything except the two targets.
# The Product Tier cell later writes a 'Predicted Risk Score' column into `data`.
# Drop it here when present so that re-running this cell after that one never
# trains the regressor on its own earlier predictions (and never changes the
# feature set the rest of the notebook expects).
X_risk = (
    data.drop(columns=['Risk Score', 'Product Tier'])
        .drop(columns=['Predicted Risk Score'], errors='ignore')
)
y_risk = data['Risk Score']

# Train-test split (80/20, fixed seed for reproducibility)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_risk, y_risk, test_size=0.2, random_state=42
)

# Train CatBoost Regressor; progress is logged every 50 iterations.
risk_model = CatBoostRegressor(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    verbose=50
)

train_pool_risk = Pool(Xr_train, yr_train, cat_features=cat_features)
test_pool_risk = Pool(Xr_test, yr_test, cat_features=cat_features)

# NOTE(review): the eval_set is the same split the RMSE below is reported on. If
# CatBoost picks its best iteration from this set, the reported RMSE is slightly
# optimistic -- consider a separate validation split for model selection.
risk_model.fit(train_pool_risk, eval_set=test_pool_risk)

# Predict Risk Score on the held-out rows and report RMSE
yr_pred = risk_model.predict(Xr_test)
print("RMSE on Risk Score prediction:", np.sqrt(mean_squared_error(yr_test, yr_pred)))

0:	learn: 0.1910809	test: 0.1905852	best: 0.1905852 (0)	total: 46.6ms	remaining: 23.3s
50:	learn: 0.0697428	test: 0.0697563	best: 0.0697563 (50)	total: 2.55s	remaining: 22.5s
100:	learn: 0.0693578	test: 0.0695586	best: 0.0695586 (100)	total: 4.81s	remaining: 19s
150:	learn: 0.0691106	test: 0.0695079	best: 0.0695052 (146)	total: 7.27s	remaining: 16.8s
200:	learn: 0.0688670	test: 0.0694879	best: 0.0694855 (183)	total: 9.85s	remaining: 14.6s
250:	learn: 0.0686696	test: 0.0694929	best: 0.0694855 (183)	total: 12.4s	remaining: 12.3s
300:	learn: 0.0684810	test: 0.0694992	best: 0.0694855 (183)	total: 15.2s	remaining: 10s
350:	learn: 0.0682986	test: 0.0695354	best: 0.0694855 (183)	total: 17.9s	remaining: 7.6s
400:	learn: 0.0681385	test: 0.0695447	best: 0.0694855 (183)	total: 20.8s	remaining: 5.14s
450:	learn: 0.0679340	test: 0.0695704	best: 0.0694855 (183)	total: 24.1s	remaining: 2.62s
499:	learn: 0.0677701	test: 0.0695787	best: 0.0694855 (183)	total: 28.4s	remaining: 0us

bestTest = 0.06948547

In [29]:
# -----------------------------
# 5. Predict Product Tier (Classification)
# -----------------------------
# Add predicted Risk Score as a feature.
# The regressor must receive exactly the columns it was trained on, so besides the
# two targets we also drop any 'Predicted Risk Score' column left in `data` by a
# previous run of this cell. This keeps the cell safe to re-run.
X_full = (
    data.drop(columns=['Product Tier', 'Risk Score'])
        .drop(columns=['Predicted Risk Score'], errors='ignore')
)
# NOTE(review): these predictions include the rows the regressor was fitted on
# (in-sample), so this feature may look more informative during training than it
# will be for new users -- out-of-fold predictions would avoid that.
data['Predicted Risk Score'] = risk_model.predict(X_full)

X_tier = data.drop(columns=['Product Tier', 'Risk Score'])
y_tier = data['Product Tier']

# Encode target labels as integers; le_y is reused later to decode predictions.
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y_tier)

# Train-test split (stratified so each tier keeps its share in both parts)
Xt_train, Xt_test, yt_train, yt_test = train_test_split(
    X_tier, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Ensure categorical features are strings. `.assign` builds new frames instead of
# writing into the objects returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning while producing the same values.
Xt_train = Xt_train.assign(
    **{col: Xt_train[col].astype(str).fillna('Unknown') for col in cat_features}
)
Xt_test = Xt_test.assign(
    **{col: Xt_test[col].astype(str).fillna('Unknown') for col in cat_features}
)

# Train CatBoost Classifier; progress is logged every 50 iterations.
cat_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=50
)

train_pool_tier = Pool(Xt_train, yt_train, cat_features=cat_features)
test_pool_tier = Pool(Xt_test, yt_test, cat_features=cat_features)

# NOTE(review): the eval_set is also the split used for the metrics below, so the
# reported accuracy may be slightly optimistic if the best iteration is chosen on it.
cat_model.fit(train_pool_tier, eval_set=test_pool_tier)

# Evaluate: predictions come back as a column of encoded class ids; flatten them
# and map both predictions and ground truth back to the original tier names.
yt_pred = cat_model.predict(Xt_test).flatten().astype(int)
yt_test_decoded = le_y.inverse_transform(yt_test)
yt_pred_decoded = le_y.inverse_transform(yt_pred)

print("Accuracy for Product Tier:", accuracy_score(yt_test_decoded, yt_pred_decoded))
print("\nClassification Report:\n", classification_report(yt_test_decoded, yt_pred_decoded))

0:	learn: 1.2411198	test: 1.2410021	best: 1.2410021 (0)	total: 92.4ms	remaining: 46.1s
50:	learn: 0.5080492	test: 0.5087413	best: 0.5087413 (50)	total: 8.08s	remaining: 1m 11s
100:	learn: 0.4952542	test: 0.4983904	best: 0.4983904 (100)	total: 15.7s	remaining: 1m 1s
150:	learn: 0.4884749	test: 0.4946043	best: 0.4946043 (150)	total: 24.7s	remaining: 57s
200:	learn: 0.4840264	test: 0.4932643	best: 0.4932643 (200)	total: 31.2s	remaining: 46.4s
250:	learn: 0.4797522	test: 0.4925471	best: 0.4925189 (249)	total: 37.2s	remaining: 37s
300:	learn: 0.4758617	test: 0.4918811	best: 0.4917998 (296)	total: 43.3s	remaining: 28.6s
350:	learn: 0.4719076	test: 0.4910924	best: 0.4910597 (326)	total: 48.9s	remaining: 20.8s
400:	learn: 0.4688651	test: 0.4909506	best: 0.4908606 (395)	total: 54.6s	remaining: 13.5s
450:	learn: 0.4653653	test: 0.4908722	best: 0.4907515 (421)	total: 1m	remaining: 6.58s
499:	learn: 0.4622038	test: 0.4906273	best: 0.4906273 (499)	total: 1m 6s	remaining: 0us

bestTest = 0.490627295

In [30]:
# -----------------------------
# 6. Function to recommend tier for a new user
# -----------------------------
def _engineer_user_features(df):
    """Apply the notebook's training-time feature engineering to a raw user frame."""
    start = pd.to_datetime(df['Policy Start Date'], dayfirst=True)
    end = pd.to_datetime(df['Policy End Date'], dayfirst=True)

    out = df.drop(columns=['Policy Start Date', 'Policy End Date'])
    out['Policy Duration'] = (end - start).dt.days
    # No zero guards, to stay identical to the training features: a zero premium
    # or a zero-day policy yields inf/NaN here exactly as it does in training.
    out['Claim Ratio'] = out['Claim Amount (AUD)'] / out['Annual Premium (AUD)']
    out['Premium per Day'] = out['Annual Premium (AUD)'] / out['Policy Duration']

    # Same bins/labels as the training cell; ages outside [20, 80) become missing.
    bins = [20, 30, 40, 50, 60, 70, 80]
    labels = ['21-30', '31-40', '41-50', '51-60', '61-70', '71-80']
    out['Age Group'] = pd.cut(out['Age'], bins=bins, labels=labels, right=False)

    # Same categorical encoding as training (kept as-is on purpose so that a
    # missing value is represented identically at training and inference time).
    for col in cat_features:
        out[col] = out[col].astype(str).fillna('Unknown')
    return out


def _align_to_model(frame, model):
    """Return `frame` restricted to, and ordered like, the model's training features."""
    expected = getattr(model, 'feature_names_', None)
    if expected is None:
        # Model does not expose its feature names; pass the frame through unchanged.
        return frame
    expected = list(expected)
    missing = [name for name in expected if name not in frame.columns]
    if missing:
        raise ValueError(
            f"user_input_dict is missing features required by the model: {missing}"
        )
    return frame[expected]


def recommend_tier(user_input_dict, top_n=2):
    """Recommend the most probable product tiers for a single new user.

    The user's raw fields are run through the same feature engineering as the
    training data, the risk regressor predicts a risk score, and that score is
    fed (as 'Predicted Risk Score') into the tier classifier.

    Parameters
    ----------
    user_input_dict : dict
        Raw user fields. Must contain 'Age', 'Annual Premium (AUD)',
        'Claim Amount (AUD)', 'Policy Start Date', 'Policy End Date'
        (day-first date strings) and every column listed in `cat_features`
        except the derived 'Age Group'. Key order does not matter, and keys
        the models were not trained on are ignored.
    top_n : int, default 2
        Number of tiers to return, most probable first.

    Returns
    -------
    recommendations : list of (tier, probability)
        The `top_n` tiers with their predicted probabilities, in descending order.
    explanation : list of str
        One human-readable sentence per recommendation.

    Raises
    ------
    KeyError
        If a field needed for feature engineering is absent from the input.
    ValueError
        If `top_n` is negative, or a model feature cannot be derived from the input.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    features = _engineer_user_features(pd.DataFrame([user_input_dict]))

    # Predict Risk Score first. Columns are aligned by name to the regressor's
    # training order, so the result does not depend on the dict's key order.
    risk_input = _align_to_model(features, risk_model)
    risk_score = float(np.asarray(risk_model.predict(risk_input)).ravel()[0])
    features['Predicted Risk Score'] = risk_score

    # Predict Product Tier probabilities for the single row.
    tier_input = _align_to_model(features, cat_model)
    proba = np.asarray(cat_model.predict_proba(tier_input))[0]
    top_indices = np.argsort(proba)[::-1][:top_n]
    recommendations = [(le_y.classes_[i], proba[i]) for i in top_indices]

    # Explanation
    explanation = []
    for tier, p in recommendations:
        reason = f"Predicted Risk Score: {risk_score:.2f}, " \
                 f"Annual Premium: {user_input_dict['Annual Premium (AUD)']}, " \
                 f"Insurance Type: {user_input_dict['Insurance Type']} → '{tier}' recommended (prob={p:.2f})"
        explanation.append(reason)

    return recommendations, explanation

In [31]:
# -----------------------------
# 7. Example for new user
# -----------------------------
# Raw fields for one hypothetical customer; dates are day-first strings.
new_user = {
    'Age': 21,
    'State': 'SA',
    'Insurance Type': 'Health',
    'Annual Premium (AUD)': 2500,
    'Claim Amount (AUD)': 12000,
    'Claim Status': 'Approved',
    'Payment Frequency': 'Monthly',
    'Policy Start Date': '01-01-2024',
    'Policy End Date': '31-12-2024'
}

# Ask for the two most probable tiers together with their explanations.
recommendations, explanation = recommend_tier(new_user, top_n=2)

# Tier name and probability, most probable first.
print("Top Recommendations:")
for tier_name, tier_prob in recommendations:
    print(f"{tier_name}: {tier_prob:.2f}")

# One explanatory sentence per recommended tier.
print("\nExplanation:")
for reason in explanation:
    print(reason)

Top Recommendations:
Standard: 0.57
Basic: 0.43

Explanation:
Predicted Risk Score: 0.48, Annual Premium: 2500, Insurance Type: Health → 'Standard' recommended (prob=0.57)
Predicted Risk Score: 0.48, Annual Premium: 2500, Insurance Type: Health → 'Basic' recommended (prob=0.43)
