In [1]:
# Task 4: Predictive Modeling for Risk-Based Insurance Pricing

# === Step 1: Load Libraries ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import shap
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# === Step 2: Load Dataset ===
df = pd.read_csv("../data/MachineLearningRating_v3_cleaned_task3.csv")

In [5]:
#display first few rows
print(df.head())



   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected CoverCategory  \
0  Current account  ...             Mobility - 

In [6]:
#display dataset info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000098 entries, 0 to 1000097
Data columns (total 50 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   UnderwrittenCoverID       1000098 non-null  int64  
 1   PolicyID                  1000098 non-null  int64  
 2   TransactionMonth          1000098 non-null  object 
 3   IsVATRegistered           1000098 non-null  bool   
 4   Citizenship               1000098 non-null  object 
 5   LegalType                 1000098 non-null  object 
 6   Title                     1000098 non-null  object 
 7   Language                  1000098 non-null  object 
 8   Bank                      1000098 non-null  object 
 9   AccountType               1000098 non-null  object 
 10  MaritalStatus             1000098 non-null  object 
 11  Gender                    1000098 non-null  object 
 12  Country                   1000098 non-null  object 
 13  Province                  1

In [7]:
#check for missing values and generate in percentage
missing_percentage = df.isnull().mean() * 100   
print("Missing values in percentage:\n", missing_percentage)
#check for null values  
print(df.isnull().sum())


Missing values in percentage:
 UnderwrittenCoverID         0.0
PolicyID                    0.0
TransactionMonth            0.0
IsVATRegistered             0.0
Citizenship                 0.0
LegalType                   0.0
Title                       0.0
Language                    0.0
Bank                        0.0
AccountType                 0.0
MaritalStatus               0.0
Gender                      0.0
Country                     0.0
Province                    0.0
PostalCode                  0.0
MainCrestaZone              0.0
SubCrestaZone               0.0
ItemType                    0.0
mmcode                      0.0
VehicleType                 0.0
RegistrationYear            0.0
make                        0.0
Model                       0.0
Cylinders                   0.0
cubiccapacity               0.0
kilowatts                   0.0
bodytype                    0.0
NumberOfDoors               0.0
VehicleIntroDate            0.0
CustomValueEstimate         0.0
AlarmImmo

In [8]:
# Filter out rows with TotalPremium = 0 or null to avoid divide-by-zero
if 'TotalPremium' in df.columns and 'TotalClaims' in df.columns:
    df = df[(df['TotalPremium'] != 0) & (~df['TotalPremium'].isnull())]
    df['LossRatio'] = df['TotalClaims'] / df['TotalPremium']

In [10]:
# Convert all object or categorical columns to string type explicitly
for col in df.select_dtypes(include=['object', 'category']).columns:
    df[col] = df[col].astype(str)


In [11]:
# Convert all object or categorical columns to string type explicitly
for col in df.select_dtypes(include=['object', 'category']).columns:
    df[col] = df[col].astype(str)


In [12]:
# Create binary target for classification
df['ClaimOccurred'] = (df['TotalClaims'] > 0).astype(int)

# Encode categorical variables
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col])

In [13]:
# === Step 4: Claim Severity Prediction (Regression on TotalClaims > 0) ===
severity_df = df[df['TotalClaims'] > 0]
X_sev = severity_df.drop(['TotalClaims', 'CalculatedPremiumPerTerm', 'ClaimOccurred'], axis=1)
y_sev = severity_df['TotalClaims']
X_train_sev, X_test_sev, y_train_sev, y_test_sev = train_test_split(X_sev, y_sev, test_size=0.2, random_state=42)


In [14]:
# Models
models_reg = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}


In [15]:
print("\n--- Claim Severity Prediction ---")
for name, model in models_reg.items():
    model.fit(X_train_sev, y_train_sev)
    y_pred = model.predict(X_test_sev)
    print(f"{name} RMSE: {np.sqrt(mean_squared_error(y_test_sev, y_pred)):.4f}, R2: {r2_score(y_test_sev, y_pred):.4f}")



--- Claim Severity Prediction ---
LinearRegression RMSE: 23749.0316, R2: 0.4316
RandomForest RMSE: 3963.9337, R2: 0.9842
XGBoost RMSE: 7550.8374, R2: 0.9425


In [16]:
# === Step 5: Premium Prediction (Regression) ===
X_prem = df.drop(['CalculatedPremiumPerTerm', 'TotalClaims', 'ClaimOccurred'], axis=1)
y_prem = df['CalculatedPremiumPerTerm']
X_train_prem, X_test_prem, y_train_prem, y_test_prem = train_test_split(X_prem, y_prem, test_size=0.2, random_state=42)

print("\n--- Premium Prediction ---")
for name, model in models_reg.items():
    model.fit(X_train_prem, y_train_prem)
    y_pred = model.predict(X_test_prem)
    print(f"{name} RMSE: {np.sqrt(mean_squared_error(y_test_prem, y_pred)):.4f}, R2: {r2_score(y_test_prem, y_pred):.4f}")



--- Premium Prediction ---
LinearRegression RMSE: 41.8744, R2: 0.9882
RandomForest RMSE: 14.3140, R2: 0.9986
XGBoost RMSE: 13.5370, R2: 0.9988


In [17]:
# === Step 6: Claim Probability Prediction (Classification) ===
X_clf = df.drop(['ClaimOccurred', 'TotalClaims', 'CalculatedPremiumPerTerm'], axis=1)
y_clf = df['ClaimOccurred']
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)

models_clf = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

print("\n--- Claim Probability Prediction ---")
for name, model in models_clf.items():
    model.fit(X_train_clf, y_train_clf)
    y_pred = model.predict(X_test_clf)
    print(f"{name} Accuracy: {accuracy_score(y_test_clf, y_pred):.4f}, Precision: {precision_score(y_test_clf, y_pred):.4f}, Recall: {recall_score(y_test_clf, y_pred):.4f}, F1: {f1_score(y_test_clf, y_pred):.4f}")



--- Claim Probability Prediction ---
LogisticRegression Accuracy: 0.9956, Precision: 0.6000, Recall: 0.0055, F1: 0.0109
RandomForest Accuracy: 1.0000, Precision: 1.0000, Recall: 0.9982, F1: 0.9991
XGBoost Accuracy: 0.9997, Precision: 0.9980, Recall: 0.9282, F1: 0.9618
