In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import shap
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the dataset
file_path = "MachineLearningRating_v3.txt"
df = pd.read_csv(file_path, sep="|")

In [4]:
# ------------------ DATA PREPARATION ------------------

# Drop rows with missing target variables
df = df.dropna(subset=['TotalClaims', 'CalculatedPremiumPerTerm'])

# Handle missing values for predictors
df.fillna(df.median(numeric_only=True), inplace=True)

# Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

In [5]:
# Feature engineering
df['margin'] = df['TotalPremium'] - df['TotalClaims']
df['claim_flag'] = df['TotalClaims'].apply(lambda x: 1 if x > 0 else 0)

In [6]:
# Subset for severity prediction (only rows where claims > 0)
df_severity = df[df['TotalClaims'] > 0].copy()

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# ------------------ SEVERITY MODELING ------------------

# Features and Target
X_sev = df_severity.drop(['TotalClaims', 'CalculatedPremiumPerTerm', 'claim_flag'], axis=1)
y_sev = df_severity['TotalClaims']

# Train-test split
X_train_sev, X_test_sev, y_train_sev, y_test_sev = train_test_split(X_sev, y_sev, test_size=0.2, random_state=42)

# Imputation strategy (you can use 'mean', 'median', or 'most_frequent')
imputer = SimpleImputer(strategy='median')

# Models wrapped in a pipeline with imputer
models_sev = {
    'LinearRegression': make_pipeline(imputer, LinearRegression()),
    'RandomForest': make_pipeline(imputer, RandomForestRegressor(random_state=42)),
    'XGBoost': make_pipeline(imputer, XGBRegressor(random_state=42))
}

print("\n--- Claim Severity Prediction Results ---")
for name, model in models_sev.items():
    model.fit(X_train_sev, y_train_sev)
    preds = model.predict(X_test_sev)
    mse = mean_squared_error(y_test_sev, preds)
    rmse = np.sqrt(mse)  # Manually compute RMSE
    r2 = r2_score(y_test_sev, preds)
    print(f"{name} -> RMSE: {rmse:.2f}, R^2: {r2:.2f}")




--- Claim Severity Prediction Results ---
LinearRegression -> RMSE: 0.00, R^2: 1.00
RandomForest -> RMSE: 4033.20, R^2: 0.99
XGBoost -> RMSE: 5184.11, R^2: 0.98


In [11]:

df_encoded = pd.get_dummies(df, drop_first=True)
X_clf = df_encoded.drop(['TotalClaims', 'CalculatedPremiumPerTerm'], axis=1, errors='ignore')
y_clf = df_encoded['claim_flag']
from sklearn.model_selection import train_test_split

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)




In [None]:
# ------------------ CLAIM PROBABILITY (CLASSIFICATION) ------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# --------- STEP 1: Prepare target and features ---------
# Assuming df is already loaded and has 'claim_flag' and 'TotalClaims'

# Drop unwanted target-related columns
X_clf = df.drop(['TotalClaims', 'CalculatedPremiumPerTerm'], axis=1, errors='ignore')
y_clf = df['claim_flag']

# Encode categorical columns
for col in X_clf.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_clf[col] = le.fit_transform(X_clf[col].astype(str))

# --------- STEP 2: Clean data ---------
# Convert nested or invalid objects to numeric (coerce errors into NaN)
X_clf = X_clf.apply(pd.to_numeric, errors='coerce')

# Drop rows where y_clf is NaN (just in case)
X_clf = X_clf[y_clf.notna()]
y_clf = y_clf[y_clf.notna()]

# Impute missing values with column means
imputer = SimpleImputer(strategy='mean')
X_clf = pd.DataFrame(imputer.fit_transform(X_clf), columns=X_clf.columns)

# --------- STEP 3: Sample (optional: reduce data size to avoid crashes) ---------
X_clf_sampled = X_clf.sample(n=2000, random_state=42) if len(X_clf) > 2000 else X_clf
y_clf_sampled = y_clf.loc[X_clf_sampled.index]

# --------- STEP 4: Train-test split ---------
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf_sampled, y_clf_sampled, test_size=0.2, random_state=42
)

# --------- STEP 5: Model definitions ---------
models_clf = {
    'LogisticRegression': LogisticRegression(max_iter=500, solver='liblinear'),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42),
    'XGBoostClassifier': XGBClassifier(n_estimators=50, max_depth=6, use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# --------- STEP 6: Train and Evaluate ---------
print("\n--- Claim Probability Prediction Results ---")
for name, model in models_clf.items():
    model.fit(X_train_clf, y_train_clf)
    preds = model.predict(X_test_clf)
    acc = accuracy_score(y_test_clf, preds)
    prec = precision_score(y_test_clf, preds)
    rec = recall_score(y_test_clf, preds)
    f1 = f1_score(y_test_clf, preds)
    print(f"{name} -> Acc: {acc:.2f}, Prec: {prec:.2f}, Recall: {rec:.2f}, F1: {f1:.2f}")


In [1]:
# ------------------ SHAP INTERPRETABILITY ------------------

# Fit best severity model (e.g., XGBoost)
best_model = XGBRegressor(random_state=42)
best_model.fit(X_train_sev, y_train_sev)
explainer = shap.Explainer(best_model)
shap_values = explainer(X_test_sev)

NameError: name 'XGBRegressor' is not defined

In [None]:
# Plot SHAP summary
shap.summary_plot(shap_values, X_test_sev, max_display=10)

In [None]:
# ------------------ BUSINESS INSIGHT (Example) ------------------
print("\nExample SHAP Insight:")
print("The SHAP analysis shows that VehicleAge, VehicleType_SUV, and ClientAge are among the top features influencing TotalClaims. This suggests our pricing model should place higher risk loading on older vehicles and SUV-type vehicles.")