Build and evaluate predictive models that form the core of a dynamic, risk-based pricing system.

In [5]:
import pandas as pd
import numpy as np
import sys
import os 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt
import seaborn as sns
sys.path.append(os.path.abspath("../scripts"))
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Make the parent of the notebook's starting directory the working directory,
# so that relative paths used by later cells (e.g. "Data/...") resolve.
# A bare os.chdir("..") climbs one more level every time this cell is re-run;
# remembering the resolved directory keeps the cell idempotent within a kernel session.
_NOTEBOOK_PROJECT_ROOT = globals().get("_NOTEBOOK_PROJECT_ROOT") or os.path.abspath("..")
os.chdir(_NOTEBOOK_PROJECT_ROOT)

In [7]:
from load_dataset import load_data

In [34]:
# Load the dataset through the project's load_data helper.
# The path is relative — presumably resolved against the current working directory.
raw_data_path = "Data/MachineLearningRating_v3.txt"
df = load_data(raw_data_path)

✅ Loaded total 1000098 rows.


In [35]:
def evaluate_model_performance(y_true, y_pred, model_name="Model"):
    """Score regression predictions and print RMSE and R² for one model.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label used in the printed report and in the returned dict.

    Returns:
        A dict with keys ``"Model"``, ``"RMSE"`` and ``"R2"``, or ``None`` when
        computing or printing the metrics raised (the error message is printed
        instead of being propagated).
    """
    try:
        mse = mean_squared_error(y_true, y_pred)
        scores = {
            "Model": model_name,
            "RMSE": np.sqrt(mse),
            "R2": r2_score(y_true, y_pred),
        }

        # Printing stays inside the try so a formatting failure is reported
        # through the same error path as a metric failure.
        print(f" {model_name} Performance:")
        for label, key in (("   RMSE: ", "RMSE"), ("   R²:   ", "R2")):
            print(f"{label}{scores[key]:.2f}")

        return scores
    except Exception as err:
        print(f" Error evaluating {model_name}: {err}")
        return None


In [36]:
# List every column name to see which fields are available for modelling.
column_names = df.columns.tolist()
print(column_names)

['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims']


In [37]:
# Predictor columns used by the models below.
features = ['CalculatedPremiumPerTerm', 'CustomValueEstimate']

# Keep only policies with a positive claim amount, then drop rows missing the
# target or any predictor. The trailing .copy() keeps df_claims an independent
# frame, so later edits to it never touch df.
df_claims = (
    df.loc[df['TotalClaims'] > 0]
    .dropna(subset=['TotalClaims', *features])
    .copy()
)

# Design matrix and target.
X = df_claims[features]
y = df_claims['TotalClaims']


In [39]:
def evaluate_models(X, y, test_size=0.2, random_state=42):
    """Fit three regressors on a hold-out split and return the best-scoring one.

    The data is split with ``train_test_split``; a ``LinearRegression``, a
    ``RandomForestRegressor`` and an ``XGBRegressor`` are fitted on the
    training part and scored on the test part with
    ``evaluate_model_performance``.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Fraction of rows held out for testing (default 0.2).
        random_state: Seed for the split and for the seeded estimators
            (default 42).

    Returns:
        A tuple ``(best_model, X_test, results)``:
        ``best_model`` is the fitted estimator with the lowest test RMSE,
        ``X_test`` is the held-out feature split, and ``results`` maps each
        model name to the dict returned by ``evaluate_model_performance``
        (``None`` for a model whose scoring failed).
        ``(None, None, None)`` is returned if anything raises or if no model
        could be scored; a message is printed in both cases.
    """
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        models = {
            "Linear Regression": LinearRegression(),
            "Random Forest": RandomForestRegressor(n_estimators=100, random_state=random_state),
            "XGBoost": XGBRegressor(
                n_estimators=100, learning_rate=0.1, max_depth=4, random_state=random_state
            ),
        }

        results = {}
        for name, model in models.items():
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            results[name] = evaluate_model_performance(y_test, preds, model_name=name)

        # Choose the winner from the measured test RMSE instead of assuming a
        # particular algorithm is best. Models whose scoring returned None are
        # left out of the comparison.
        rmse_by_model = {
            name: perf["RMSE"] for name, perf in results.items() if perf is not None
        }
        if not rmse_by_model:
            print("❌ Model evaluation failed: no model could be scored")
            return None, None, None

        best_name = min(rmse_by_model, key=rmse_by_model.get)
        return models[best_name], X_test, results
    except Exception as e:
        print(f"❌ Model evaluation failed: {e}")
        return None, None, None


In [40]:
# Train and score the candidate models. X_sample is the held-out feature split
# returned by evaluate_models.
best_model, X_sample, performance_report = evaluate_models(X, y)

if performance_report:
    print("\n Model Performance Summary:")
    for model_name, metrics in performance_report.items():
        # evaluate_model_performance returns None when scoring a model failed;
        # guard before indexing so one failed model does not raise a TypeError.
        if metrics is None:
            print(f" - {model_name}: evaluation failed")
            continue
        print(f" - {model_name}: RMSE = {metrics['RMSE']:.2f}, R² = {metrics['R2']:.2f}")


 Linear Regression Performance:
   RMSE: 42175.27
   R²:   0.25
 Random Forest Performance:
   RMSE: 47313.96
   R²:   0.05
 XGBoost Performance:
   RMSE: 49037.09
   R²:   -0.02

 Model Performance Summary:
 - Linear Regression: RMSE = 42175.27, R² = 0.25
 - Random Forest: RMSE = 47313.96, R² = 0.05
 - XGBoost: RMSE = 49037.09, R² = -0.02
