In [1]:
#%pip install catboost


In [2]:
import pandas as pd
import numpy as np
import xgboost
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_log_error
)
import time
import warnings

In [3]:
# Rows holding a non-zero value in any of these columns are removed by
# drop_outlier_values().
REMOVE_VALUES_FROM = [
    'specialhealthexpenses',
    'specialfixes',
    'specialrehabilitation',
    'specialadditionalinjury',
]

# Columns removed from the frame by drop_unwanted_columns().
DROP_COLUMNS = [
    'driverage',
    'vehicleage',
    'accidentdate',
    'claimdate',
    'policereportfiled',
    'witnesspresent',
    'dominantinjury',
    'vehicletype',
    'weatherconditions',
    'gender',
    'numberofpassengers',
    'accidentdescription',
    'injurydescription',
]

# Columns cast to pandas 'category' dtype by convert_columns_to_category().
CATEGORY_COLUMNS = [
    'accidenttype',
    'exceptionalcircumstances',
    'minorpsychologicalinjury',
    'whiplash',
]


def apply_tariff_bands(df):
    """Replace ``injuryprognosis`` with its whiplash-tariff band index (0-7).

    Bands (upper bound inclusive):

        <= 3 -> 0,  (3, 6] -> 1,  (6, 9] -> 2,  (9, 12] -> 3,
        (12, 15] -> 4,  (15, 18] -> 5,  (18, 24] -> 6,  > 24 -> 7

    For whole-number inputs this matches the previous if/elif chain exactly.
    Fractional inputs (e.g. 3.5), which previously fell between the integer
    ranges and were left unbanded, now land in the band above the lower edge.
    Missing values stay missing.

    Band 7 lies beyond the top of the tariff scale: any prediction made for a
    row in band 7 must be flagged with a prediction warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``injuryprognosis`` column
        (presumably a duration in months - TODO confirm).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``injuryprognosis`` column is overwritten
        with the band index stored as ``category`` dtype.
    """
    # Right-closed bins: (-inf, 3], (3, 6], ..., (18, 24], (24, inf).
    band_edges = [-np.inf, 3, 6, 9, 12, 15, 18, 24, np.inf]

    # labels=False yields the integer bin position, i.e. the band index itself.
    band_index = pd.cut(df['injuryprognosis'], bins=band_edges, labels=False, right=True)

    df['injuryprognosis'] = band_index.astype('category')
    return df

#############

def drop_outlier_values(df, columns=None):
    """Drop, in place, every row with a non-zero value in any watched column.

    TODO: at prediction time, raise a prediction warning when values in these
    columns exceed a certain threshold (possibly based on the model's MAPE).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is modified in place. Index labels are assumed to
        be unique, because rows are removed by label.
    columns : list of str, optional
        Columns to inspect. Defaults to the module-level ``REMOVE_VALUES_FROM``.
        A ``KeyError`` is raised if any of them is missing from ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the offending rows removed.
    """
    if columns is None:
        columns = REMOVE_VALUES_FROM

    # `!= 0` is also True for NaN, so rows with a missing value in a watched
    # column are dropped as well (same outcome as the row-by-row comparison).
    has_flagged_value = (df[list(columns)] != 0).any(axis=1).to_numpy()

    # One drop call instead of one per offending row.
    df.drop(index=df.index[has_flagged_value], inplace=True)
    return df


def convert_columns_to_category(df):
    """Cast each CATEGORY_COLUMNS column found in ``df`` to ``category`` dtype.

    Columns listed in CATEGORY_COLUMNS but absent from ``df`` are skipped.
    ``df`` is modified in place and also returned.
    """
    present = [name for name in CATEGORY_COLUMNS if name in df.columns]
    for name in present:
        df[name] = df[name].astype('category')
    return df

def drop_unwanted_columns(df):
    """Remove, in place, every DROP_COLUMNS column that exists in ``df``.

    Columns listed in DROP_COLUMNS but absent from ``df`` are ignored.
    Returns the same ``df`` object.
    """
    unwanted = [name for name in DROP_COLUMNS if name in df.columns]
    df.drop(columns=unwanted, inplace=True)
    return df


def log_convert_targets(df):
    """Overwrite the column labelled ``0`` with ``log1p`` of itself; return ``df``.

    Meant to be run on the target column before predictions, to reduce
    skewness. ``df`` is modified in place.
    """
    raw_target = df[0]
    df[0] = np.log1p(raw_target)
    return df

def inverse_log_convert_targets(df):
    """Overwrite the column labelled ``0`` with ``expm1`` of itself; return ``df``.

    Meant to be run on the target column after predictions, to bring
    log1p-scale values back to the original scale. ``df`` is modified in place.
    """
    log_target = df[0]
    df[0] = np.expm1(log_target)
    return df


# Load the cleaned claims data and run the preprocessing helpers in order:
# drop unused columns -> drop rows with flagged special-damages values ->
# band the injury prognosis. Column names in the file are lowercase with no
# separators (see the preview below).
df = (
    pd.read_csv('clean_df.csv')
    .pipe(drop_unwanted_columns)
    .pipe(drop_outlier_values)
    .pipe(apply_tariff_bands)
)
# Category conversion is left disabled here; the CATEGORY_COLUMNS are dropped
# in the next cell instead.
#df = convert_columns_to_category(df)

df.head()

Unnamed: 0,settlementvalue,injuryprognosis,whiplash,minorpsychologicalinjury,exceptionalcircumstances,generalfixed,generaluplift,generalrest,specialhealthexpenses,specialtherapy,...,specialearningsloss,specialusageloss,specialreduction,specialoverage,specialassetdamage,specialfixes,specialloanervehicle,specialtripcosts,specialjourneyexpenses,accidenttype
0,520,1,1,1,0,520,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Rear end
1,870,0,1,1,0,260,0,520,0,0,...,0,90,0,0,0,0,0,0,0,Rear end
2,2140,2,1,0,0,840,0,1400,0,0,...,0,0,0,0,0,0,0,0,0,Other side pulled out of side road
3,520,1,1,1,0,520,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Rear end - Clt pushed into next vehicle
4,260,0,0,1,0,260,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Rear end


In [4]:
# Silence FutureWarning noise (e.g. emitted by the modelling libraries).
warnings.simplefilter(action='ignore', category=FutureWarning)

# Model the target on the log1p scale to reduce skewness. Predictions are
# mapped back with np.expm1 before metrics are computed.
# NOTE: this line is not idempotent - re-running this cell without first
# re-running the loading cell applies log1p a second time.
df['settlementvalue'] = np.log1p(df['settlementvalue'])

# The categorical columns are removed from the frame rather than encoded,
# so the models below only see numeric features.
category_cols_present = [name for name in CATEGORY_COLUMNS if name in df.columns]
df.drop(columns=category_cols_present, inplace=True)

# apply_tariff_bands() stored the band as 'category'; use plain integers instead.
df['injuryprognosis'] = df['injuryprognosis'].astype(int)

df.head()

Unnamed: 0,settlementvalue,injuryprognosis,generalfixed,generaluplift,generalrest,specialhealthexpenses,specialtherapy,specialrehabilitation,specialmedications,specialadditionalinjury,specialearningsloss,specialusageloss,specialreduction,specialoverage,specialassetdamage,specialfixes,specialloanervehicle,specialtripcosts,specialjourneyexpenses
0,6.25575,1,520,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,6.769642,0,260,0,520,0,0,0,0,0,0,90,0,0,0,0,0,0,0
2,7.669028,2,840,0,1400,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6.25575,1,520,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5.56452,0,260,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:

# --- 1. Define features (X) and target (y) ---
# 'settlementvalue' was log1p-transformed in the previous cell, so y is on the
# log scale.
# NOTE(review): X still contains the general*/special* amount columns. In the
# sample rows displayed above they add up to settlementvalue
# (e.g. 260 + 520 + 90 = 870), so confirm these amounts are genuinely known at
# prediction time and are not leaking the target.
X = df.drop(columns="settlementvalue")
y = df['settlementvalue']

# --- 2. Set up K-Fold ---
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)  # fixed seed for reproducible folds

# --- 3. Define Models ---
# Every model gets random_state=42 for reproducibility and 100 trees/iterations
# for a rough like-for-like comparison.

# XGBoost. Early stopping is not used because the plain K-Fold loop below
# passes no eval_set. enable_categorical has no effect on the current frame:
# the CATEGORY_COLUMNS were dropped in the previous cell and injuryprognosis
# was cast to int.
xgb_model = xgboost.XGBRegressor(
    n_estimators=100,
    enable_categorical=True,
    verbosity=0,      # keep the CV output clean
    random_state=42,
    n_jobs=-1,        # use all available CPU cores
)

# RandomForest. max_depth=None expands nodes until pure or until
# min_samples_split is reached.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,        # use all available CPU cores
    max_depth=None,
)

# CatBoost. Categorical features are passed by column position. With the
# CATEGORY_COLUMNS dropped from the frame this list is empty, but it is still
# derived from X so it stays correct if those columns are kept in future.
categorical_features_indices = [
    X.columns.get_loc(col) for col in CATEGORY_COLUMNS if col in X.columns
]

cat_model = CatBoostRegressor(
    iterations=100,        # equivalent to n_estimators
    random_state=42,
    loss_function='RMSE',  # standard loss for regression
    cat_features=categorical_features_indices,
    verbose=0,             # suppress per-iteration training output
)

# Removed: a `df = drop_unwanted_columns(df)` call that used to sit here. It ran
# after X and y had already been extracted, and those columns were already
# dropped when the data was loaded, so it had no effect.

models = {
    "XGBoost": xgb_model,
    "RandomForest": rf_model,
    "CatBoost": cat_model
}

# --- 4. Cross-Validation Loop ---
# For every model: fit on the log-scale target in each fold, predict the
# held-out fold, map predictions and actuals back to pounds with expm1, and
# score on the pound scale. `results` maps model name -> mean/std of each
# metric across folds plus the total wall-clock time.

results = {}

# Added to the actuals before MAPE as a guard against zero-valued targets.
epsilon = 1e-8

for model_name, model in models.items():
    print(f"--- Running Cross-Validation for: {model_name} ---")
    fold_metrics = {
        'rmse': [],
        'mae': [],
        'r2': [],
        'mape': [],
        'rmsle': []
    }
    start_time = time.time()

    for train_index, val_index in kf.split(X, y):
        # Split data for this fold (targets are log-transformed).
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # All three models are fitted the same way: any model-specific
        # configuration (e.g. CatBoost cat_features) was supplied at construction.
        model.fit(X_train_fold, y_train_fold)

        # Predictions come back on the log scale.
        log_preds_fold = model.predict(X_val_fold)

        # Back to the original pound scale, clamped at zero because expm1 of a
        # negative log-prediction is negative.
        actual_pounds_fold = np.expm1(y_val_fold).clip(lower=0)
        pred_pounds_fold = np.clip(np.expm1(log_preds_fold), 0, None)

        # Metrics on the original pound scale.
        fold_rmse = np.sqrt(mean_squared_error(actual_pounds_fold, pred_pounds_fold))
        fold_mae = mean_absolute_error(actual_pounds_fold, pred_pounds_fold)
        try:
            fold_r2 = r2_score(actual_pounds_fold, pred_pounds_fold)
        except ValueError:
            # Best-effort: record NaN and let nanmean/nanstd skip it below.
            fold_r2 = np.nan

        # MAPE expressed as a percentage.
        fold_mape = mean_absolute_percentage_error(actual_pounds_fold + epsilon, pred_pounds_fold) * 100

        # RMSLE: mean_squared_log_error expects original-scale, non-negative
        # values, which the clamping above guarantees.
        fold_rmsle = np.sqrt(mean_squared_log_error(actual_pounds_fold, pred_pounds_fold))

        fold_metrics['rmse'].append(fold_rmse)
        fold_metrics['mae'].append(fold_mae)
        fold_metrics['r2'].append(fold_r2)
        fold_metrics['mape'].append(fold_mape)
        fold_metrics['rmsle'].append(fold_rmsle)

    # Mean and standard deviation across folds for this model.
    results[model_name] = {
        'RMSE_mean': np.mean(fold_metrics['rmse']),
        'RMSE_std': np.std(fold_metrics['rmse']),
        'MAE_mean': np.mean(fold_metrics['mae']),
        'MAE_std': np.std(fold_metrics['mae']),
        'R2_mean': np.nanmean(fold_metrics['r2']),  # nan-aware in case an R2 was recorded as NaN
        'R2_std': np.nanstd(fold_metrics['r2']),
        'MAPE_mean': np.mean(fold_metrics['mape']),
        'MAPE_std': np.std(fold_metrics['mape']),
        'RMSLE_mean': np.mean(fold_metrics['rmsle']),
        'RMSLE_std': np.std(fold_metrics['rmsle']),
        'Total_Time_sec': time.time() - start_time  # includes fitting, prediction and scoring
    }
    print(f"--- Finished {model_name} in {results[model_name]['Total_Time_sec']:.2f} seconds ---")


# --- 5. Aggregate & Report ---
# One row per model, one column per aggregated metric.
print("\n--- Cross-Validation Results ({} folds) ---".format(N_SPLITS))

results_df = pd.DataFrame(results).T

# Round the mean/std columns to 4 decimals and the timing column to 2.
float_cols = [name for name in results_df.columns if 'mean' in name or 'std' in name]
rounded_stats = results_df[float_cols].astype(float).round(4)
results_df[float_cols] = rounded_stats
results_df['Total_Time_sec'] = results_df['Total_Time_sec'].astype(float).round(2)

# Timestamp the run, then show the key metrics (MAE is computed but not shown).
report_columns = [
    'RMSLE_mean', 'RMSLE_std',
    'RMSE_mean', 'RMSE_std',
    'MAPE_mean', 'MAPE_std',
    'R2_mean', 'R2_std',
    'Total_Time_sec',
]
print('Current Time:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
print(results_df[report_columns])

--- Running Cross-Validation for: XGBoost ---
--- Finished XGBoost in 1.13 seconds ---
--- Running Cross-Validation for: RandomForest ---
--- Finished RandomForest in 2.02 seconds ---
--- Running Cross-Validation for: CatBoost ---
--- Finished CatBoost in 1.95 seconds ---

--- Cross-Validation Results (5 folds) ---
Current Time: 2025-05-03 07:57:47
              RMSLE_mean  RMSLE_std  RMSE_mean  RMSE_std  MAPE_mean  MAPE_std  \
XGBoost           0.1392     0.0122   229.4139   35.6973     6.6871    0.4079   
RandomForest      0.1409     0.0160   247.6740   53.7258     5.7603    0.6154   
CatBoost          0.1680     0.0084   273.4129   26.5418    10.1967    0.3926   

              R2_mean  R2_std  Total_Time_sec  
XGBoost        0.9246  0.0291            1.13  
RandomForest   0.9092  0.0478            2.02  
CatBoost       0.8949  0.0283            1.95  
