In [54]:
import pandas as pd
import numpy as np
import xgboost as xgb

claims_data = pd.read_excel("claims_data.xlsx")

In [55]:
claims_data.loc[claims_data["Notification_period"] < 0, "Notification_period"] = np.nan
claims_data.loc[claims_data["PH_considered_TP_at_fault"] == "#", "PH_considered_TP_at_fault"] = np.nan

In [56]:
claims_data.drop(columns=["Claim Number", "date_of_loss", "Loss_code", "Loss_description", "Capped Incurred"], inplace=True)

In [57]:
#change the columns with object datatype to category
for col in claims_data.select_dtypes(include=["object"]).columns:
    claims_data[col] = claims_data[col].astype("category")
   

In [58]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_categorical_columns(df: pd.DataFrame) -> (pd.DataFrame, OneHotEncoder):
    """
    One-hot encodes all categorical columns in the DataFrame using OneHotEncoder.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame containing columns to be encoded.
    
    Returns:
    pd.DataFrame: The DataFrame with categorical columns one-hot encoded.
    """
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    encoded_df = pd.DataFrame(ohe.fit_transform(df[categorical_columns]))
    
    # Name the columns appropriately
    encoded_df.columns = ohe.get_feature_names_out(categorical_columns)
    df = df.drop(columns=categorical_columns).reset_index(drop=True)
    encoded_df = encoded_df.reset_index(drop=True)
    
    return pd.concat([df, encoded_df], axis=1), ohe

encoded_claims, encoder = one_hot_encode_categorical_columns(claims_data)

In [59]:
encoder.get_feature_names_out()

array(['Notifier_NamedDriver', 'Notifier_Other', 'Notifier_PH',
       'Notifier_TP', 'Location_of_incident_Home Address',
       'Location_of_incident_Main Road',
       'Location_of_incident_Minor Road', 'Location_of_incident_Motorway',
       'Location_of_incident_Not Applicable',
       'Location_of_incident_Other', 'Location_of_incident_n/k',
       'Weather_conditions_NORMAL', 'Weather_conditions_SNOW,ICE,FOG',
       'Weather_conditions_WET', 'Weather_conditions_nan',
       'Vehicle_mobile_Y', 'Vehicle_mobile_n/k', 'Main_driver_Other',
       'Main_driver_Y', 'PH_considered_TP_at_fault_Y',
       'PH_considered_TP_at_fault_n/k', 'PH_considered_TP_at_fault_nan'],
      dtype=object)

In [60]:
encoded_claims

Unnamed: 0,Notification_period,Inception_to_loss,Time_hour,Vechile_registration_present,Incident_details_present,Injury_details_present,TP_type_insd_pass_back,TP_type_insd_pass_front,TP_type_driver,TP_type_pass_back,TP_type_pass_front,TP_type_bike,TP_type_cyclist,TP_type_pass_multi,TP_type_pedestrian,TP_type_other,TP_type_nk,TP_injury_whiplash,TP_injury_traumatic,TP_injury_fatality,TP_injury_unclear,TP_injury_nk,TP_region_eastang,TP_region_eastmid,TP_region_london,TP_region_north,TP_region_northw,TP_region_outerldn,TP_region_scotland,TP_region_southe,TP_region_southw,TP_region_wales,TP_region_westmid,TP_region_yorkshire,Incurred,Notifier_NamedDriver,Notifier_Other,Notifier_PH,Notifier_TP,Location_of_incident_Home Address,Location_of_incident_Main Road,Location_of_incident_Minor Road,Location_of_incident_Motorway,Location_of_incident_Not Applicable,Location_of_incident_Other,Location_of_incident_n/k,Weather_conditions_NORMAL,"Weather_conditions_SNOW,ICE,FOG",Weather_conditions_WET,Weather_conditions_nan,Vehicle_mobile_Y,Vehicle_mobile_n/k,Main_driver_Other,Main_driver_Y,PH_considered_TP_at_fault_Y,PH_considered_TP_at_fault_n/k,PH_considered_TP_at_fault_nan
0,22.0,13,10,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0.000000,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,9,18,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,2801.308013,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,5.0,17,16,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1220.870390,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,23,14,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,3529.868026,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,48,9,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,3155.987923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7686,1.0,83,16,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,702.814301,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7687,0.0,25,14,1,1,1,0,0,1,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,42980.618893,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7688,0.0,60,9,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,5174.800838,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
7689,1.0,253,19,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,30072.417085,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [61]:
from sklearn.model_selection import train_test_split

In [62]:
train, temp = train_test_split(claims_data, test_size=0.25, random_state=32, shuffle=True)
val, test = train_test_split(temp, test_size=0.4, random_state=32, shuffle=True)

# Display the results
print("Training data size:", len(train) / (len(train) + len(val) + len(test)) )
print("Validation data size:", len(val) / (len(train) + len(val) + len(test)))
print("Testing data size:", len(test)/ (len(train) + len(val) + len(test)))

Training data size: 0.7499674944740606
Validation data size: 0.14991548563255752
Testing data size: 0.10011701989338187


In [63]:
from collections import namedtuple
DataSet = namedtuple('DataSet', ['features', 'target'])
train_set = DataSet(features=train.drop(columns="Incurred"), target=train["Incurred"])
val_set = DataSet(features=val.drop(columns="Incurred"), target=val["Incurred"])
test_set = DataSet(features=test.drop(columns ="Incurred") , target=test["Incurred"])
train_d_matrix = xgb.DMatrix(train_set.features, label=train_set.target, enable_categorical=True)
val_d_matrix = xgb.DMatrix(val_set.features, label=val_set.target, enable_categorical=True)
test_d_matrix = xgb.DMatrix(test_set.features, label=test_set.target, enable_categorical=True)

In [64]:
from random import randint, uniform
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from flaml import AutoML

automl = AutoML()

automl_settings = {
    "time_budget": 60,  # total running time in seconds
    "metric": 'mae',  # metric to optimize
    "task": 'regression',  # task type
    "n_splits": 5,  # number of splits in time cross-validation
    "sample": True,  # enable sampling
    "estimator_list": ['xgboost'],  # list of ML algorithms to use
    "log_file_name": 'flaml.log',  # log file
    "eval_method": "cv",  # cross-validation
    "max_iter": 200,  # maximum number of iterations
    "early_stop": True,  # enable early stopping
    "n_jobs": 4,  # number of parallel jobs
    "ensemble": True,
    "custom_hp": {
        'xgboost': {
            'n_estimators': {'domain': randint(50, 500), 'init_value': 100},
            'max_depth': {'domain': randint(3, 10), 'init_value': 6},
            'subsample': {'domain': uniform(0.6, 1.0), 'init_value': 0.8},
            'learning_rate': {'domain': uniform(0.1, 0.3), 'init_value': 0.1},
        },
    }
}

automl.fit(X_train=train_set.features, y_train=train_set.target, **automl_settings)
print('Best hyperparameters:', automl.best_config)

best_params = automl.best_config
best_model = xgb.train(best_params, train_d_matrix, num_boost_round=100)

val_predictions = best_model.predict(val_d_matrix)
mae = mean_absolute_error(val_set.target, val_predictions)
print(f'Validation MAE: {mae}')

test_predictions = best_model.predict(test_d_matrix)
mse = mean_absolute_error(test_set.target, test_predictions)
print(f'Test MAE: {mae}')

[flaml.automl.logger: 06-11 22:27:23] {1680} INFO - task = regression
[flaml.automl.logger: 06-11 22:27:23] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 06-11 22:27:23] {1789} INFO - Minimizing error metric: mae
[flaml.automl.logger: 06-11 22:27:23] {1901} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 06-11 22:27:23] {2219} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 06-11 22:27:34] {2345} INFO - Estimated sufficient time budget=105850s. Estimated necessary time budget=106s.
[flaml.automl.logger: 06-11 22:27:34] {2392} INFO -  at 10.6s,	estimator xgboost's best error=12166.7883,	best estimator xgboost's best error=12166.7883
[flaml.automl.logger: 06-11 22:27:34] {2219} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 06-11 22:27:44] {2392} INFO -  at 20.7s,	estimator xgboost's best error=12166.7883,	best estimator xgboost's best error=12166.7883
[flaml.automl.logger: 06-11 22:27:44] {2219} INFO - iteratio

Parameters: { "n_estimators" } are not used.



In [65]:
train_set.target.mean()

9858.126561618863

In [66]:
len(val_set.features)

1153

In [67]:
val_set.features["Avg_prediction"] = train_set.target.mean()

In [68]:
val_set.features

Unnamed: 0,Notifier,Notification_period,Inception_to_loss,Location_of_incident,Weather_conditions,Vehicle_mobile,Time_hour,Main_driver,PH_considered_TP_at_fault,Vechile_registration_present,Incident_details_present,Injury_details_present,TP_type_insd_pass_back,TP_type_insd_pass_front,TP_type_driver,TP_type_pass_back,TP_type_pass_front,TP_type_bike,TP_type_cyclist,TP_type_pass_multi,TP_type_pedestrian,TP_type_other,TP_type_nk,TP_injury_whiplash,TP_injury_traumatic,TP_injury_fatality,TP_injury_unclear,TP_injury_nk,TP_region_eastang,TP_region_eastmid,TP_region_london,TP_region_north,TP_region_northw,TP_region_outerldn,TP_region_scotland,TP_region_southe,TP_region_southw,TP_region_wales,TP_region_westmid,TP_region_yorkshire,Avg_prediction
1063,CNF,2.0,264,Main Road,NORMAL,N,14,Other,n/k,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,9858.126562
6441,Other,0.0,93,Minor Road,WET,N,14,Y,N,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,9858.126562
3066,NamedDriver,1.0,80,Minor Road,NORMAL,N,15,N,Y,1,1,0,0,0,2,1,0,0,0,0,0,0,0,1,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,9858.126562
1239,PH,1.0,203,Main Road,NORMAL,N,16,Other,n/k,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,9858.126562
5909,Other,0.0,85,n/k,NORMAL,Y,13,Y,N,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,9858.126562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1917,PH,0.0,66,Main Road,WET,Y,16,Other,n/k,1,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,9858.126562
2438,PH,0.0,324,Main Road,NORMAL,N,8,Other,n/k,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,9858.126562
3848,PH,0.0,153,Minor Road,WET,N,18,Y,N,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,9858.126562
1983,TP,6.0,190,Main Road,N/K,n/k,0,Other,n/k,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,9858.126562


In [69]:
mean_absolute_error(val_set.target, val_set.features["Avg_prediction"])

# I beleive the MAE of the trained model last night was 6ksh so this is a material improvement

10361.366256205712