In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb

claims_data = pd.read_excel("claims_data.xlsx")

In [4]:
claims_data.loc[claims_data["Notification_period"] < 0, "Notification_period"] = np.nan
claims_data.loc[claims_data["PH_considered_TP_at_fault"] == "#", "PH_considered_TP_at_fault"] = np.nan

In [5]:
claims_data.drop(columns=["Claim Number", "date_of_loss", "Loss_code", "Loss_description", "Capped Incurred"], inplace=True)

In [6]:
#change the columns with object datatype to category
for col in claims_data.select_dtypes(include=["object"]).columns:
    claims_data[col] = claims_data[col].astype("category")
   

In [24]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_categorical_columns(df: pd.DataFrame) -> (pd.DataFrame, OneHotEncoder):
    """
    One-hot encodes all categorical columns in the DataFrame using OneHotEncoder.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame containing columns to be encoded.
    
    Returns:
    pd.DataFrame: The DataFrame with categorical columns one-hot encoded.
    """
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    encoded_df = pd.DataFrame(ohe.fit_transform(df[categorical_columns]))
    
    # Name the columns appropriately
    encoded_df.columns = ohe.get_feature_names_out(categorical_columns)
    df = df.drop(columns=categorical_columns).reset_index(drop=True)
    encoded_df = encoded_df.reset_index(drop=True)
    
    return pd.concat([df, encoded_df], axis=1), ohe

encoded_claims, encoder = one_hot_encode_categorical_columns(claims_data)

In [28]:
encoder.get_feature_names_out()

array(['Notifier_NamedDriver', 'Notifier_Other', 'Notifier_PH',
       'Notifier_TP', 'Location_of_incident_Home Address',
       'Location_of_incident_Main Road',
       'Location_of_incident_Minor Road', 'Location_of_incident_Motorway',
       'Location_of_incident_Not Applicable',
       'Location_of_incident_Other', 'Location_of_incident_n/k',
       'Weather_conditions_NORMAL', 'Weather_conditions_SNOW,ICE,FOG',
       'Weather_conditions_WET', 'Weather_conditions_nan',
       'Vehicle_mobile_Y', 'Vehicle_mobile_n/k', 'Main_driver_Other',
       'Main_driver_Y', 'PH_considered_TP_at_fault_Y',
       'PH_considered_TP_at_fault_n/k', 'PH_considered_TP_at_fault_nan'],
      dtype=object)

In [23]:
encoded_claims

Unnamed: 0,Notification_period,Inception_to_loss,Time_hour,Vechile_registration_present,Incident_details_present,Injury_details_present,TP_type_insd_pass_back,TP_type_insd_pass_front,TP_type_driver,TP_type_pass_back,...,"Weather_conditions_SNOW,ICE,FOG",Weather_conditions_WET,Weather_conditions_nan,Vehicle_mobile_Y,Vehicle_mobile_n/k,Main_driver_Other,Main_driver_Y,PH_considered_TP_at_fault_Y,PH_considered_TP_at_fault_n/k,PH_considered_TP_at_fault_nan
0,22.0,13,10,1,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,9,18,1,1,0,0,0,0,0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,5.0,17,16,1,0,0,0,0,0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,23,14,1,1,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,48,9,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7686,1.0,83,16,1,1,1,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7687,0.0,25,14,1,1,1,0,0,1,0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7688,0.0,60,9,1,1,0,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
7689,1.0,253,19,1,1,1,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train, temp = train_test_split(claims_data, test_size=0.25, random_state=32, shuffle=True)
val, test = train_test_split(temp, test_size=0.4, random_state=32, shuffle=True)

# Display the results
print("Training data size:", len(train) / (len(train) + len(val) + len(test)) )
print("Validation data size:", len(val) / (len(train) + len(val) + len(test)))
print("Testing data size:", len(test)/ (len(train) + len(val) + len(test)))

Training data size: 0.7499674944740606
Validation data size: 0.14991548563255752
Testing data size: 0.10011701989338187


In [9]:
from collections import namedtuple
DataSet = namedtuple('DataSet', ['features', 'target'])
train_set = DataSet(features=train.drop(columns="Incurred"), target=train["Incurred"])
val_set = DataSet(features=val.drop(columns="Incurred"), target=val["Incurred"])
test_set = DataSet(features=test.drop(columns ="Incurred") , target=test["Incurred"])
train_d_matrix = xgb.DMatrix(train_set.features, label=train_set.target, enable_categorical=True)
val_d_matrix = xgb.DMatrix(val_set.features, label=val_set.target, enable_categorical=True)
test_d_matrix = xgb.DMatrix(test_set.features, label=test_set.target, enable_categorical=True)

In [54]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from flaml import AutoML

automl = AutoML()

automl_settings = {
    "time_budget": 3600,  # total running time in seconds
    "metric": 'mae',  # metric to optimize
    "task": 'regression',  # task type
    "n_splits": 5,  # number of splits in time cross-validation
    "sample": True,  # enable sampling
    "estimator_list": ['xgboost',"catboost","xgb_limitdepth","histgb"],  # list of ML algorithms to use
    "log_file_name": 'flaml.log',  # log file
    "eval_method": "cv",  # cross-validation
    "max_iter": 200,  # maximum number of iterations
    "early_stop": True,  # enable early stopping
    "n_jobs": 4,  # number of parallel jobs
    "ensemble": True,  # use ensemble methods
}

automl.fit(X_train=train_set.features, y_train=train_set.target, **automl_settings)
print('Best hyperparameters:', automl.best_config)

best_params = automl.best_config
best_model = xgb.train(best_params, train_d_matrix, num_boost_round=100)

val_predictions = best_model.predict(val_d_matrix)
mae = mean_absolute_error(val_set.target, val_predictions)
print(f'Validation MAE: {mae}')

test_predictions = best_model.predict(test_d_matrix)
mse = mean_squared_error(test_set.target, test_predictions)
print(f'Test MAE: {mae}')

[flaml.automl.logger: 06-09 22:09:00] {1680} INFO - task = regression
[flaml.automl.logger: 06-09 22:09:00] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 06-09 22:09:00] {1789} INFO - Minimizing error metric: mae
[flaml.automl.logger: 06-09 22:09:00] {1901} INFO - List of ML learners in AutoML Run: ['xgboost', 'catboost', 'xgb_limitdepth', 'histgb']
[flaml.automl.logger: 06-09 22:09:00] {2219} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 06-09 22:09:01] {2345} INFO - Estimated sufficient time budget=4039s. Estimated necessary time budget=21s.
[flaml.automl.logger: 06-09 22:09:01] {2392} INFO -  at 0.4s,	estimator xgboost's best error=12516.3445,	best estimator xgboost's best error=12516.3445
[flaml.automl.logger: 06-09 22:09:01] {2219} INFO - iteration 1, current learner histgb
[flaml.automl.logger: 06-09 22:09:01] {2392} INFO -  at 0.6s,	estimator histgb's best error=12152.4122,	best estimator histgb's best error=12152.4122
[flaml.automl.logger: 06-09 

Parameters: { "n_estimators" } are not used.

