In [46]:
import pandas as pd
import numpy as np
import xgboost as xgb

claims_data = pd.read_excel("claims_data.xlsx")

In [47]:
claims_data.loc[claims_data["Notification_period"] < 0, "Notification_period"] = np.nan
claims_data.loc[claims_data["PH_considered_TP_at_fault"] == "#", "PH_considered_TP_at_fault"] = np.nan

In [48]:
claims_data.drop(columns=["Claim Number", "date_of_loss", "Loss_code", "Loss_description", "Capped Incurred"], inplace=True)

In [49]:
for col in claims_data.select_dtypes(include=["object"]).columns:
    claims_data[col] = claims_data[col].astype("category")
   

In [50]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_categorical_columns(df: pd.DataFrame) -> (pd.DataFrame, OneHotEncoder):
    """
    One-hot encodes all categorical columns in the DataFrame using OneHotEncoder.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame containing columns to be encoded.
    
    Returns:
    pd.DataFrame: The DataFrame with categorical columns one-hot encoded.
    """
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    encoded_df = pd.DataFrame(ohe.fit_transform(df[categorical_columns]))
    

    encoded_df.columns = ohe.get_feature_names_out(categorical_columns)
    df = df.drop(columns=categorical_columns).reset_index(drop=True)
    encoded_df = encoded_df.reset_index(drop=True)
    
    return pd.concat([df, encoded_df], axis=1), ohe

encoded_claims, encoder = one_hot_encode_categorical_columns(claims_data)

In [51]:
from sklearn.model_selection import train_test_split

train, temp = train_test_split(claims_data, test_size=0.25, random_state=32, shuffle=True)
val, test = train_test_split(temp, test_size=0.4, random_state=32, shuffle=True)

# Display the results
print("Training data size:", len(train) / (len(train) + len(val) + len(test)) )
print("Validation data size:", len(val) / (len(train) + len(val) + len(test)))
print("Testing data size:", len(test)/ (len(train) + len(val) + len(test)))

Training data size: 0.7499674944740606
Validation data size: 0.14991548563255752
Testing data size: 0.10011701989338187


In [52]:
from collections import namedtuple
DataSet = namedtuple('DataSet', ['features', 'target'])
train_set = DataSet(features=train.drop(columns="Incurred"), target=train["Incurred"])
val_set = DataSet(features=val.drop(columns="Incurred"), target=val["Incurred"])
test_set = DataSet(features=test.drop(columns ="Incurred") , target=test["Incurred"])
train_d_matrix = xgb.DMatrix(train_set.features, label=train_set.target, enable_categorical=True)
val_d_matrix = xgb.DMatrix(val_set.features, label=val_set.target, enable_categorical=True)
test_d_matrix = xgb.DMatrix(test_set.features, label=test_set.target, enable_categorical=True)

In [53]:
from random import randint, uniform
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from flaml import AutoML

automl = AutoML()

automl_settings = {
    "time_budget": 60, 
    "metric": 'mae', 
    "task": 'regression', 
    "n_splits": 5, 
    "sample": True, 
    "estimator_list": ['xgboost'],  
    "log_file_name": 'flaml.log', 
    "eval_method": "cv", 
    "max_iter": 200, 
    "early_stop": True,  
    "n_jobs": 4,
    "ensemble": True,
    "custom_hp": {
        'xgboost': {
            'max_depth': {'domain': randint(3, 10), 'init_value': 6},
            'subsample': {'domain': uniform(0.6, 1.0), 'init_value': 0.8},
            'learning_rate': {'domain': uniform(0.1, 0.3), 'init_value': 0.1},
        },
    }
}

automl.fit(X_train=train_set.features, y_train=train_set.target, **automl_settings)
print('Best hyperparameters:', automl.best_config)

best_params = automl.best_config
best_model = xgb.train(best_params, train_d_matrix, num_boost_round=100)

val_predictions = best_model.predict(val_d_matrix)
mae = mean_absolute_error(val_set.target, val_predictions)
print(f'Validation MAE: {mae}')

test_predictions = best_model.predict(test_d_matrix)
mse = mean_absolute_error(test_set.target, test_predictions)
print(f'Test MAE: {mae}')

[flaml.automl.logger: 06-12 19:36:58] {1680} INFO - task = regression
[flaml.automl.logger: 06-12 19:36:58] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 06-12 19:36:58] {1789} INFO - Minimizing error metric: mae
[flaml.automl.logger: 06-12 19:36:58] {1901} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 06-12 19:36:58] {2219} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 06-12 19:36:58] {2345} INFO - Estimated sufficient time budget=4629s. Estimated necessary time budget=5s.
[flaml.automl.logger: 06-12 19:36:58] {2392} INFO -  at 0.5s,	estimator xgboost's best error=11951.3111,	best estimator xgboost's best error=11951.3111
[flaml.automl.logger: 06-12 19:36:58] {2219} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 06-12 19:36:59] {2392} INFO -  at 0.9s,	estimator xgboost's best error=11951.3111,	best estimator xgboost's best error=11951.3111
[flaml.automl.logger: 06-12 19:36:59] {2219} INFO - iteration 2, c

Parameters: { "n_estimators" } are not used.



In [54]:
train_set.target.mean()

9858.126561618863

In [55]:
mean_array = np.full(len(val_set.target), train_set.target.mean())

In [56]:
mean_absolute_error(val_set.target, mean_array)

10361.366256205712