# First-tier classifier to predict total cost class

In [None]:
import os
import warnings
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import sklearn
import scipy
import scipy.stats as stats

from sklearn import base
from collections import defaultdict
from matplotlib.ticker import FixedLocator, FixedFormatter
from joblib import dump, load
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split, GridSearchCV, \
RandomizedSearchCV, cross_val_score, RepeatedStratifiedKFold, KFold
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, KernelPCA

from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
# Read columns A:AP of worksheet 'Sheet1' and discard every row that
# contains at least one missing value.
df = (
    pd.read_excel("data.xlsx", sheet_name='Sheet1', usecols="A:AP")
    .dropna()
)

In [None]:
from pandas import read_csv
from imblearn.over_sampling import SMOTE
from collections import Counter
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

# Features are every column except the total-loss cost class, which is the
# label predicted by the first-tier classifiers.
X = df.drop(columns=['Total_loss_cost_classification_code'])
y = df['Total_loss_cost_classification_code']

# Integer-encode the categorical feature columns.
encoder = ce.OrdinalEncoder(cols=[
    'Construction_type_classification_code',
    'Accident_type_code',
    'Work_process_classification_code',
    'Injury_area_classification_code',
    'Workers_affiliation',
    'Integrated_occupation_classification_code',
    'Direct_insurance_cost_category_code',
])

# Encode exactly once. A second encoder.transform(X) on the already-encoded
# frame would push the integer codes through the raw-category -> integer
# mapping again and corrupt the feature values.
X = encoder.fit_transform(X)

# Map the class labels to consecutive integers starting at 0.
y = LabelEncoder().fit_transform(y)

In [None]:
# Hold out 20% of the encoded samples for testing; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)  #2021

In [None]:
def model_eval(clf, X_test=X_test, y_test=y_test):
    """Print permutation importances and accuracy diagnostics for a fitted search.

    Parameters
    ----------
    clf : fitted search object
        Must expose ``predict``, ``score``, ``best_score_`` and
        ``best_params_`` (the notebook passes fitted ``GridSearchCV`` objects).
    X_test, y_test : held-out features and labels
        Default to whatever the globals ``X_test`` / ``y_test`` referred to
        when this ``def`` was executed. ``X_test`` must have ``.columns``.

    The training accuracy is computed on the globals ``X_train`` / ``y_train``
    as they are at call time. Nothing is returned; everything is printed.
    """
    y_pred = clf.predict(X_test)

    # Shuffle each feature 30 times and measure the drop in test score.
    importance = permutation_importance(
        clf, X_test, y_test, n_repeats=30, random_state=0
    )
    means = importance.importances_mean
    stds = importance.importances_std

    # Report features from most to least important.
    for col in means.argsort()[::-1]:
        print(f"{X_test.columns[col]:<40} {means[col]:.4f} +/- {stds[col]:.4f}")

    print("Mean cross-validated score of the best_estimator: {0:.3f}".format(clf.best_score_))
    print("Accuracy on train data: {0:.3f}".format(clf.score(X_train, y_train)))
    print("Accuracy on test data: {0:.3f}".format(clf.score(X_test, y_test)))
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("Tuned Model Parameters: {}".format(clf.best_params_))

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
# Test-set accuracy of each tuned model, keyed by a short model name.
result_list = {}

# Hyper-parameter candidates for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10],
    'max_depth': [5, 6, 7, 8, 9],
    'min_samples_leaf': [1, 10],
    'max_leaf_nodes': [10, 20],
}

# Exhaustive search over the grid, selecting by cross-validated accuracy.
dt_gscv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
)
dt_gscv.fit(X_train, y_train)
model_eval(dt_gscv)
result_list['DT'] = dt_gscv.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter candidates for the random forest.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [8, 16, 20],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'max_leaf_nodes': [10, 20],
    'n_estimators': [10, 100, 200],
}

# Exhaustive search over the grid, selecting by cross-validated accuracy.
rt_gscv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
)
rt_gscv.fit(X_train, y_train)
model_eval(rt_gscv)
result_list['RF'] = rt_gscv.score(X_test, y_test)

In [None]:
knn_clf = KNeighborsClassifier()

# Neighbourhood size, vote weighting and distance metric to try.
param_grid = dict(
    n_neighbors=[2, 3, 5, 7, 9, 12, 15],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# No explicit scoring is given, so the search ranks candidates with the
# estimator's own score method.
knn_gscv = GridSearchCV(estimator=knn_clf, param_grid=param_grid)
knn_gscv.fit(X_train, y_train)
model_eval(knn_gscv)
result_list['KNN'] = knn_gscv.score(X_test, y_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Fixed (untuned) booster settings. No 'objective' is passed: the previous
# value 'reg:logistic' is a regression objective, so the classifier is left
# to use its own classification objective for the labels it is given.
params = {
    'max_depth': 4,
    'alpha': 10,
    'learning_rate': 0.01,
    'n_estimators': 100,
}

# instantiate the classifier
xgb_clf = XGBClassifier(**params, enable_categorical=True)

# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

# check accuracy score
print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
print(metrics.classification_report(y_test, y_pred))
print("Accuracy on train data: {0:.3f}".format(xgb_clf.score(X_train, y_train)))
result_list['XGBoost'] = xgb_clf.score(X_test, y_test)

# Permutation importance on the test split: shuffle each feature 30 times
# and measure the drop in score.
r = permutation_importance(xgb_clf, X_test, y_test,
                           n_repeats=30,
                           random_state=0)

# Report features from most to least important.
for i in r.importances_mean.argsort()[::-1]:
    print(f"{X_test.columns[i]:<40} {r.importances_mean[i]:.4f} +/- {r.importances_std[i]:.4f}")

# Random Undersampling of data

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop samples of the majority class only. The sampler is seeded so
# the resampled dataset (and every score derived from it) is reproducible,
# matching the seeded oversampler and splits used elsewhere in this notebook.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=0)

# fit_resample fits the sampler itself, so no separate rus.fit(X, y) call is
# needed beforehand.
# NOTE(review): resampling happens before the train/test split in the next
# cell, so the test set is drawn from the resampled data rather than the
# original class distribution -- confirm this is intended.
X_resampled1, y_resampled1 = rus.fit_resample(X, y)

In [None]:
# 80/20 split of the undersampled data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled1,
    y_resampled1,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import RocCurveDisplay, roc_curve


def model_eval(clf, X_test=X_test, y_test=y_test):
    """Print cross-validation, train and test diagnostics for a fitted search.

    ``clf`` must expose ``predict``, ``score``, ``best_score_`` and
    ``best_params_``. ``X_test`` / ``y_test`` default to the objects the
    globals of the same name referred to when this ``def`` was executed; the
    training accuracy uses the globals ``X_train`` / ``y_train`` at call time.
    Nothing is returned.
    """
    predicted = clf.predict(X_test)

    cv_line = "Mean cross-validated score of the best_estimator: {0:.3f}"
    print(cv_line.format(clf.best_score_))

    train_line = "Accuracy on train data: {0:.3f}"
    print(train_line.format(clf.score(X_train, y_train)))

    test_line = "Accuracy on test data: {0:.3f}"
    print(test_line.format(clf.score(X_test, y_test)))

    # Per-class precision / recall / F1 on the held-out data.
    print(metrics.classification_report(y_test, predicted))
    print("Tuned Model Parameters: {}".format(clf.best_params_))

In [None]:
# Start a fresh score table for the models trained on the undersampled data.
result_list = {}

# Hyper-parameter candidates for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10],
    'max_depth': [5, 6, 7, 8, 9],
    'min_samples_leaf': [1, 10],
    'max_leaf_nodes': [10, 20],
}

dt_gscv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
)
dt_gscv.fit(X_train, y_train)
model_eval(dt_gscv)
result_list['DT'] = dt_gscv.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter candidates for the random forest.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10],
    'max_depth': [5, 10],
    'min_samples_leaf': [1, 10],
    'max_leaf_nodes': [10, 20],
}

rt_gscv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
)
rt_gscv.fit(X_train, y_train)
model_eval(rt_gscv)
result_list['RF'] = rt_gscv.score(X_test, y_test)

In [None]:
knn_clf = KNeighborsClassifier()

# Neighbourhood size, vote weighting and distance metric to try.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 12, 15],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# No explicit scoring: candidates are ranked by the estimator's own score.
knn_gscv = GridSearchCV(estimator=knn_clf, param_grid=param_grid)
knn_gscv.fit(X_train, y_train)
model_eval(knn_gscv)
result_list['KNN'] = knn_gscv.score(X_test, y_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Fixed (untuned) booster settings. No 'objective' is passed: the previous
# value 'reg:logistic' is a regression objective, so the classifier is left
# to use its own classification objective for the labels it is given.
params = {
    'max_depth': 4,
    'alpha': 10,
    'learning_rate': 1.0,
    'n_estimators': 100,
}

# instantiate the classifier
xgb_clf = XGBClassifier(**params, enable_categorical=True)

# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

# check accuracy score
print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
print(metrics.classification_report(y_test, y_pred))
print("Accuracy on train data: {0:.3f}".format(xgb_clf.score(X_train, y_train)))
result_list['XGBoost'] = xgb_clf.score(X_test, y_test)

# Random Oversampling of data

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Hold out the test set BEFORE oversampling. Random oversampling duplicates
# existing rows; oversampling first and splitting afterwards places copies of
# the same row on both sides of the split, so test accuracy is inflated by
# rows the model has already seen during training.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Balance the training portion only; the test set keeps the original,
# un-duplicated samples. fit_resample fits the sampler itself, so no separate
# ros.fit(...) call is needed.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

In [None]:
from sklearn.metrics import RocCurveDisplay, roc_curve


def model_eval(clf, X_test=X_test, y_test=y_test):
    """Print cross-validation, train and test diagnostics for a fitted search.

    ``clf`` must expose ``predict``, ``score``, ``best_score_`` and
    ``best_params_``. ``X_test`` / ``y_test`` default to the objects the
    globals of the same name referred to when this ``def`` was executed; the
    training accuracy uses the globals ``X_train`` / ``y_train`` at call time.
    Nothing is returned.
    """
    predicted = clf.predict(X_test)

    cv_line = "Mean cross-validated score of the best_estimator: {0:.3f}"
    print(cv_line.format(clf.best_score_))

    train_line = "Accuracy on train data: {0:.3f}"
    print(train_line.format(clf.score(X_train, y_train)))

    test_line = "Accuracy on test data: {0:.3f}"
    print(test_line.format(clf.score(X_test, y_test)))

    # Per-class precision / recall / F1 on the held-out data.
    print(metrics.classification_report(y_test, predicted))
    print("Tuned Model Parameters: {}".format(clf.best_params_))

In [None]:
# Start a fresh score table for the models trained on the oversampled data.
result_list = {}

# Hyper-parameter candidates for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10],
    'max_depth': [5, 6, 7, 8, 9],
    'min_samples_leaf': [1, 10],
    'max_leaf_nodes': [10, 20],
}

dt_gscv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
)
dt_gscv.fit(X_train, y_train)
model_eval(dt_gscv)
result_list['DT'] = dt_gscv.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter candidates for the random forest.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10],
    'max_depth': [5, 10],
    'min_samples_leaf': [1, 10],
    'max_leaf_nodes': [10, 20],
}

rt_gscv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
)
rt_gscv.fit(X_train, y_train)
model_eval(rt_gscv)
result_list['RF'] = rt_gscv.score(X_test, y_test)

In [None]:
knn_clf = KNeighborsClassifier()

# Neighbourhood size, vote weighting and distance metric to try.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 12, 15],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# No explicit scoring: candidates are ranked by the estimator's own score.
knn_gscv = GridSearchCV(estimator=knn_clf, param_grid=param_grid)
knn_gscv.fit(X_train, y_train)
model_eval(knn_gscv)
result_list['KNN'] = knn_gscv.score(X_test, y_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Fixed (untuned) booster settings. No 'objective' is passed: the previous
# value 'reg:logistic' is a regression objective, so the classifier is left
# to use its own classification objective for the labels it is given.
params = {
    'max_depth': 4,
    'alpha': 10,
    'learning_rate': 1.0,
    'n_estimators': 100,
}

# instantiate the classifier
xgb_clf = XGBClassifier(**params, enable_categorical=True)

# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

# check accuracy score
print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

# Second-tier regressor to predict indirect cost

In [None]:
from collections import Counter
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

# Inputs and target of the second-tier regressor. The two categorical columns
# are integer-coded with an ordinal encoder (no one-hot expansion is done).
categorical_cols = [
    'Direct_insurance_cost_total_category_code',
    'Total_loss_cost_classification_code',
]
numerical_cols = [
    'Human_damage_death',
    'Human_damage_injuries',
    'Direct_insurance_costs',
]
target_col = 'Indirect_insurance_costs'

# Categorical columns first, numerical columns after.
X = df[categorical_cols + numerical_cols]
y = df[target_col]

encoder = ce.OrdinalEncoder(
    cols=[
        'Total_loss_cost_classification_code',
        'Direct_insurance_cost_total_category_code',
    ]
)
X = encoder.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the regression features and target with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.preprocessing import StandardScaler
ss_X = StandardScaler()
ss_y = StandardScaler()  # created here but not used in this cell

# Learn the scaling statistics from the training split only ...
X_train = ss_X.fit_transform(X_train)
# ... and apply those same statistics to the test split. Calling
# fit_transform here would refit the scaler on the test data, so train and
# test would be scaled differently and test information would leak into the
# preprocessing.
X_test = ss_X.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import QuantileTransformer

regressor = DecisionTreeRegressor(random_state=42)
scaler = StandardScaler()

# The target is quantile-mapped to a normal distribution for fitting, and
# predictions are mapped back by the wrapper.
model = TransformedTargetRegressor(
    regressor=regressor,
    transformer=QuantileTransformer(output_distribution='normal'),
)

# Train on the training split, predict the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the held-out split.
print("R² score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))

# Permutation importance: 30 shuffles per feature on the held-out split.
result = permutation_importance(
    model, X_test, y_test, n_repeats=30, random_state=0
)

# List features from most to least important; names come from X's columns.
for i in np.argsort(result.importances_mean)[::-1]:
    print(f"{X.columns[i]:<30} {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One x position per test-set prediction.
xAxis = np.arange(len(y_pred))

# Global text settings for the figure.
plt.rcParams.update({"font.family": "Times New Roman", 'font.size': 15})

# Plain (non-scientific) tick labels on the y axis.
plt.ticklabel_format(style='plain', axis='y')

# Predictions as a solid red line, actual values as a dotted black line.
plt.plot(xAxis, y_pred, color="red")
plt.plot(xAxis, y_test, color="black", linestyle='dotted')

plt.title("Decision Tree Regressor Result")
plt.xlabel("Test sample")
plt.ylabel("Indirect cost (US$)")
plt.legend(['prediction', 'actual'])
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgbm
import xgboost as xg
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer

# LightGBM regressor with default settings, fitted on a quantile-normalised
# target; the wrapper maps predictions back to the original scale.
regr_trans = TransformedTargetRegressor(
    regressor=lgbm.LGBMRegressor(),
    transformer=QuantileTransformer(output_distribution='normal'),
)
regr_trans.fit(X_train, y_train)
yhat = regr_trans.predict(X_test)

# R^2 (3 decimals), MAE and RMSE (2 decimals) on the held-out split.
print(
    round(r2_score(y_test, yhat), 3),
    round(mean_absolute_error(y_test, yhat), 2),
    round(np.sqrt(mean_squared_error(y_test, yhat)), 2),
)

# Permutation importance: 30 shuffles per feature on the held-out split.
result = permutation_importance(
    regr_trans, X_test, y_test, n_repeats=30, random_state=0
)

# List features from most to least important; names come from X's columns.
for i in np.argsort(result.importances_mean)[::-1]:
    print(f"{X.columns[i]:<30} {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One x position per test-set prediction.
xAxis = np.arange(len(yhat))

# Global text settings for the figure.
plt.rcParams.update({"font.family": "Times New Roman", 'font.size': 15})

# Plain (non-scientific) tick labels on the y axis.
plt.ticklabel_format(style='plain', axis='y')

# Predictions as a solid red line, actual values as a dotted black line.
plt.plot(xAxis, yhat, color="red")
plt.plot(xAxis, y_test, color="black", linestyle='dotted')

plt.title("LGBM Regressor Result")
plt.xlabel("Test sample")
plt.ylabel("Indirect cost (US$)")
plt.legend(['prediction', 'actual'])
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgbm
import xgboost as xg
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Gradient-boosting regressor with default settings, fitted on a
# quantile-normalised target; the wrapper maps predictions back to the
# original scale.
regr_trans = TransformedTargetRegressor(
    regressor=GradientBoostingRegressor(),
    transformer=QuantileTransformer(output_distribution='normal'),
)
regr_trans.fit(X_train, y_train)
yhat = regr_trans.predict(X_test)

# R^2 (3 decimals), MAE and RMSE (2 decimals) on the held-out split.
print(
    round(r2_score(y_test, yhat), 3),
    round(mean_absolute_error(y_test, yhat), 2),
    round(np.sqrt(mean_squared_error(y_test, yhat)), 2),
)

# Permutation importance: 30 shuffles per feature on the held-out split.
result = permutation_importance(
    regr_trans, X_test, y_test, n_repeats=30, random_state=0
)

# List features from most to least important; names come from X's columns.
for i in np.argsort(result.importances_mean)[::-1]:
    print(f"{X.columns[i]:<30} {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One x position per test-set prediction.
xAxis = np.arange(len(yhat))

# Global text settings for the figure.
plt.rcParams.update({"font.family": "Times New Roman", 'font.size': 15})

# Plain (non-scientific) tick labels on the y axis.
plt.ticklabel_format(style='plain', axis='y')

# Predictions as a solid red line, actual values as a dotted black line.
plt.plot(xAxis, yhat, color="red")
plt.plot(xAxis, y_test, color="black", linestyle='dotted')

plt.title("GB Regressor Result")
plt.xlabel("Test sample")
plt.ylabel("Indirect cost (US$)")
plt.legend(['prediction', 'actual'])
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer

# Random forest with default hyper-parameters (no grid is searched here).
# The forest is seeded so the fitted model, its metrics and the importances
# below are reproducible between runs, like the other seeded steps in this
# notebook.
model = RandomForestRegressor(random_state=42)

# The target is quantile-mapped to a normal distribution for fitting;
# predictions are mapped back to the original scale by the wrapper.
ttr = TransformedTargetRegressor(regressor=model, transformer=QuantileTransformer(output_distribution='normal'))
ttr.fit(X_train, y_train)
yhat = ttr.predict(X_test)

# R^2, MAE and RMSE on the held-out split.
print(r2_score(y_test, yhat), mean_absolute_error(y_test, yhat), np.sqrt(mean_squared_error(y_test, yhat)))

# Permutation importance: 30 shuffles per feature on the held-out split.
result = permutation_importance(ttr, X_test, y_test, n_repeats=30, random_state=0)

# Display importances, most important feature first; names come from X.
for i in result.importances_mean.argsort()[::-1]:
    print(f"{X.columns[i]:<30} {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One x position per test-set prediction.
xAxis = np.arange(len(yhat))

# Global text settings for the figure.
plt.rcParams.update({"font.family": "Times New Roman", 'font.size': 15})

# Plain (non-scientific) tick labels on the y axis.
plt.ticklabel_format(style='plain', axis='y')

# Predictions as a solid red line, actual values as a dotted black line.
plt.plot(xAxis, yhat, color="red")
plt.plot(xAxis, y_test, color="black", linestyle='dotted')

plt.title("RF Regressor Result")
plt.xlabel("Test sample")
plt.ylabel("Indirect cost (US$)")
plt.legend(['prediction', 'actual'])
plt.show()