## **<span style="color:orange;">Import Libraries</span>**

In [234]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from feature_engine.transformation import LogTransformer, BoxCoxTransformer, YeoJohnsonTransformer
import math
import sklearn
import warnings
from feature_engine.selection import DropConstantFeatures
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import PowerTransformer

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

from sklearn.metrics import (
    mean_pinball_loss,
    d2_pinball_score
)

In [235]:
pd.set_option('display.max_columns', None)
sklearn.set_config(transform_output='pandas')
warnings.filterwarnings("ignore")

## **<span style="color:orange;">Reading Data</span>**

In [236]:
# Function to read data from source
def get_data():
    BASE_DIR = Path.cwd().parents[0]
    file_name = 'claims-severity-anonymized-raw.csv'
    return pd.read_csv(BASE_DIR / 'data' / 'allstate-claims-severity' / file_name, on_bad_lines="skip")

In [237]:
# Reading data from source
df = get_data()

In [238]:
# Applying Data Cleaning Steps
df = df.dropna(axis=0, ignore_index=True).drop('id', axis=1)

In [239]:
# Preview of data
df.head(5)

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat32,cat33,cat34,cat35,cat36,cat37,cat38,cat39,cat40,cat41,cat42,cat43,cat44,cat45,cat46,cat47,cat48,cat49,cat50,cat51,cat52,cat53,cat54,cat55,cat56,cat57,cat58,cat59,cat60,cat61,cat62,cat63,cat64,cat65,cat66,cat67,cat68,cat69,cat70,cat71,cat72,cat73,cat74,cat75,cat76,cat77,cat78,cat79,cat80,cat81,cat82,cat83,cat84,cat85,cat86,cat87,cat88,cat89,cat90,cat91,cat92,cat93,cat94,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,A,B,A,B,A,A,A,A,B,A,B,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,D,B,B,D,D,B,D,C,B,D,B,A,A,A,A,A,D,B,C,E,A,C,T,B,G,A,A,I,E,G,J,G,BU,BC,C,AS,S,A,O,LB,0.7263,0.245921,0.187583,0.789639,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,A,B,A,A,A,A,A,A,B,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,D,B,B,D,D,A,B,C,B,D,B,A,A,A,A,A,D,D,C,E,E,D,T,L,F,A,A,E,E,I,K,K,BI,CQ,A,AV,BM,A,O,DP,0.330514,0.737068,0.592681,0.614134,0.885834,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,A,B,A,A,B,A,A,A,B,B,B,B,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,D,B,B,B,D,B,D,C,B,B,B,A,A,A,A,A,D,D,C,E,E,A,D,L,O,A,B,E,F,H,F,A,AB,DK,A,C,AF,A,I,GK,0.261841,0.358319,0.484196,0.236924,0.397069,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,B,B,A,B,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,D,B,B,D,D,D,B,C,B,D,B,A,A,A,A,A,D,D,C,E,E,D,T,I,D,A,A,E,E,I,K,K,BI,CS,C,N,AE,A,O,DJ,0.321594,0.555782,0.527991,0.373816,0.422268,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,A,B,A,B,A,A,A,A,B,B,A,B,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,D,B,D,B,D,B,B,C,B,B,C,A,A,A,B,H,D,B,D,E,E,A,P,F,J,A,A,D,E,K,G,B,H,C,C,Y,BM,A,K,CK,0.273204,0.15999,0.527991,0.473202,0.704268,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [240]:
# List of Categorical Features
cat_cols = df.select_dtypes(include='O').columns.to_list()

# List of Numerical Features
num_cols = df.select_dtypes(include=float).columns.to_list()
num_cols.remove('loss')

print(cat_cols)
print("="*100)
print(num_cols)

['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat99', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat110', 'cat111

In [241]:
for col in cat_cols:
    df[col] = df[col].astype('category')

In [242]:
df[num_cols].head(5)

Unnamed: 0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,0.7263,0.245921,0.187583,0.789639,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843
1,0.330514,0.737068,0.592681,0.614134,0.885834,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496
2,0.261841,0.358319,0.484196,0.236924,0.397069,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425
3,0.321594,0.555782,0.527991,0.373816,0.422268,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642
4,0.273204,0.15999,0.527991,0.473202,0.704268,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606


##### <span style="color:orange;">Train-Test Split</span>

In [243]:
# Independent varaibles
X = df.drop(
    'loss',
    axis=1
)

# Dependent varaibles
y = df.loss

# Train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [244]:
constant_feature = DropConstantFeatures(
    tol=0.99,
    missing_values='raise'
)

In [245]:
constant_feature.fit_transform(X_train)

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat16,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat36,cat37,cat38,cat39,cat40,cat41,cat43,cat44,cat45,cat49,cat50,cat52,cat53,cat54,cat57,cat65,cat66,cat71,cat72,cat73,cat74,cat75,cat76,cat79,cat80,cat81,cat82,cat83,cat84,cat85,cat86,cat87,cat88,cat89,cat90,cat91,cat92,cat93,cat94,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
57590,A,B,A,A,B,A,A,A,B,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,B,D,B,A,D,B,D,B,D,A,A,B,H,D,D,C,E,A,C,T,F,F,A,B,I,E,F,H,G,BU,DW,A,J,S,A,P,LB,0.894333,0.299102,0.187583,0.789639,0.594196,0.838765,0.557638,0.68823,0.91644,0.83510,0.569745,0.594646,0.861364,0.383475
81907,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,B,A,A,A,A,A,A,B,A,A,A,A,B,D,D,A,A,C,B,C,B,A,A,A,A,A,C,C,E,E,A,E,R,J,A,A,A,G,E,H,I,F,M,AM,A,AK,L,J,P,HJ,0.641430,0.299102,0.230975,0.652072,0.332785,0.539667,0.479658,0.74629,0.66201,0.46672,0.396226,0.387819,0.666708,0.474181
54326,A,B,A,A,A,A,A,A,B,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,D,D,B,B,C,B,D,B,A,A,A,A,A,D,C,C,E,C,D,T,N,F,A,A,F,F,H,J,K,BI,BP,A,E,BM,A,O,GS,0.407260,0.245921,0.613660,0.383428,0.805895,0.503587,0.487817,0.70342,0.39447,0.46119,0.430255,0.519456,0.617748,0.820661
178474,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,B,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,B,D,B,B,C,B,D,B,A,A,A,A,A,D,D,C,E,C,D,T,G,A,A,A,G,F,F,K,E,BI,CL,G,E,AT,A,O,HN,0.461344,0.737068,0.692825,0.473202,0.525831,0.748907,0.881095,0.62918,0.55648,0.69471,0.705501,0.692256,0.758902,0.801668
166057,A,B,A,A,A,A,A,B,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,B,A,A,D,B,D,B,A,C,B,B,D,A,A,A,B,H,C,C,E,E,E,A,S,F,C,C,A,D,E,I,I,I,AB,CL,A,AS,L,A,N,DC,0.299256,0.488789,0.440642,0.844287,0.388783,0.344288,0.293621,0.95332,0.32865,0.26894,0.254180,0.250169,0.453468,0.724998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53224,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,B,D,D,B,B,C,B,C,B,A,A,A,A,A,D,D,E,E,G,E,R,I,A,A,A,G,F,G,I,F,BI,BS,E,E,BM,A,M,HK,0.632041,0.785784,0.692825,0.564742,0.924177,0.662161,0.470084,0.61229,0.64873,0.77634,0.698978,0.692256,0.689974,0.810871
134077,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,B,B,B,A,A,A,A,A,A,A,A,A,B,A,A,A,A,D,B,D,B,B,C,B,D,B,A,A,A,A,A,D,D,C,D,G,D,T,G,A,A,E,K,F,F,I,K,BI,CL,A,J,L,A,P,LN,0.838708,0.245921,0.230975,0.761209,0.281143,0.806033,0.722692,0.24564,0.81945,0.80218,0.757468,0.744640,0.848129,0.389203
149021,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,B,D,D,B,B,C,B,D,B,A,A,A,B,H,D,D,C,E,C,D,T,F,A,A,A,G,G,G,H,K,BI,CQ,A,U,BM,E,L,HK,0.472892,0.681761,0.692825,0.564742,0.594196,0.570733,0.547756,0.80438,0.44352,0.63026,0.705501,0.692256,0.516660,0.439768
20191,A,B,A,A,B,A,A,B,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,B,D,D,D,B,C,B,B,B,A,A,A,B,H,D,D,C,G,E,A,N,F,D,B,B,D,D,I,H,B,BI,CO,A,C,BM,A,L,EL,0.292010,0.737068,0.634224,0.623770,0.281143,0.314937,0.413673,0.67263,0.38249,0.29595,0.228492,0.225288,0.363547,0.771740


In [246]:
print(constant_feature.features_to_drop_)
print(f'Number of constant features: {len(constant_feature.features_to_drop_)}')

['cat15', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat32', 'cat33', 'cat34', 'cat35', 'cat42', 'cat46', 'cat47', 'cat48', 'cat51', 'cat55', 'cat56', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat67', 'cat68', 'cat69', 'cat70', 'cat77', 'cat78']
Number of constant features: 31


In [247]:
feature_selection = Pipeline(
    steps=[
        ('constant_features', DropConstantFeatures(tol=0.99))
    ]
)

In [248]:
X_train = feature_selection.fit_transform(X_train, y_train)
X_test = feature_selection.transform(X_test)

##### <span style="color:orange;">Catboost</span>

In [249]:
cat_cols = X_train.select_dtypes(include='category').columns.to_list()

In [250]:
q = 0.90  # quantile

cat_model  = CatBoostRegressor(
    loss_function=f"Quantile:alpha={q}",
    iterations=200,
    learning_rate=0.05,
    random_seed=42,
    max_depth=5,
    eval_metric=f"Quantile:alpha={q}",
)

cat_model .fit(
    X_train,
    y_train,
    cat_features=cat_cols,
    eval_set=(X_test, y_test),
    use_best_model=True
)

0:	learn: 645.7316567	test: 633.6348074	best: 633.6348074 (0)	total: 445ms	remaining: 1m 28s
1:	learn: 626.2998855	test: 614.8791650	best: 614.8791650 (1)	total: 931ms	remaining: 1m 32s
2:	learn: 609.0507330	test: 598.5077486	best: 598.5077486 (2)	total: 1.35s	remaining: 1m 28s
3:	learn: 593.2542851	test: 583.7758795	best: 583.7758795 (3)	total: 1.82s	remaining: 1m 29s
4:	learn: 579.4613449	test: 570.5427931	best: 570.5427931 (4)	total: 2.28s	remaining: 1m 29s
5:	learn: 566.4492091	test: 558.1843996	best: 558.1843996 (5)	total: 2.71s	remaining: 1m 27s
6:	learn: 554.2497294	test: 546.6325618	best: 546.6325618 (6)	total: 3.14s	remaining: 1m 26s
7:	learn: 543.6207511	test: 536.7099928	best: 536.7099928 (7)	total: 3.57s	remaining: 1m 25s
8:	learn: 534.5030029	test: 528.0485504	best: 528.0485504 (8)	total: 3.99s	remaining: 1m 24s
9:	learn: 525.6234152	test: 519.7223390	best: 519.7223390 (9)	total: 4.39s	remaining: 1m 23s
10:	learn: 517.2194221	test: 511.5815774	best: 511.5815774 (10)	total:

<catboost.core.CatBoostRegressor at 0x279cd521d50>

In [251]:
def get_metrics(model, X_test, y_test, quantile: float = 0.9):
        # Predictions
        y_pred = model.predict(X_test)

        # Mean pinball loss
        pinball = mean_pinball_loss(
            y_test,
            y_pred,
            alpha=quantile
        )

        # DÂ² pinball score
        d2 = d2_pinball_score(
            y_test,
            y_pred,
            alpha=quantile
        )

        # Coverage
        coverage = (y_test <= y_pred).mean()

        metrics = {
            "pinball_loss": pinball,
            "d2_pinball": d2,
            "coverage": coverage
        }

        return metrics

In [252]:
get_metrics(cat_model , X_test, y_test, quantile=0.9)

{'pinball_loss': 352.39968608735677,
 'd2_pinball': 0.46125067449162815,
 'coverage': np.float64(0.8993999575191164)}

##### <span style="color:orange;">LightGBM</span>

In [253]:
lgb_model  = LGBMRegressor(
    objective="quantile",
    alpha=0.9,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

lgb_model .fit(
    X_train,
    y_train,
    categorical_feature=cat_cols
)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,5
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,'quantile'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [254]:
get_metrics(lgb_model , X_test, y_test, quantile=0.9)

{'pinball_loss': 344.6307400736178,
 'd2_pinball': 0.47312785426804616,
 'coverage': np.float64(0.8893903993203058)}