# Car Insurance Claim Prediction Dataset

The Car Insurance Claim Prediction dataset contains:
- information on policyholders
    - having the attributes like policy tenure, age of the car, age of the car owner, the population density of the city, make and model of the car, power, engine type, etc,...
- and the target variable
    - indicating whether the policyholder files a claim in the next 6 months or not

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import kruskal, chi2_contingency

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report


#from xgboost import XGBClassifier

#from imblearn.over_sampling import SMOTE


In [7]:
df_train_pre = pd.read_csv("train.csv")
df_test_pre = pd.read_csv("test.csv")


In [8]:
df_train_pre.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0


In [9]:
df_test_pre.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating
0,ID58593,0.341732,0.0,0.586538,C3,4076,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0
1,ID58594,0.307241,0.13,0.442308,C8,8794,1,B2,M6,Petrol,...,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2
2,ID58595,0.327924,0.12,0.451923,C8,8794,2,A,M3,Petrol,...,No,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2
3,ID58596,0.782654,0.01,0.461538,C5,34738,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0
4,ID58597,1.233404,0.02,0.634615,C5,34738,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0


## Data Preprocessing

The **bin2num()** function, takes in a categorical variable that is expected to contain binary responses represented as strings 'yes' or 'no' and systematically transforms these string values into numerical values, where 'yes' corresponds to 1 and 'no' corresponds to 0.

In [None]:
def bin2num(var):
    df_train_pre[var]= df_train_pre[var].map({"No": 0, "Yes": 1})
    df_test_pre[var] = df_test_pre[var].map({"No": 0, "Yes": 1})

In [None]:
binary_features = [
    "is_esc",
    "is_adjustable_steering",
    "is_tpms",
    "is_parking_sensors",
    "is_parking_camera",
    "is_front_fog_lights",
    "is_rear_window_wiper",
    "is_rear_window_washer",
    "is_rear_window_defogger",
    "is_brake_assist",
    "is_power_door_locks",
    "is_central_locking",
    "is_power_steering",
    "is_driver_seat_height_adjustable",
    "is_day_night_rear_view_mirror",
    "is_ecw",
    "is_speed_alert"
]

for feature in binary_features:
    bin2num(feature)

In [10]:
df_train_pre.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0


In [11]:
categorical_vars = [
    "area_cluster",
    "make",
    "segment",
    "model",
    "fuel_type",
    "engine_type",
    "airbags",
    "is_esc",
    "is_adjustable_steering",
    "is_tpms",
    "is_parking_sensors",
    "is_parking_camera",
    "rear_brakes_type",
    "cylinder",
    "transmission_type",
    "gear_box",
    "steering_type",
    "is_front_fog_lights",
    "is_rear_window_wiper",
    "is_rear_window_washer",
    "is_rear_window_defogger",
    "is_brake_assist",
    "is_power_door_locks",
    "is_central_locking",
    "is_power_steering",
    "is_driver_seat_height_adjustable",
    "is_day_night_rear_view_mirror",
    "is_ecw",
    "is_speed_alert",
    "ncap_rating"
]

In [13]:
chi2_pvalue= []
for var in categorical_vars:
    contingency_table = pd.crosstab(df_train_pre[var], df_train_pre['is_claim'])
    chi2, p, dof, ex = chi2_contingency(contingency_table)
    chi2_pvalue.append(p)
#chi2_pvalue.sort(key =lambda x:x[1])



In [16]:
print(chi2_pvalue)

ncap_rating [1.412557511595789e-11, 0.3516519554568966, 0.02738747673628079, 0.02949022219663932, 0.056145343695656484, 0.029490222196639326, 0.733046343840305, 0.47973082296004166, 0.0008041954126638532, 0.8804734284234931, 0.046026600078480424, 0.8464941782641769, 0.8804734284234931, 0.0012196212137077615, 0.8913806451098147, 0.8932484235563578, 0.05680882959490323, 0.004438415801344458, 0.5332800461995457, 0.5332800461995457, 0.5426511057939621, 0.008799083467312552, 0.11234220574905036, 0.11234220574905036, 0.6486012327453723, 0.010185701302166342, 0.055307437479256066, 0.11234220574905036, 0.09668666189604051, 0.8325265632423358]


In [17]:
print("Test  Chi-square p-values:")

for var, p in chi2_pvalue:
    print(f"{var}: {p}")

Test  Chi-square p-values:


TypeError: cannot unpack non-iterable numpy.float64 object

According to the $\chi^2$ test results, the **most significant variables for predicting "is_claim"** appear to be:

- area_cluster
- is_adjustable_steering
- cylinder
- is_front_fog_lights
- is_brake_assist
- is_driver_seat_height_adjustable
- max_torque
- max_power
- segment
- model
- engine_type
- is_parking_sensors
- is_day_night_rear_view_mirror
- fuel_type
- steering_type
- is_speed_alert


On the other hand, variables:


- is_power_door_locks
- is_central_locking
- is_ecw
- make
- is_esc
- is_rear_window_wiper
- is_rear_window_washer
- is_rear_window_defogger
- is_power_steering
- airbags
- ncap_rating
- is_parking_camera
- is_tpms
- rear_brakes_type
- transmission_type
- gear_box

have higher p-values, suggesting they may have less influence on the target variable.

PCA

In [20]:
numerical_vars = [
    "policy_tenure",
    "age_of_car",
    "age_of_policyholder",
    "population_density",
    # "max_torque",
    # "max_power",
    "displacement",
    "turning_radius",
    "length",
    "width",
    "height",
    "gross_weight"
]

In [25]:
scaler = StandardScaler()
numerical_data = df_train_pre[numerical_vars]
numerical_data_Standardized = scaler.fit_transform(numerical_data)
numerical_data_Standardized


array([[-0.23028345, -0.34244685,  1.42255728, ..., -1.40276762,
        -0.98384583, -0.94282842],
       [ 0.14818765, -0.87135928, -0.7683621 , ..., -1.40276762,
        -0.98384583, -0.94282842],
       [ 0.55502223, -0.87135928, -0.69011498, ..., -1.40276762,
        -0.98384583, -0.94282842],
       ...,
       [ 1.33048996, -0.34244685, -0.14238513, ..., -1.40276762,
        -0.98384583, -0.94282842],
       [ 1.50925096,  1.24429045,  0.7183332 , ...,  0.55997282,
        -0.29307842, -0.23668445],
       [-1.17545276, -0.87135928, -0.22063226, ...,  1.05065793,
         1.02565937,  1.57575173]])

In [27]:
pca = PCA()
pca.fit(numerical_data_Standardized)
components_df = pd.DataFrame(pca.components_,columns = numerical_vars)
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio

array([0.50740786, 0.11513607, 0.10038292, 0.08790109, 0.07671975,
       0.07299651, 0.0201212 , 0.01326207, 0.00490931, 0.00116322])

In [28]:
for i in range(len(components_df)):
    print(f"Component {i+1}")
    print(f"Explained Variance Ratio: {explained_variance_ratio[i]:.4f}")
    print("Loading Scores:")
    print(components_df.iloc[i].abs().sort_values(ascending=False))
    print("\n")

Component 1
Explained Variance Ratio: 0.5074
Loading Scores:
length                 0.435129
displacement           0.421910
turning_radius         0.409370
width                  0.402134
gross_weight           0.399924
height                 0.290308
age_of_car             0.209497
policy_tenure          0.110178
population_density     0.054969
age_of_policyholder    0.009573
Name: 0, dtype: float64


Component 2
Explained Variance Ratio: 0.1151
Loading Scores:
policy_tenure          0.667182
age_of_policyholder    0.648638
population_density     0.302775
height                 0.123809
age_of_car             0.117204
gross_weight           0.091644
turning_radius         0.046088
length                 0.037501
displacement           0.027328
width                  0.026884
Name: 1, dtype: float64


Component 3
Explained Variance Ratio: 0.1004
Loading Scores:
population_density     0.835321
age_of_policyholder    0.504129
age_of_car             0.173158
turning_radius         0.0683

The first 6 components explain the following percentage of the total variance:

-   Component 1: 50.74% (acc. 50.74%)
-   Component 2: 11.51% (acc. 62.25%)
-   Component 3: 10.04% (acc. 72.29%)
-   Component 4: 8.79% (acc. 81.08%)
-   Component 5: 7.67% (acc. 88.75%)
-   Component 6: 7.30% (acc. 96.05%)
The sum of these percentages is approximately 96.05%.

Principal variables are:

-   length
-   policy_tenure
-   population_density
-   height
-   age_of_car
-   age_of_policyholder
-   displacement
-   width
-   turning_radius

# Analysis of Variance (ANOVA)

In [29]:
continuous_vars =df_train_pre.select_dtypes(include =["float64", "int64"]).columns.tolist()
continuous_pvalues = {}

for var in continuous_vars:
    model = ols(f"{var}~ C(is_claim)", data=df_train_pre).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    print("ANOVA for variable:", var)
    print(anova_table)
    print("\n")
    p_value = anova_table.loc["C(is_claim)","PR(>F)"]
    continuous_pvalues[var] = p_value


ANOVA for variable: policy_tenure
                  sum_sq       df           F        PR(>F)
C(is_claim)    62.320771      1.0  365.593096  3.018116e-81
Residual     9987.535419  58590.0         NaN           NaN


ANOVA for variable: age_of_car
                 sum_sq       df          F        PR(>F)
C(is_claim)    0.149607      1.0  46.537984  9.072911e-12
Residual     188.351229  58590.0        NaN           NaN


ANOVA for variable: age_of_policyholder
                 sum_sq       df          F        PR(>F)
C(is_claim)    0.445348      1.0  29.505774  5.597168e-08
Residual     884.333865  58590.0        NaN           NaN


ANOVA for variable: population_density
                   sum_sq       df          F    PR(>F)
C(is_claim)  5.795189e+09      1.0  18.586944  0.000016
Residual     1.826767e+13  58590.0        NaN       NaN


ANOVA for variable: make
                   sum_sq       df         F    PR(>F)
C(is_claim)      0.015736      1.0  0.012172  0.912149
Residual     7574

Kruskal-Wallis H Test