In [2]:
from IPython.display import Image
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, roc_curve, auc, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [30]:
df_train = pd.read_csv("train (version 2).csv")
df_test = pd.read_csv("test.csv")

df_train.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,0,0,0,1,0,0,0,1,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,0,0,0,1,0,0,0,1,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,0,0,0,1,0,0,0,1,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,1,1,1,1,1,1,1,1,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,0,1,1,1,0,1,1,1,2,0


In [None]:
column_labels = ["policy_id", "policy_tenure", "age_of_car", "age_of_policyholder", "area_cluster", "population_density",
                 "make", "segment", "model", "fuel_type", "max_torque", "max_power", "engine_type", "airbags", "is_esc",
                "is_adjustable_steering", "is_tpms", "is_parking_sensors", "is_parking_camera", "rear_brakes_type", "displacement",
                "cylinder", "transmission_type", "gear_box", "steering_type", "turning_radius", "length", "width", "height",
                "gross_weight", "is_front_fog_lights", "is_rear_window_wiper", "is_rear_window_washer", "is_rear_window_defogger",
                "is_brake_assist", "is_power_door_locks", "is_central_locking", "is_power_steering", "is_driver_seat_height_adjustable",
                "is_day_night_rear_view_mirror", "is_ecw", "is_speed_alert", "ncap_rating", "is_claim"]

In [8]:
df_train.corr(method='pearson')['is_claim'].sort_values(ascending=False)

is_claim                            1.000000
policy_tenure                       0.078747
age_of_policyholder                 0.022435
is_adjustable_steering              0.013917
cylinder                            0.013434
is_front_fog_lights                 0.011825
is_brake_assist                     0.010893
is_driver_seat_height_adjustable    0.010686
width                               0.009947
is_parking_sensors                  0.008419
is_day_night_rear_view_mirror       0.007989
max_power (bhp)                     0.007698
displacement                        0.007678
is_ecw                              0.006637
is_central_locking                  0.006637
is_power_door_locks                 0.006637
length                              0.006495
rpm_max_torque                      0.004781
max_torque (Nm)                     0.004294
gross_weight                        0.003894
ncap_rating                         0.003800
is_esc                              0.002995
airbags   

In [9]:
df_train_d = df_train.drop(['is_claim'], axis=1)

t_train = df_train[['is_claim']].copy()

df_train.head(), t_train.head()

(  policy_id  policy_tenure  age_of_car  age_of_policyholder area_cluster  \
 0   ID00001       0.515874        0.05             0.644231           C1   
 1   ID00002       0.672619        0.02             0.375000           C2   
 2   ID00003       0.841110        0.02             0.384615           C3   
 3   ID00004       0.900277        0.11             0.432692           C4   
 4   ID00005       0.596403        0.11             0.634615           C5   
 
    population_density  make segment model fuel_type  ...  is_brake_assist  \
 0                4990     1       A    M1       CNG  ...                0   
 1               27003     1       A    M1       CNG  ...                0   
 2                4076     1       A    M1       CNG  ...                0   
 3               21622     1      C1    M2    Petrol  ...                1   
 4               34738     2       A    M3    Petrol  ...                0   
 
    is_power_door_locks  is_central_locking  is_power_steering  \


In [13]:
int_encoder = OrdinalEncoder() #plan to encode categorical data with integer encoding

#each of these attributes will have a numerical representation for each different label (e.g. yes = 0, no = 1)
cat_attribs = ['area_cluster', 'segment', 'model', 'fuel_type', 'engine_type', 'transmission_type',
              'rear_brakes_type', 'steering_type', 'is_speed_alert']

num_attribs = ['policy_tenure', 'age_of_car', 'age_of_policyholder', 'population_density', 'make',
               'max_torque (Nm)', 'rpm_max_torque', 'rpm_max_power', 'max_power (bhp)', 'airbags',
              'displacement', 'cylinder', 'gear_box', 'turning_radius', 'length', 'width', 'height', 'gross_weight',
              'is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors', 'is_parking_camera',
              'is_front_fog_lights', 'is_rear_window_wiper', 'is_rear_window_washer', 'is_rear_window_defogger',
              'is_brake_assist', 'is_power_door_locks', 'is_central_locking', 'is_power_steering', 'is_driver_seat_height_adjustable',
              'is_day_night_rear_view_mirror', 'is_ecw', 'ncap_rating']

data_cats = df_train_d[cat_attribs]

num_pipeline = Pipeline([('mm_scaler', MinMaxScaler())])

full_pipeline = ColumnTransformer([('num', num_pipeline, num_attribs),
                                   ('cat', int_encoder, cat_attribs)])

train_set = full_pipeline.fit_transform(df_train_d)

data_int = int_encoder.fit_transform(data_cats)

int_encoder.categories_

[array(['C1', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',
        'C18', 'C19', 'C2', 'C20', 'C21', 'C22', 'C3', 'C4', 'C5', 'C6',
        'C7', 'C8', 'C9'], dtype=object),
 array(['A', 'B1', 'B2', 'C1', 'C2', 'Utility'], dtype=object),
 array(['M1', 'M10', 'M11', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'],
       dtype=object),
 array(['CNG', 'Diesel', 'Petrol'], dtype=object),
 array(['1.0 SCe', '1.2 L K Series Engine', '1.2 L K12N Dualjet',
        '1.5 L U2 CRDi', '1.5 Turbocharged Revotorq',
        '1.5 Turbocharged Revotron', 'F8D Petrol Engine', 'G12B',
        'K Series Dual jet', 'K10C', 'i-DTEC'], dtype=object),
 array(['Automatic', 'Manual'], dtype=object),
 array(['Disc', 'Drum'], dtype=object),
 array(['Electric', 'Manual', 'Power'], dtype=object),
 array(['1', 'No'], dtype=object)]

In [14]:
train_final = pd.DataFrame(train_set, columns=np.hstack(((num_attribs), (cat_attribs))),
                          index = df_train_d.index)
                           
train_final

Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,population_density,make,max_torque (Nm),rpm_max_torque,rpm_max_power,max_power (bhp),airbags,...,ncap_rating,area_cluster,segment,model,fuel_type,engine_type,transmission_type,rear_brakes_type,steering_type,is_speed_alert
0,0.368130,0.05,0.500000,0.064260,0.00,0.000000,0.660377,1.000000,0.000000,0.2,...,0.0,0.0,0.0,0.0,0.0,6.0,1.0,1.0,2.0,0.0
1,0.480580,0.02,0.121622,0.365231,0.00,0.000000,0.660377,1.000000,0.000000,0.2,...,0.0,11.0,0.0,0.0,0.0,6.0,1.0,1.0,2.0,0.0
2,0.601457,0.02,0.135135,0.051764,0.00,0.000000,0.660377,1.000000,0.000000,0.2,...,0.0,15.0,0.0,0.0,0.0,6.0,1.0,1.0,2.0,0.0
3,0.643904,0.11,0.202703,0.291660,0.00,0.278947,1.000000,1.000000,0.617179,0.2,...,0.4,16.0,3.0,3.0,2.0,2.0,0.0,1.0,0.0,0.0
4,0.425902,0.11,0.486486,0.470987,0.25,0.163158,0.943396,0.791667,0.342308,0.2,...,0.4,17.0,0.0,4.0,2.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58587,0.252782,0.13,0.500000,0.116270,0.25,0.163158,0.943396,0.791667,0.342308,0.2,...,0.4,20.0,0.0,4.0,2.0,0.0,0.0,1.0,0.0,0.0
58588,0.858671,0.02,0.324324,0.102516,0.00,0.000000,0.660377,1.000000,0.000000,0.2,...,0.0,5.0,0.0,0.0,0.0,6.0,1.0,1.0,2.0,0.0
58589,0.831862,0.05,0.229730,0.470987,0.00,0.000000,0.660377,1.000000,0.000000,0.2,...,0.0,17.0,0.0,0.0,0.0,6.0,1.0,1.0,2.0,0.0
58590,0.884975,0.14,0.378378,0.116270,0.00,0.278947,1.000000,1.000000,0.617179,0.2,...,0.4,20.0,2.0,7.0,2.0,8.0,1.0,1.0,0.0,0.0


In [21]:
df_test.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating
0,ID58593,0.341732,0.0,0.586538,C3,4076,1,A,M1,CNG,...,0,0,0,0,1,0,0,0,1,0
1,ID58594,0.307241,0.13,0.442308,C8,8794,1,B2,M6,Petrol,...,0,1,1,1,1,1,1,1,1,2
2,ID58595,0.327924,0.12,0.451923,C8,8794,2,A,M3,Petrol,...,0,0,1,1,1,0,1,1,1,2
3,ID58596,0.782654,0.01,0.461538,C5,34738,1,A,M1,CNG,...,0,0,0,0,1,0,0,0,1,0
4,ID58597,1.233404,0.02,0.634615,C5,34738,1,A,M1,CNG,...,0,0,0,0,1,0,0,0,1,0


In [22]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39063 entries, 0 to 39062
Data columns (total 45 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   policy_id                         39063 non-null  object 
 1   policy_tenure                     39063 non-null  float64
 2   age_of_car                        39063 non-null  float64
 3   age_of_policyholder               39063 non-null  float64
 4   area_cluster                      39063 non-null  object 
 5   population_density                39063 non-null  int64  
 6   make                              39063 non-null  int64  
 7   segment                           39063 non-null  object 
 8   model                             39063 non-null  object 
 9   fuel_type                         39063 non-null  object 
 10  max_torque (Nm)                   39063 non-null  float64
 11  rpm_max_torque                    39063 non-null  int64  
 12  max_

In [31]:
int_encoder2 = OrdinalEncoder()

data_cats2 = df_test[cat_attribs]

num_pipeline2 = Pipeline([('mm_scaler', MinMaxScaler())])

full_pipeline2 = ColumnTransformer([('num', num_pipeline, num_attribs),
                                   ('cat', int_encoder, cat_attribs)])

test_set2 = full_pipeline2.fit_transform(df_test)

data_int2 = int_encoder2.fit_transform(data_cats)

int_encoder2.categories_

[array(['C1', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',
        'C18', 'C19', 'C2', 'C20', 'C21', 'C22', 'C3', 'C4', 'C5', 'C6',
        'C7', 'C8', 'C9'], dtype=object),
 array(['A', 'B1', 'B2', 'C1', 'C2', 'Utility'], dtype=object),
 array(['M1', 'M10', 'M11', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'],
       dtype=object),
 array(['CNG', 'Diesel', 'Petrol'], dtype=object),
 array(['1.0 SCe', '1.2 L K Series Engine', '1.2 L K12N Dualjet',
        '1.5 L U2 CRDi', '1.5 Turbocharged Revotorq',
        '1.5 Turbocharged Revotron', 'F8D Petrol Engine', 'G12B',
        'K Series Dual jet', 'K10C', 'i-DTEC'], dtype=object),
 array(['Automatic', 'Manual'], dtype=object),
 array(['Disc', 'Drum'], dtype=object),
 array(['Electric', 'Manual', 'Power'], dtype=object),
 array([0, 1], dtype=int64)]

In [None]:
test_final = pd.DataFrame(test_set, columns=np.hstack(((num_attribs), (cat_attribs))),
                          index = df_train_d.index)
                           
test_final