## Imports

In [1]:
import pandas as pd
import numpy as np

In [25]:
from sklearn.preprocessing import StandardScaler

In [75]:
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
# Read the train and test tables written to ../data/process (the "_EDA" CSV files).
train, test = (
    pd.read_csv("../data/process/train_EDA.csv"),
    pd.read_csv("../data/process/test_EDA.csv"),
)

In [4]:
# Preview the first rows of the training data (includes the is_claim target column).
train.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim,max_torque_Nm,max_torque_rpm,max_power_bhp,max_power_rpm
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,False,False,False,True,0,False,60.0,3500.0,40.36,6000.0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,False,False,False,True,0,False,60.0,3500.0,40.36,6000.0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,False,False,False,True,0,False,60.0,3500.0,40.36,6000.0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,True,True,True,True,2,False,113.0,4400.0,88.5,6000.0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,False,True,True,True,2,False,91.0,4250.0,67.06,5500.0


In [5]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,max_torque_Nm,max_torque_rpm,max_power_bhp,max_power_rpm
0,ID58593,0.341732,0.0,0.586538,C3,4076,1,A,M1,CNG,...,True,False,False,False,True,0,60.0,3500.0,40.36,6000.0
1,ID58594,0.307241,0.13,0.442308,C8,8794,1,B2,M6,Petrol,...,True,True,True,True,True,2,113.0,4400.0,88.5,6000.0
2,ID58595,0.327924,0.12,0.451923,C8,8794,2,A,M3,Petrol,...,True,False,True,True,True,2,91.0,4250.0,67.06,5500.0
3,ID58596,0.782654,0.01,0.461538,C5,34738,1,A,M1,CNG,...,True,False,False,False,True,0,60.0,3500.0,40.36,6000.0
4,ID58597,1.233404,0.02,0.634615,C5,34738,1,A,M1,CNG,...,True,False,False,False,True,0,60.0,3500.0,40.36,6000.0


In [62]:
# `make` holds integer codes (1, 2, ...), so it is cast to a categorical dtype;
# pd.get_dummies only one-hot encodes object/string/category columns by default.
train = train.astype({"make": "category"})
test = test.astype({"make": "category"})

## Creating dummy features

In [63]:
# One-hot encode the categorical columns. policy_id is dropped before encoding so the
# row identifier is not expanded into one dummy column per policy.
train_dummies = pd.get_dummies(train.drop(columns=["policy_id"]))
test_dummies = pd.get_dummies(test.drop(columns=["policy_id"]))

# The two frames are encoded independently, so a category level that appears in only
# one of them would silently produce different feature columns. Fail fast here rather
# than at scaling or prediction time. The target (is_claim) is left out of the
# comparison because it exists only in the training data.
feature_columns = train_dummies.columns.drop("is_claim")
mismatched_columns = feature_columns.symmetric_difference(test_dummies.columns)
assert mismatched_columns.empty, (
    f"train/test dummy columns differ: {mismatched_columns.tolist()}"
)

# Same feature columns in the same order as the training frame
# (a no-op when the order already matches).
test_dummies = test_dummies[feature_columns]

In [64]:
# Row/column counts after encoding, one frame per line (train first, then test).
print(train_dummies.shape, test_dummies.shape, sep="\n")

(58592, 119)
(39063, 118)


## Standardization

### Split datasets into numeric and non-numeric columns

In [65]:
# Partition the training columns by dtype: int/float columns are the ones that get
# scaled; every other column (e.g. boolean flags such as is_claim) is set aside and
# left untouched.
train_dummies_numeric, train_dummies_non_numeric = (
    train_dummies.select_dtypes(include=["int", "float"]),
    train_dummies.select_dtypes(exclude=["int", "float"]),
)

In [66]:
# Same dtype-based partition for the test columns: int/float columns to be scaled,
# everything else kept as is.
test_dummies_numeric, test_dummies_non_numeric = (
    test_dummies.select_dtypes(include=["int", "float"]),
    test_dummies.select_dtypes(exclude=["int", "float"]),
)

### Instantiate scaler

In [67]:
# One scaler instance is shared by both datasets: it is fitted on the training
# columns and then reused, without refitting, on the test columns.
scaler = StandardScaler()

### Fit and transform training data

In [68]:
# Fit the scaler on the numeric training columns and scale them in the same call.
# NOTE(review): the fit uses every training row, including the rows that are later
# held out as the validation subset by train_test_split, so the validation data
# influences the scaling statistics. Consider fitting only on the training part of
# the split (e.g. scaler inside a Pipeline) — confirm whether this is acceptable.
train_dummies_numeric_scaled = scaler.fit_transform(train_dummies_numeric)

### Join with non-numeric training data

In [73]:
# The scaler output carries no labels by default, so restore both the column names
# and the original row index. pd.concat(axis=1) aligns rows on index labels: without
# index=..., the rebuilt frame gets a fresh 0..n-1 index and the join would be
# misaligned (extra NaN-padded rows) whenever the source index is not already 0..n-1.
train_dummies_numeric_scaled = pd.DataFrame(
    train_dummies_numeric_scaled,
    columns=train_dummies_numeric.columns,
    index=train_dummies_numeric.index,
)
# Non-numeric columns first, scaled numeric columns after.
train_pre = pd.concat([train_dummies_non_numeric, train_dummies_numeric_scaled], axis=1)
train_pre.shape

(58592, 119)

### Transform testing data

In [71]:
# Scale the test columns with the statistics already fitted on the training data
# (transform only; the scaler is not refitted here).
test_dummies_numeric_scaled = scaler.transform(test_dummies_numeric)

### Join with non-numeric testing data

In [74]:
# The scaler output carries no labels by default, so restore both the column names
# and the original row index. pd.concat(axis=1) aligns rows on index labels: without
# index=..., the rebuilt frame gets a fresh 0..n-1 index and the join would be
# misaligned (extra NaN-padded rows) whenever the source index is not already 0..n-1.
test_dummies_numeric_scaled = pd.DataFrame(
    test_dummies_numeric_scaled,
    columns=test_dummies_numeric.columns,
    index=test_dummies_numeric.index,
)
# Non-numeric columns first, scaled numeric columns after.
test_pre = pd.concat([test_dummies_non_numeric, test_dummies_numeric_scaled], axis=1)
test_pre.shape

(39063, 118)

## Split data into train and validation sets

The test dataset that I have been working with does NOT contain labels (my target variable).

To evaluate the models and tune their hyperparameters, I will split the training data into train and validation subsets.

### Select X (predictor variables) and y (target variable)

In [79]:
# Target column first, then every remaining column as a predictor.
y = train_pre["is_claim"]
X = train_pre.drop(columns="is_claim")

### Train Test Split

In [82]:
# Hold out 25% of the labelled rows as the validation subset (named X_test / y_test
# here). The split is stratified on the target and seeded so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=24,
)
# One shape per line: X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(43944, 118)
(14648, 118)
(43944,)
(14648,)


## Save data

### Train Test Split Data

In [88]:
# Write each part of the train/validation split to ../data/final.
# index=False keeps the row index out of the CSV files.
for split_part, csv_path in (
    (X_train, "../data/final/X_train_split.csv"),
    (X_test, "../data/final/X_test_split.csv"),
    (y_train, "../data/final/y_train_split.csv"),
    (y_test, "../data/final/y_test_split.csv"),
):
    split_part.to_csv(csv_path, index=False)

### Save the full preprocessed training set, for cross-validation

In [89]:
# Full preprocessed training frame (still contains the is_claim target column);
# index=False keeps the row index out of the CSV file.
train_pre.to_csv("../data/final/train_preprocessed.csv", index=False)

### Save the preprocessed test set, for final predictions

In [90]:
# Preprocessed test frame; index=False keeps the row index out of the CSV file.
test_pre.to_csv("../data/final/test_preprocessed.csv", index=False)