First all relevant packages are imported to the notebook

In [1]:
import pandas as pd
import numpy as np

In [2]:
import sklearn
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Next, the dataset is loaded and a quick preview is shown

In [4]:
df = pd.read_csv("data/train.csv")

In [5]:
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [6]:
#For the categorical values that needed tranformation as explained in the report a transformation is done
#using the map function.
df.Gender = df.Gender.map({"Male":0,"Female":1})
df.Vehicle_Age = df.Vehicle_Age.map({"< 1 Year":0,"1-2 Year":1,"> 2 Years":2})
df.Vehicle_Damage = df.Vehicle_Damage.map({"No":0,"Yes":1})

#For the Region Code and Policy Sales Channel the tranformation is done by assigning the specifical values 
#according to a certain condition
df.Region_Code[df.Region_Code != 28.0] = 0
df.Region_Code[df.Region_Code == 28.0] = 1
df.Policy_Sales_Channel[df.Policy_Sales_Channel != 152.0] = 0
df.Policy_Sales_Channel[df.Policy_Sales_Channel == 152.0] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
#The shape method is called so that we can confirm the number of observations and variables
#present in the dataset
df.shape

(381109, 12)

In [8]:
#According to the report and the data ecploration it was found that the Annual Premium variable contained 
#outliers values for 1% of the customers. This is calculated by the quantile method. 
q = df.Annual_Premium.quantile(0.99)

#Having identified the outliers, they are removed
df = df[df.Annual_Premium < q]

In [9]:
#The shape method is called again to confirm the reduction of observations after removing the outliers
df.shape

(377296, 12)

In [11]:
#As it was noted in the data exploration notebook, the response variable is unbalanced
#To address this issue an under/sampling is performed. 

#first the positive and the negative responses are separated and 
#the ratio of positive to negative answers is calculated
df_positive = df[df.Response == 1]
df_negative = df[df.Response == 0]
r = len(df_positive)/len(df_negative)
r

0.13935762861318807

In [12]:
#For this model it is wanted that the ratio mentioned above to be 1.0. 
#The number of observations for the negative responses is then calculated
len_negative_wanted = len(df_positive)

#Knowing the number of observations wanted, a resampling is performed.
negative_proportion_wanted = len_negative_wanted/(len(df_negative))
df_negative = df_negative.sample(frac = negative_proportion_wanted)

In [13]:
#the ratio of resonses is then recalculated. 
r = len(df_positive)/len(df_negative)
r

1.0

In [14]:
#The datasets are appended again and the value count for the response variable is shown
df = df_positive.append(df_negative) 
df.Response.value_counts()

1    46148
0    46148
Name: Response, dtype: int64

In [15]:
#the variables that will be used to train the model are separated again from the reponse variable 
#because a standar scaler will be fit onto them 

df_X = df[["Gender","Age","Driving_License","Region_Code","Previously_Insured","Vehicle_Age","Vehicle_Damage","Annual_Premium","Policy_Sales_Channel","Vintage"]]

In [16]:
df_X.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,0,44,1,1.0,0,2,1,40454.0,0.0,217
2,0,47,1,1.0,0,2,1,38294.0,0.0,27
7,1,56,1,1.0,0,1,1,32031.0,0.0,72
10,1,47,1,0.0,0,1,1,47576.0,0.0,46
15,0,37,1,0.0,0,1,1,2630.0,0.0,147


In [17]:
#The sacaler transformation is fit and the result is an array of the variables
sc = StandardScaler()
df_X = sc.fit_transform(df_X)

In [18]:
print(df_X)

[[-0.86540432  0.22528235  0.03939248 ...  0.64454929 -0.55710974
   0.75384503]
 [-0.86540432  0.43454966  0.03939248 ...  0.50769247 -0.55710974
  -1.5201936 ]
 [ 1.15552924  1.06235158  0.03939248 ...  0.11087105 -0.55710974
  -0.9816055 ]
 ...
 [ 1.15552924 -1.37910033  0.03939248 ... -0.36743086  1.79497848
   0.70597053]
 [-0.86540432 -1.16983303  0.03939248 ...  0.97604692  1.79497848
   0.58628429]
 [-0.86540432 -0.40251957  0.03939248 ... -1.75196569 -0.55710974
  -0.89782513]]


In [19]:
df_Y = df.Response.to_numpy()

In [20]:
#the response variable is also cnverted to an array.
df_Y

array([1, 1, 1, ..., 0, 0, 0])

In [22]:
#Using the train test split function imported from sklearn
#the train and test datasets are created by a random stratified sample.
#the test set is 10% of the dataset. 
X_train,X_test,y_train,y_test = train_test_split(df_X,df_Y,test_size=.10,random_state=1,stratify=df_Y)

In [23]:
#The test sets are separated. The test(X) set will be uploaded to s3 to be used for predictions
#While the test_y is saved to compare the predictions later on. 
test_y = pd.DataFrame(y_test)
test = pd.DataFrame(X_test)

#the train datasets are merged into a single set, with the response in the first column as it is needed 
#by the xgb algorithm.
train = pd.concat([pd.DataFrame(y_train),pd.DataFrame(X_train)],axis = 1)


#Out of the train dataset a 20% proportion is taken as a validation dataset. This one will be provided
#directly to the algorithm so it can minimize the training. 
n = round(len(train)*0.2)
validation = train[:n]
train = train[n:]

In [26]:
#Finally, all datasets are saved into the local directory.
test.to_csv("processed_data_2/test.csv", header=False, index=False)
validation.to_csv("processed_data_2/validation.csv", header=False, index=False)
train.to_csv("processed_data_2/train.csv", header=False, index=False)
test_y.to_csv("processed_data_2/test_y.csv", header=False, index=False)