In [28]:
# Classification:
    # Naive Bayes
    # Decision Tree
    # Random Forest
    # Lositic Regression
    # Support Vector Machines (or, Support Vector Classification [SVC])
    # Neural Network/Perceptron

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import statsmodels.imputation.mice as mice
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier as KC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

data = pd.read_csv('Car_Insurance_Claim.csv')

#I put the following two lines here to make life easier when testing the code
data.drop(['ID'], axis=1, inplace=True)
data.drop(['POSTAL_CODE'], axis=1, inplace=True)

data.info()

data

for x in data.columns:
    if data[x].dtype == 'object':
        print(data[x].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   AGE                  10000 non-null  object 
 1   GENDER               10000 non-null  object 
 2   RACE                 10000 non-null  object 
 3   DRIVING_EXPERIENCE   10000 non-null  object 
 4   EDUCATION            10000 non-null  object 
 5   INCOME               10000 non-null  object 
 6   CREDIT_SCORE         9018 non-null   float64
 7   VEHICLE_OWNERSHIP    10000 non-null  int64  
 8   VEHICLE_YEAR         10000 non-null  object 
 9   MARRIED              10000 non-null  int64  
 10  CHILDREN             10000 non-null  int64  
 11  ANNUAL_MILEAGE       9043 non-null   float64
 12  VEHICLE_TYPE         10000 non-null  object 
 13  SPEEDING_VIOLATIONS  10000 non-null  int64  
 14  DUIS                 10000 non-null  int64  
 15  PAST_ACCIDENTS       10000 non-null  

In [29]:
#Data Prep: Impute Missing Data Values using the MICE Imputer from statsmodels

data['AGE'].replace({ '65+' : 3, '16-25' : 0, '26-39' : 1, '40-64' : 2 }, inplace=True)
data['GENDER'].replace({'female' : 0, 'male' : 1}, inplace=True)
data['RACE'].replace({'majority' : 0, 'minority' : 1}, inplace=True)
data['DRIVING_EXPERIENCE'].replace({'0-9y' : 0, '10-19y' : 1, '20-29y' : 2, '30y+' : 3}, inplace=True)
data['EDUCATION'].replace({'high school' : 1, 'none' : 0, 'university' : 2}, inplace=True)
data['INCOME'].replace({'upper class' : 3, 'poverty' : 0, 'working class' : 1, 'middle class' : 2}, inplace=True)
data['VEHICLE_YEAR'].replace({'after 2015' : 1, 'before 2015' : 0}, inplace=True)
data['VEHICLE_TYPE'].replace({'sedan' : 0, 'sports car' : 1}, inplace=True)

imp = mice.MICEData(data)
imp.set_imputer('CREDIT_SCORE')
for x in range(0,5):
    imp.update_all()
    if x == 4:
        data = imp.data

data['AGE'].replace({ 3 : '65+' , 0 : '16-25', 1 : '26-39', 2 : '40-64' }, inplace=True)
data['GENDER'].replace({ 0 : 'female', 1 : 'male' }, inplace=True)
data['RACE'].replace({0 : 'majority', 1 : 'minority'}, inplace=True)
data['DRIVING_EXPERIENCE'].replace({ 0 :'0-9y', 1 : '10-19y', 2 : '20-29y', 3 : '30y+'}, inplace=True)
data['EDUCATION'].replace({ 1 : 'high school', 0 : 'none', 2 : 'university'}, inplace=True)
data['INCOME'].replace({3 : 'upper class', 0: 'poverty', 1 : 'working class', 2 : 'middle class'}, inplace=True)
data['VEHICLE_YEAR'].replace({ 1 :'after 2015', 0 : 'before 2015'}, inplace=True)
data['VEHICLE_TYPE'].replace({ 0 :'sedan', 1 : 'sports car'}, inplace=True)        

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   AGE                  10000 non-null  object 
 1   GENDER               10000 non-null  object 
 2   RACE                 10000 non-null  object 
 3   DRIVING_EXPERIENCE   10000 non-null  object 
 4   EDUCATION            10000 non-null  object 
 5   INCOME               10000 non-null  object 
 6   CREDIT_SCORE         10000 non-null  float64
 7   VEHICLE_OWNERSHIP    10000 non-null  int64  
 8   VEHICLE_YEAR         10000 non-null  object 
 9   MARRIED              10000 non-null  int64  
 10  CHILDREN             10000 non-null  int64  
 11  ANNUAL_MILEAGE       10000 non-null  float64
 12  VEHICLE_TYPE         10000 non-null  object 
 13  SPEEDING_VIOLATIONS  10000 non-null  int64  
 14  DUIS                 10000 non-null  int64  
 15  PAST_ACCIDENTS       10000 non-null  

In [30]:
# Split the data into X and Y
all_features = data.loc[:, data.columns != 'OUTPUT']
outcome = data[['OUTCOME']].values

# Build the Column Transformer
ct = ColumnTransformer([('OrdEncode', OrdinalEncoder(), ['AGE', 'INCOME', 'DRIVING_EXPERIENCE']), ('OHEncode', OneHotEncoder(), ['GENDER', 'RACE', 'EDUCATION', 'VEHICLE_YEAR', 'VEHICLE_TYPE'])], remainder='passthrough')
# data_ct.output_indices_

data_ct = ct.fit_transform(all_features)
# Scaling the data 
scaler = StandardScaler()
all_features_scaled = scaler.fit_transform(data_ct)

# train_test_split()
X_train, X_test, y_train, y_test = train_test_split(all_features_scaled, outcome, test_size=.33)

y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [31]:
#Naive_Bayes
scaler = MinMaxScaler()
all_features_minmax = scaler.fit_transform(data_ct)
X_train_minmax, X_test_minmax, y_train_minmax, y_test_minmax = train_test_split(all_features_minmax, outcome, test_size=.33, random_state=1)
classifier = MultinomialNB()

y_train_minmax = np.ravel(y_train_minmax)
y_test_minmax = np.ravel(y_test_minmax)

NB_cv_score = cross_val_score(classifier, X_train_minmax, y_train_minmax, cv=10)

NB_cv_score.mean()

0.98955223880597

In [32]:
# Decision Tree Classifiers

DTC = DecisionTreeClassifier(random_state=1)  
DTC.fit(X_train, y_train)
DTC_base_cv_score = cross_val_score(DTC, X_test, y_test, cv=10)
print(DTC_base_cv_score.mean())

1.0


In [33]:
#Random Forest

RFC = RandomForestClassifier(random_state=1)
RFC.fit(X_train, y_train)
RFC_cv_scores = cross_val_score(RFC, X_test, y_test, cv=10)
RFC_cv_scores.mean()

1.0

In [34]:
#KNN

KNN = neighbors.KNeighborsClassifier(n_neighbors=20)
KNN.fit(X_train, y_train)
cv_scores = cross_val_score(KNN, X_test, y_test, cv=10)

cv_scores.mean()

0.9624242424242423

In [35]:
#use a loop to see if the mean of that is a better value

for n in range(2,26):
    KNN = neighbors.KNeighborsClassifier(n_neighbors=n)
    KNN.fit(X_train, y_train)
    KNN_loop_cv_score = cross_val_score(KNN, X_test, y_test, cv=10)
    print(n, KNN_loop_cv_score.mean())

2 0.9606060606060606
3 0.9724242424242424
4 0.9672727272727272
5 0.9739393939393939
6 0.97
7 0.9727272727272729
8 0.9684848484848485
9 0.9703030303030303
10 0.9672727272727272
11 0.9687878787878788
12 0.966969696969697
13 0.9690909090909091
14 0.9642424242424242
15 0.9660606060606062
16 0.9633333333333333
17 0.9651515151515151
18 0.9624242424242425
19 0.9648484848484848
20 0.9624242424242423
21 0.9630303030303029
22 0.9615151515151515
23 0.9651515151515152
24 0.9615151515151515
25 0.9624242424242425


In [36]:
#SVM (or in this case, SVC)

#the default 'rbf' kernel

C = 1.0
svc = svm.SVC(kernel='rbf', C=C)
svc.fit(X_train, y_train)
cv_scores = cross_val_score(svc, X_test, y_test, cv=10)
cv_scores.mean()

0.999090909090909

In [37]:
#SVM (or in this case, SVC)

#the linear kernel

C = 1.0
svc = svm.SVC(kernel='linear', C=C)
svc.fit(X_train, y_train)
cv_scores = cross_val_score(svc, X_test, y_test, cv=10)
cv_scores.mean()

1.0

In [38]:
#SVM (or in this case, SVC)

#the poly kernel

C = 1.0
svc = svm.SVC(kernel='poly', C=C)
svc.fit(X_train, y_train)
cv_scores = cross_val_score(svc, X_test, y_test, cv=10)
cv_scores.mean()

1.0

In [39]:
#SVM (or in this case, SVC)

#the sigmoid kernel

C = 1.0
svc = svm.SVC(kernel='sigmoid', C=C)
svc.fit(X_train, y_train)
cv_scores = cross_val_score(svc, X_test, y_test, cv=10)
cv_scores.mean()

0.9803030303030302

In [40]:
# Logistic Regression

LR = LogisticRegression()
LR.fit(X_test, y_test)
cv_scores = cross_val_score(LR, X_test, y_test, cv=10)
cv_scores.mean()

1.0

In [41]:
# Neural Network

# Create the NN model
def create_model():
    model = Sequential()
    model.add(Dense(8, kernel_initializer='normal', input_dim=23, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Calculate the estimator
estimator = KC(build_fn=create_model, epochs=100, verbose=0)

# Score it and take the mean
cv_scores = cross_val_score(estimator, X_test, y_test, cv=10)
cv_scores.mean()

1.0