In [10]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

In [11]:
# Read the training and test splits from the local dataset/ directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

In [12]:
# Show the (rows, columns) shape of the test frame as a quick sanity check.
test_data.shape

(127037, 11)

In [13]:
# Peek at five random training rows. The seed is fixed so that the rows shown
# are the same on every Restart & Run All.
train_data.sample(5, random_state=1)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
282976,282977,Female,24,1,11.0,1,< 1 Year,No,27003.0,152.0,160,0
290108,290109,Female,30,1,8.0,1,< 1 Year,No,29590.0,152.0,153,0
130063,130064,Male,63,1,28.0,0,> 2 Years,Yes,35518.0,122.0,173,1
346835,346836,Female,74,1,46.0,0,1-2 Year,Yes,2630.0,156.0,206,0
286235,286236,Female,47,1,28.0,0,1-2 Year,Yes,21204.0,26.0,234,0


In [14]:
# Use the 'id' column as the row index so it is not treated as a feature.
# The guard makes the cell safe to re-run: once 'id' has been moved into the
# index it is no longer a column and an unconditional set_index('id') would
# raise a KeyError. Reassigning (instead of inplace=True) keeps the step
# explicit.
if 'id' in train_data.columns:
    train_data = train_data.set_index('id')

In [15]:
# Print column names, non-null counts and dtypes; used to check for missing
# values and to see which columns are categorical (object dtype).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 381109 entries, 1 to 381109
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                381109 non-null  object 
 1   Age                   381109 non-null  int64  
 2   Driving_License       381109 non-null  int64  
 3   Region_Code           381109 non-null  float64
 4   Previously_Insured    381109 non-null  int64  
 5   Vehicle_Age           381109 non-null  object 
 6   Vehicle_Damage        381109 non-null  object 
 7   Annual_Premium        381109 non-null  float64
 8   Policy_Sales_Channel  381109 non-null  float64
 9   Vintage               381109 non-null  int64  
 10  Response              381109 non-null  int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 34.9+ MB


In [16]:
# Separate the target ('Response') from the predictor columns.
y_train = train_data['Response']
X_train = train_data.drop(columns='Response')

In [17]:
# Dummy (one-hot) encode the categorical (object-dtype) columns.
cat_columns = train_data.select_dtypes(include=['object']).columns

X_train = pd.get_dummies(
    X_train,
    columns=cat_columns,
    drop_first=True)

X_test = pd.get_dummies(
    test_data,
    columns=cat_columns,
    drop_first=True)

# Force the test features onto exactly the training columns, in the same order.
# This drops columns the models were never trained on (test_data still has 'id'
# as an ordinary column, whereas the training frame carries it in the index)
# and adds, filled with 0, any dummy column whose category does not occur in
# the test split. Without this step the scaler/models receive a different
# number of features at transform/predict time than at fit time.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Rescale every feature to the [0, 1] range.
# The scaler learns each column's min/max from the training features only;
# that same mapping is then applied to both the training and test features.
# NOTE: the results are stored under the lowercase names x_train / x_test,
# which are separate objects from X_train / X_test.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)

x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

In [18]:
# Find the best model in this problem
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [20]:
# Compare baseline classifiers with stratified 10-fold cross-validation.
#
# Every candidate is wrapped in a pipeline with a MinMaxScaler, so the features
# the models see are actually scaled, and the scaler is re-fitted on the
# training part of each fold only (no information from the held-out part).
models = [
    # 'liblinear' only supports one-vs-rest, so the deprecated multi_class
    # argument is redundant and is omitted to avoid the deprecation warning.
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    # Seeded so that repeated runs give the same scores.
    ('DTC', DecisionTreeClassifier(random_state=1)),
    ('GNB', GaussianNB()),
    # A kernel SVC's fit time grows at least quadratically with the number of
    # samples, which is impractical for a training set of this size repeated
    # over 10 folds; a linear SVM is used for the 'SVM' entry instead.
    ('SVM', LinearSVC(random_state=1)),
]

# The splitter does not depend on the model, so build it once. With a fixed
# random_state every model is evaluated on the same folds.
kfold = StratifiedKFold(
    n_splits=10,
    random_state=1,
    shuffle=True)

names = []
results = []

# NOTE(review): accuracy can be misleading if 'Response' is imbalanced (a
# near-constant score with ~0 std across folds is typical of always predicting
# the majority class) - consider also reporting 'roc_auc' or
# 'balanced_accuracy'.
for name, model in models:
    pipeline = make_pipeline(MinMaxScaler(feature_range=(0, 1)), model)

    cv_results = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=kfold,
        scoring='accuracy',
        n_jobs=-1)  # evaluate the folds in parallel

    names.append(name)
    results.append(cv_results)
    print(f"{name} --> mean:{cv_results.mean():.4f}, std: {cv_results.std():.4f}")

LR-> mean:0.8774, std: 0.0000


KeyboardInterrupt: 