In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets, metrics, tree
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

%load_ext nb_black

<IPython.core.display.Javascript object>

In this notebook, we will first try to find for each model the best parameters and the best accuracy too. We will do this to find out which model has the best accurancy with our dataset

In [4]:
df = pd.read_csv('./data/final.csv')
df.set_index('ID', inplace=True)
Y = df['Target']
df.drop('Target', inplace=True, axis=1)
X = df
X

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Mean_Payment_rate,Rate,Mean_Pay
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,689.0,0.0,0.0,0.0,0.0,0.800000,0.195650,-0.4
2,120000.0,2,2,2,26,-1,2,0,0,0,...,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,0.193583,0.022350,0.6
3,90000.0,2,2,2,34,0,0,0,0,0,...,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0.083968,0.324878,0.0
4,50000.0,2,2,1,37,0,0,0,0,0,...,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0.039794,0.939800,0.0
5,50000.0,1,2,1,57,-1,0,-1,0,0,...,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0.471997,0.172340,-0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,220000.0,1,3,1,39,0,0,0,0,0,...,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0.121471,0.858855,0.0
29997,150000.0,1,3,2,43,-1,-1,-1,-1,0,...,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0.807750,0.011220,-0.8
29998,30000.0,1,2,2,37,4,3,2,-1,0,...,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,0.272225,0.118833,1.6
29999,80000.0,1,3,1,41,1,-1,0,0,0,...,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,0.481511,-0.020563,-0.2


Before everything we have to train our dataset

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=41)

Ok now since our training is completed lets create a parameters dictionary for each model that we want to use:

In [10]:
param_rf = {
    'n_estimators' : [100, 200, 400],
    'max_depth': [4, 8, 12],
    'min_samples_leaf': [0.1, 1, 3, 5],
    'max_features' : ['log2', 'sqrt', None]
}

param_knn = {
    'n_neighbors' : [1, 4, 6],
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree'],
    'leaf_size' : [10, 30, 59]
}

param_tree = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [4, 8, 12],
    'min_samples_leaf' : [1, 3, 5],
    'max_features' : ['auto', 'sqrt', None]
}

param_lr = {
    'penalty' : ['l1', 'l2', 'elasticnet', None],
    'C' : [0.1, 0.5, 1, 2],
    'fit_intercept': [True, False],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

param_mlp = {
    'solver': ['lbfgs', 'sgd', 'adam'],
    'max_iter': [100, 200, 300, 400 ],
    'alpha' : [0.0001, 0.00001, 0.000001],
    'hidden_layer_sizes' : [10, 50, 100],
    'random_state' : [0,1,2,3,4]
}
params = [param_rf, param_knn, param_tree, param_lr, param_mlp]
model = [ RandomForestClassifier(), KNeighborsClassifier(), tree.DecisionTreeClassifier(), LogisticRegression(), MLPClassifier()]
name = ['Random Forest', 'KNN', 'Decision Tree', 'Logistic', 'MLP']

We have created our parameters so now lets create our GridSearchCV and print each result's best

In [11]:
for i in range(5):    
    grid = GridSearchCV(estimator= model[i],
                            param_grid= params[i],
                            cv= 3,
                            scoring= 'accuracy',
                            n_jobs= -1)
    result = grid.fit(X_train, y_train)
    print('{} : {}'.format(name[i], result.best_params_))
    print('{} : {}'.format(name[i], result.best_estimator_))
    print('{} : {}'.format(name[i], result.best_score_))

Random Forest : {'max_depth': 4, 'max_features': None, 'min_samples_leaf': 5, 'n_estimators': 200}
Random Forest : RandomForestClassifier(max_depth=4, max_features=None, min_samples_leaf=5,
                       n_estimators=200)
Random Forest : 0.8201904761904762
KNN : {'algorithm': 'kd_tree', 'leaf_size': 10, 'n_neighbors': 6, 'weights': 'uniform'}
KNN : KNeighborsClassifier(algorithm='kd_tree', leaf_size=10, n_neighbors=6)
KNN : 0.7713809523809524
Decision Tree : {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 3}
Decision Tree : DecisionTreeClassifier(max_depth=4, max_features='sqrt', min_samples_leaf=3)
Decision Tree : 0.8182857142857142
Logistic : {'C': 0.1, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'newton-cg'}
Logistic : LogisticRegression(C=0.1, solver='newton-cg')
Logistic : 0.808952380952381
MLP : {'alpha': 0.0001, 'hidden_layer_sizes': 50, 'max_iter': 100, 'random_state': 3, 'solver': 'sgd'}
MLP : MLPClassifier(hidden_layer_sizes=50

So like we have just seen the best one for this model and these parameters is Random Forest with 82% of accuracy with max_depth = 4, max_features = None, min_samples_leaf = 5, n_estimators = 200