In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import *
from sklearn.model_selection import *

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [4]:
# df = pd.read_parquet('data.parquet')

In [5]:
df = pd.read_parquet('./parquet/kess.parquet')

Drop unnecessary columns

In [6]:
df = df.drop(['ID', 'Dt_Customer'], axis=1).reset_index(drop=True)

## Modelling

In [7]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [8]:
X = df.drop('Response', axis=1)
y = df.Response

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=12)

In [10]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [11]:
pf = PolynomialFeatures(interaction_only=True)
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

In [13]:
def assess_cv(model, name):
  accuracy = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
  precision = cross_val_score(model, X_train, y_train, cv=skf, scoring='precision')
  recall = cross_val_score(model, X_train, y_train, cv=skf, scoring='recall')
  f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1')
  roc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')

  eval = {name: {} for n in range(5)}
  eval[name]['Accuracy'] = accuracy.mean()
  eval[name]['Precision'] = precision.mean()
  eval[name]['Recall'] = recall.mean()
  eval[name]['F1 Score'] = f1.mean()
  eval[name]['ROC AUC'] = roc.mean()
  return pd.DataFrame(eval).T

In [14]:
overview = pd.DataFrame({
  'Accuracy': [],
  'Precision': [],
  'Recall': [],
  'F1 Score': [],
  'ROC AUC': []
})

### Logistic Regression

In [15]:
params = {
  'max_iter': [100, 200, 500, 1000]
}

In [16]:
clf_lr = GridSearchCV(
  estimator=LogisticRegression(),
  param_grid=params,
  scoring='f1',
  cv=skf
)

In [17]:
clf_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [18]:
clf_lr.best_estimator_

In [19]:
pd.DataFrame(clf_lr.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.106655,0.023766,0.002998,0.000633,100,{'max_iter': 100},0.4,0.461538,0.407767,0.4,0.461538,0.426169,0.029018,4
1,0.115001,0.005329,0.002799,0.0004,200,{'max_iter': 200},0.383838,0.461538,0.411765,0.4,0.47619,0.426666,0.035878,1
2,0.176783,0.050006,0.002595,0.000488,500,{'max_iter': 500},0.383838,0.461538,0.411765,0.4,0.47619,0.426666,0.035878,1
3,0.123331,0.005756,0.002597,0.00049,1000,{'max_iter': 1000},0.383838,0.461538,0.411765,0.4,0.47619,0.426666,0.035878,1


In [20]:
# scores = cross_val_score(clf_lr.best_estimator_, X_train, y_train, cv=skf, scoring='roc_auc')
assess_cv(clf_lr.best_estimator_, 'Logistic Regression')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Logistic Regression,0.832386,0.426666,0.439364,0.779308,0.415094


In [21]:
overview = pd.concat([overview, assess_cv(clf_lr.best_estimator_, 'Logistic Regression')])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### SVM

In [22]:
params = {
  'kernel': ['poly', 'rbf', 'sigmoid'],
  'gamma': ['scale', 'auto']
}

In [23]:
clf_svc = GridSearchCV(
  estimator=SVC(),
  param_grid=params,
  scoring='f1',
  cv=skf
)

In [24]:
clf_svc.fit(X_train, y_train)

In [25]:
clf_svc.best_estimator_

In [26]:
pd.DataFrame(clf_svc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.213997,0.019943,0.036193,0.001934,scale,poly,"{'gamma': 'scale', 'kernel': 'poly'}",0.09375,0.184615,0.133333,0.123077,0.193548,0.145665,0.037861,6
1,0.207052,0.007784,0.092567,0.00135,scale,rbf,"{'gamma': 'scale', 'kernel': 'rbf'}",0.163934,0.222222,0.253968,0.246154,0.333333,0.243922,0.054716,2
2,0.159879,0.004367,0.024598,0.0008,scale,sigmoid,"{'gamma': 'scale', 'kernel': 'sigmoid'}",0.190476,0.176471,0.196721,0.231884,0.242424,0.207595,0.025231,4
3,0.208036,0.006759,0.0356,0.002785,auto,poly,"{'gamma': 'auto', 'kernel': 'poly'}",0.205882,0.181818,0.222222,0.179104,0.268657,0.211537,0.032692,3
4,0.207031,0.007017,0.094415,0.001861,auto,rbf,"{'gamma': 'auto', 'kernel': 'rbf'}",0.163934,0.222222,0.28125,0.246154,0.358209,0.254354,0.064474,1
5,0.153633,0.017348,0.026007,0.002098,auto,sigmoid,"{'gamma': 'auto', 'kernel': 'sigmoid'}",0.1875,0.2,0.190476,0.181818,0.242424,0.200444,0.021801,5


In [27]:
# scores = cross_val_score(clf_svc.best_estimator_, X_train, y_train, cv=skf, scoring='precision')
assess_cv(clf_svc.best_estimator_, 'SVM')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
SVM,0.864773,0.254354,0.733398,0.795804,0.154717


In [28]:
overview = pd.concat([overview, assess_cv(clf_svc.best_estimator_, 'SVM')])

### Naive Bayes

In [29]:
clf_nb = GaussianNB()

In [30]:
# scores = cross_val_score(clf_nb, X_train, y_train, cv=skf, scoring='f1')
assess_cv(clf_nb, 'Naive Bayes')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Naive Bayes,0.701705,0.373111,0.279711,0.683997,0.584906


In [31]:
overview = pd.concat([overview, assess_cv(clf_nb, 'Naive Bayes')])

### Decision Tree

In [32]:
params = {
  'criterion': ['gini', 'entropy', 'log_loss'],
  'splitter': ['best', 'random']
}

In [33]:
clf_dt = GridSearchCV(
  estimator=DecisionTreeClassifier(),
  param_grid=params,
  scoring='f1',
  cv=skf
)

In [34]:

clf_dt.fit(X_train, y_train)

In [35]:
clf_dt.best_estimator_

In [36]:
pd.DataFrame(clf_dt.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.590757,0.115356,0.002601,0.000491,gini,best,"{'criterion': 'gini', 'splitter': 'best'}",0.37037,0.396552,0.277228,0.333333,0.383838,0.352264,0.04307,4
1,0.078805,0.008209,0.002123,0.000475,gini,random,"{'criterion': 'gini', 'splitter': 'random'}",0.371681,0.366972,0.448598,0.410714,0.389381,0.397469,0.029833,1
2,0.407265,0.01464,0.002794,0.000398,entropy,best,"{'criterion': 'entropy', 'splitter': 'best'}",0.358491,0.283019,0.466019,0.4,0.47619,0.396744,0.07144,2
3,0.06121,0.005029,0.002794,0.000397,entropy,random,"{'criterion': 'entropy', 'splitter': 'random'}",0.40708,0.344262,0.31068,0.344086,0.317757,0.344773,0.033986,6
4,0.430875,0.015228,0.003103,0.000469,log_loss,best,"{'criterion': 'log_loss', 'splitter': 'best'}",0.299065,0.326923,0.446602,0.4,0.367347,0.367987,0.052234,3
5,0.058774,0.00104,0.002788,0.000394,log_loss,random,"{'criterion': 'log_loss', 'splitter': 'random'}",0.299065,0.39604,0.30303,0.327273,0.40708,0.346498,0.046116,5


In [37]:
assess_cv(clf_dt.best_estimator_, 'Decision Tree')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Decision Tree,0.821023,0.401175,0.371596,0.62904,0.403774


In [38]:
overview = pd.concat([overview, assess_cv(clf_dt.best_estimator_, 'Decision Tree')])

### K-Nearest Neighbors

In [39]:
params = {
  'n_neighbors': [1, 2, 5, 7, 10],
  'weights': ['uniform', 'distance'],
  'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [40]:
clf_kn = GridSearchCV(
  estimator=KNeighborsClassifier(),
  param_grid=params,
  scoring='f1',
  cv=skf
)

In [41]:
clf_kn.fit(X_train, y_train)

In [42]:
clf_kn.best_estimator_

In [43]:
pd.DataFrame(clf_kn.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009198,0.003124,0.07,0.112004,auto,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.333333,0.325581,0.266667,0.306122,0.43299,0.332939,0.055092,5
1,0.008199,0.00204,0.013001,0.000895,auto,1,distance,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.333333,0.325581,0.266667,0.306122,0.43299,0.332939,0.055092,5
2,0.005787,0.000971,0.011622,0.00048,auto,2,uniform,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.153846,0.276923,0.126984,0.095238,0.184615,0.167521,0.062155,33
3,0.0076,0.001017,0.013191,0.001174,auto,2,distance,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.333333,0.329412,0.266667,0.309278,0.43299,0.334336,0.054708,1
4,0.006599,0.0008,0.012999,0.002,auto,5,uniform,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.173913,0.276923,0.264706,0.144928,0.26087,0.224268,0.053996,25
5,0.006596,0.001196,0.011529,0.000451,auto,5,distance,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.246575,0.318841,0.328767,0.246575,0.32,0.292152,0.037371,13
6,0.007201,0.001471,0.013199,0.001719,auto,7,uniform,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.1875,0.333333,0.196721,0.126984,0.166667,0.202241,0.069803,29
7,0.007399,0.00102,0.0108,0.001167,auto,7,distance,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.26087,0.366197,0.298507,0.238806,0.272727,0.287421,0.043847,17
8,0.006199,0.000748,0.012002,0.001096,auto,10,uniform,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.135593,0.169492,0.140351,0.068966,0.036364,0.110153,0.04944,37
9,0.007201,0.000748,0.013795,0.000745,auto,10,distance,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.21875,0.272727,0.298507,0.1875,0.21875,0.239247,0.040375,21


In [44]:
assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
K-Nearest Neighbors,0.826705,0.334336,0.397082,0.662971,0.290566


In [45]:
overview = pd.concat([overview, assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')])

In [46]:
overview

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.832386,0.439364,0.415094,0.426666,0.779308
SVM,0.864773,0.733398,0.154717,0.254354,0.795804
Naive Bayes,0.701705,0.279711,0.584906,0.373111,0.683997
Decision Tree,0.811364,0.39713,0.396226,0.390997,0.646249
K-Nearest Neighbors,0.826705,0.397082,0.290566,0.334336,0.662971
