In [788]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [789]:
from sklearn.preprocessing import *
from sklearn.model_selection import *

In [790]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [791]:
df = pd.read_parquet('./parquet/preprocessed.parquet')

Drop unnecessary columns


In [792]:
df = df.drop(['ID', 'Dt_Customer', 'Kidhome', 'Teenhome'],
             axis=1).reset_index(drop=True)

In [793]:
columns = df.columns
columns_to_drop = [col for col in columns if col.startswith('Marital_')]
df = df.drop(columns=columns_to_drop)

## Modelling


In [794]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [795]:
X = df.drop('Response', axis=1)
y = df.Response

pd.set_option('display.max_columns', 50)
X

Unnamed: 0,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Total_children,Days_Since_Customer,Education_Basic,Education_Graduation,Education_Master,Education_PhD,A_Marital_Status_Married_Kidhome,A_Marital_Status_Married_Teenhome,A_Marital_Status_Single_Kidhome,A_Marital_Status_Single_Teenhome,A_Marital_Status_Together_Kidhome,A_Marital_Status_Together_Teenhome,A_Marital_Status_Widow_Kidhome,A_Marital_Status_Widow_Teenhome
0,1961,57091.0,0,464,5,64,7,0,37,1,7,3,7,5,0,0,3542,False,True,False,False,0,0,0,0,0,0,0,0
1,1958,67267.0,0,134,11,59,15,2,30,1,3,2,5,2,0,1,3575,False,True,False,False,0,1,0,0,0,0,0,0
2,1967,32474.0,0,10,0,1,0,0,0,1,1,0,2,7,0,2,3399,False,True,False,False,0,0,0,0,1,1,0,0
3,1989,21474.0,0,6,16,24,11,0,34,2,3,1,2,7,0,1,3492,False,True,False,False,0,0,1,0,0,0,0,0
4,1967,44931.0,0,78,0,11,0,0,7,1,2,1,3,5,0,1,3690,False,True,False,False,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,1974,20130.0,99,0,6,3,7,6,12,1,1,0,3,8,0,0,3632,False,False,False,False,0,0,0,0,0,0,0,0
1210,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3402,False,True,False,False,0,0,0,0,0,0,0,1
1211,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3402,False,True,False,False,0,0,0,0,0,0,0,1
1212,1977,31056.0,99,5,10,13,3,8,16,1,1,0,3,8,0,1,4051,False,False,False,False,1,0,0,0,0,0,0,0


In [796]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=12)

In [797]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [798]:
pf = PolynomialFeatures(interaction_only=True)
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

In [799]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

In [800]:
def assess_cv(model, name):
    accuracy = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='accuracy')
    precision = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='precision')
    recall = cross_val_score(model, X_train, y_train, cv=skf, scoring='recall')
    f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1')
    roc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')

    eval = {name: {} for n in range(5)}
    eval[name]['Accuracy'] = accuracy.mean()
    eval[name]['Precision'] = precision.mean()
    eval[name]['Recall'] = recall.mean()
    eval[name]['F1 Score'] = f1.mean()
    eval[name]['ROC AUC'] = roc.mean()
    return pd.DataFrame(eval).T

In [801]:
overview = pd.DataFrame({
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'ROC AUC': []
})

### Logistic Regression


In [802]:
params = {
    'max_iter': [100, 200, 500, 1000]
}

In [803]:
clf_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [804]:
clf_lr.fit(X_train, y_train)

In [805]:
clf_lr.best_estimator_

In [806]:
pd.DataFrame(clf_lr.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.054637,0.003118,0.002205,0.000398,100,{'max_iter': 100},0.5,0.439024,0.5,0.384615,0.387097,0.442147,0.051077,1
1,0.069936,0.013726,0.002799,0.000749,200,{'max_iter': 200},0.5,0.439024,0.5,0.384615,0.387097,0.442147,0.051077,1
2,0.062579,0.002446,0.002599,0.00049,500,{'max_iter': 500},0.5,0.439024,0.5,0.384615,0.387097,0.442147,0.051077,1
3,0.055128,0.001382,0.001804,0.000393,1000,{'max_iter': 1000},0.5,0.439024,0.5,0.384615,0.387097,0.442147,0.051077,1


In [807]:
# scores = cross_val_score(clf_lr.best_estimator_, X_train, y_train, cv=skf, scoring='roc_auc')
assess_cv(clf_lr.best_estimator_, 'Logistic Regression')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Logistic Regression,0.901137,0.442147,0.486478,0.786696,0.433333


In [808]:
overview = pd.concat([overview, assess_cv(
    clf_lr.best_estimator_, 'Logistic Regression')])

### SVM


In [809]:
params = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

In [810]:
clf_svc = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [811]:
clf_svc.fit(X_train, y_train)

In [812]:
clf_svc.best_estimator_

In [813]:
pd.DataFrame(clf_svc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.030494,0.003912,0.007313,0.000595,scale,poly,"{'gamma': 'scale', 'kernel': 'poly'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,0.025112,0.001116,0.014105,0.000219,scale,rbf,"{'gamma': 'scale', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2,0.018718,0.000871,0.00499,1.9e-05,scale,sigmoid,"{'gamma': 'scale', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.027525,0.002311,0.006782,0.000403,auto,poly,"{'gamma': 'auto', 'kernel': 'poly'}",0.095238,0.0,0.083333,0.0,0.0,0.035714,0.043903,1
4,0.025821,0.000935,0.014605,0.000491,auto,rbf,"{'gamma': 'auto', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
5,0.015913,0.000644,0.005003,2.3e-05,auto,sigmoid,"{'gamma': 'auto', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.105263,0.021053,0.042105,2


In [814]:
# scores = cross_val_score(clf_svc.best_estimator_, X_train, y_train, cv=skf, scoring='precision')
assess_cv(clf_svc.best_estimator_, 'SVM')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
SVM,0.89907,0.035714,0.1,0.749592,0.022222


In [815]:
overview = pd.concat([overview, assess_cv(clf_svc.best_estimator_, 'SVM')])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Naive Bayes


In [816]:
clf_nb = GaussianNB()

In [817]:
# scores = cross_val_score(clf_nb, X_train, y_train, cv=skf, scoring='f1')
assess_cv(clf_nb, 'Naive Bayes')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Naive Bayes,0.304869,0.169004,0.095007,0.543833,0.766667


In [818]:
overview = pd.concat([overview, assess_cv(clf_nb, 'Naive Bayes')])

### Decision Tree


In [819]:
params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random']
}

In [820]:
clf_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [821]:

clf_dt.fit(X_train, y_train)

In [822]:
clf_dt.best_estimator_

In [823]:
pd.DataFrame(clf_dt.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.143382,0.022264,0.001602,0.000485,gini,best,"{'criterion': 'gini', 'splitter': 'best'}",0.410256,0.421053,0.35,0.3125,0.322581,0.363278,0.044623,4
1,0.017802,0.002655,0.002113,0.00021,gini,random,"{'criterion': 'gini', 'splitter': 'random'}",0.294118,0.444444,0.478261,0.333333,0.195122,0.349056,0.102717,6
2,0.090351,0.004715,0.002003,6e-06,entropy,best,"{'criterion': 'entropy', 'splitter': 'best'}",0.307692,0.4,0.380952,0.545455,0.285714,0.383963,0.09145,2
3,0.014607,0.000794,0.002001,1.1e-05,entropy,random,"{'criterion': 'entropy', 'splitter': 'random'}",0.263158,0.388889,0.352941,0.4,0.363636,0.353725,0.04833,5
4,0.089633,0.004693,0.002002,1.2e-05,log_loss,best,"{'criterion': 'log_loss', 'splitter': 'best'}",0.324324,0.461538,0.363636,0.540541,0.25,0.388008,0.102341,1
5,0.015401,0.001024,0.001509,0.000644,log_loss,random,"{'criterion': 'log_loss', 'splitter': 'random'}",0.333333,0.333333,0.52381,0.277778,0.4,0.373651,0.08448,3


In [824]:
assess_cv(clf_dt.best_estimator_, 'Decision Tree')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Decision Tree,0.882612,0.391121,0.391404,0.682399,0.422222


In [825]:
overview = pd.concat([overview, assess_cv(
    clf_dt.best_estimator_, 'Decision Tree')])

### K-Nearest Neighbors


In [826]:
params = {
    'n_neighbors': [1, 2, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [827]:
clf_kn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [828]:
clf_kn.fit(X_train, y_train)

In [829]:
clf_kn.best_estimator_

In [830]:
pd.DataFrame(clf_kn.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003201,0.0004,0.019707,0.002893,auto,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.285714,0.375,0.162162,0.285714,0.375,0.296718,0.078235,5
1,0.002709,0.000745,0.016403,0.004323,auto,1,distance,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.285714,0.375,0.162162,0.285714,0.375,0.296718,0.078235,5
2,0.002599,0.000489,0.013313,0.001089,auto,2,uniform,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.095238,0.1,0.090909,0.190476,0.285714,0.152468,0.07618,25
3,0.002596,0.00049,0.013974,0.002538,auto,2,distance,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.285714,0.375,0.171429,0.285714,0.375,0.298571,0.075071,1
4,0.002502,0.000447,0.012743,0.00155,auto,5,uniform,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.0,0.1,0.0,0.105263,0.2,0.081053,0.075143,29
5,0.002203,0.000399,0.013502,0.000632,auto,5,distance,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.230769,0.32,0.1,0.190476,0.363636,0.240976,0.093615,13
6,0.002309,0.0004,0.012198,0.001174,auto,7,uniform,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.0,0.0,0.0,0.0,0.105263,0.021053,0.042105,33
7,0.002403,0.000492,0.012438,0.000467,auto,7,distance,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.25,0.25,0.105263,0.1,0.285714,0.198195,0.079127,21
8,0.001802,0.0004,0.0108,0.000979,auto,10,uniform,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
9,0.001803,0.000399,0.0116,0.00102,auto,10,distance,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.25,0.26087,0.105263,0.1,0.285714,0.200369,0.080655,17


In [831]:
assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
K-Nearest Neighbors,0.882606,0.298571,0.345546,0.627993,0.266667


In [832]:
overview = pd.concat([overview, assess_cv(
    clf_kn.best_estimator_, 'K-Nearest Neighbors')])

In [833]:
overview

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.901137,0.486478,0.433333,0.442147,0.786696
SVM,0.89907,0.1,0.022222,0.035714,0.749592
Naive Bayes,0.304869,0.095007,0.766667,0.169004,0.543833
Decision Tree,0.878488,0.434524,0.333333,0.399519,0.67555
K-Nearest Neighbors,0.882606,0.345546,0.266667,0.298571,0.627993
