In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import *
from sklearn.model_selection import *

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [4]:
df = pd.read_parquet('./parquet/preprocessed.parquet')

Drop unnecessary columns


In [5]:
df = df.drop(['ID', 'Dt_Customer', 'Kidhome', 'Teenhome'],
             axis=1).reset_index(drop=True)

In [6]:
columns = df.columns
columns_to_drop = [col for col in columns if col.startswith('Marital_')]
df = df.drop(columns=columns_to_drop)

## Modelling


In [7]:
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.linear_model import *

In [8]:
X = df.drop('Response', axis=1)
y = df.Response

pd.set_option('display.max_columns', 50)
X

Unnamed: 0,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Total_Children,Days_Since_Customer,Education_Basic,Education_Graduation,Education_Master,Education_PhD,A_Marital_Status_Married_Kidhome,A_Marital_Status_Married_Teenhome,A_Marital_Status_Single_Kidhome,A_Marital_Status_Single_Teenhome,A_Marital_Status_Together_Kidhome,A_Marital_Status_Together_Teenhome,A_Marital_Status_Widow_Kidhome,A_Marital_Status_Widow_Teenhome
0,1961,57091.0,0,464,5,64,7,0,37,1,7,3,7,5,0,0,3544,False,True,False,False,0,0,0,0,0,0,0,0
1,1958,67267.0,0,134,11,59,15,2,30,1,3,2,5,2,0,1,3577,False,True,False,False,0,1,0,0,0,0,0,0
2,1967,32474.0,0,10,0,1,0,0,0,1,1,0,2,7,0,2,3401,False,True,False,False,0,0,0,0,1,1,0,0
3,1989,21474.0,0,6,16,24,11,0,34,2,3,1,2,7,0,1,3494,False,True,False,False,0,0,1,0,0,0,0,0
4,1967,44931.0,0,78,0,11,0,0,7,1,2,1,3,5,0,1,3692,False,True,False,False,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,1974,20130.0,99,0,6,3,7,6,12,1,1,0,3,8,0,0,3634,False,False,False,False,0,0,0,0,0,0,0,0
1210,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3404,False,True,False,False,0,0,0,0,0,0,0,1
1211,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3404,False,True,False,False,0,0,0,0,0,0,0,1
1212,1977,31056.0,99,5,10,13,3,8,16,1,1,0,3,8,0,1,4053,False,False,False,False,1,0,0,0,0,0,0,0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=12)

In [10]:
pf = PolynomialFeatures(interaction_only=True)
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

In [13]:
def assess_cv(model, name):
    accuracy = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='accuracy')
    precision = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='precision')
    recall = cross_val_score(model, X_train, y_train, cv=skf, scoring='recall')
    f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1')
    roc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')

    eval = {name: {} for n in range(5)}
    eval[name]['Accuracy'] = accuracy.mean()
    eval[name]['Precision'] = precision.mean()
    eval[name]['Recall'] = recall.mean()
    eval[name]['F1 Score'] = f1.mean()
    eval[name]['ROC AUC'] = roc.mean()
    return pd.DataFrame(eval).T

In [14]:
def eval(predictions):
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    auc = roc_auc_score(y_test, preds)
    print("Accuracy: %.4f" % acc)
    print("Precision: %.4f" % prec)
    print("Recall: %.4f" % rec)
    print("F1: %.4f" % f1)
    print("AUC: %.4f" % auc)

In [15]:
overview = pd.DataFrame({
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'ROC AUC': []
})

### Logistic Regression


In [16]:
params = {
    'max_iter': [50, 70, 100, 200, 500, 1000, 2000, 5000]
}

In [17]:
clf_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [18]:
clf_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [19]:
clf_lr.best_estimator_

In [20]:
pd.DataFrame(clf_lr.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.029562,0.002627,0.002552,0.00046,50,{'max_iter': 50},0.368421,0.451613,0.5,0.307692,0.432432,0.412032,0.067097,1
1,0.037332,0.002916,0.003302,0.001633,70,{'max_iter': 70},0.378378,0.4,0.411765,0.296296,0.4,0.377288,0.041907,8
2,0.039142,0.002802,0.002629,0.000514,100,{'max_iter': 100},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
3,0.043211,0.002803,0.002821,0.000222,200,{'max_iter': 200},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
4,0.038318,0.003762,0.003104,0.000375,500,{'max_iter': 500},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
5,0.043075,0.003396,0.0025,0.000775,1000,{'max_iter': 1000},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
6,0.042208,0.003928,0.002402,0.000376,2000,{'max_iter': 2000},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
7,0.046409,0.002828,0.00304,0.000708,5000,{'max_iter': 5000},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2


In [21]:
print(pd.DataFrame(clf_lr.cv_results_)[['param_max_iter', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{lrr}
\toprule
param_max_iter & mean_test_score & rank_test_score \\
\midrule
50 & 0.412032 & 1 \\
70 & 0.377288 & 8 \\
100 & 0.379390 & 2 \\
200 & 0.379390 & 2 \\
500 & 0.379390 & 2 \\
1000 & 0.379390 & 2 \\
2000 & 0.379390 & 2 \\
5000 & 0.379390 & 2 \\
\bottomrule
\end{tabular}



In [22]:
# scores = cross_val_score(clf_lr.best_estimator_, X_train, y_train, cv=skf, scoring='roc_auc')
assess_cv(clf_lr.best_estimator_, 'Logistic Regression')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Logistic Regression,0.899096,0.412032,0.461903,0.801161,0.388889


In [23]:
overview = pd.concat([overview, assess_cv(
    clf_lr.best_estimator_, 'Logistic Regression')])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### SVM


In [24]:
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

In [25]:
clf_svc = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [26]:
clf_svc.fit(X_train, y_train)

In [27]:
clf_svc.best_estimator_

In [28]:
pd.DataFrame(clf_svc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.041382,0.004314,0.00483,0.000524,scale,linear,"{'gamma': 'scale', 'kernel': 'linear'}",0.410256,0.432432,0.444444,0.294118,0.444444,0.405139,0.056898,1
1,0.025544,0.000715,0.006635,0.000404,scale,poly,"{'gamma': 'scale', 'kernel': 'poly'}",0.086957,0.0,0.076923,0.0,0.090909,0.050958,0.041856,4
2,0.027721,0.001388,0.015822,0.001092,scale,rbf,"{'gamma': 'scale', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
3,0.0231,0.002383,0.005954,0.000392,scale,sigmoid,"{'gamma': 'scale', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,0.044046,0.003508,0.005616,0.000597,auto,linear,"{'gamma': 'auto', 'kernel': 'linear'}",0.410256,0.432432,0.444444,0.294118,0.444444,0.405139,0.056898,1
5,0.02752,0.002985,0.00762,0.001237,auto,poly,"{'gamma': 'auto', 'kernel': 'poly'}",0.086957,0.0,0.083333,0.0,0.090909,0.05224,0.042721,3
6,0.029011,0.002638,0.015634,0.000631,auto,rbf,"{'gamma': 'auto', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
7,0.023708,0.000715,0.006317,0.001036,auto,sigmoid,"{'gamma': 'auto', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [29]:
print(pd.DataFrame(clf_svc.cv_results_)[['param_gamma', 'param_kernel', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{llrr}
\toprule
param_gamma & param_kernel & mean_test_score & rank_test_score \\
\midrule
scale & linear & 0.405139 & 1 \\
scale & poly & 0.050958 & 4 \\
scale & rbf & 0.000000 & 5 \\
scale & sigmoid & 0.000000 & 5 \\
auto & linear & 0.405139 & 1 \\
auto & poly & 0.052240 & 3 \\
auto & rbf & 0.000000 & 5 \\
auto & sigmoid & 0.000000 & 5 \\
\bottomrule
\end{tabular}



In [30]:
# scores = cross_val_score(clf_svc.best_estimator_, X_train, y_train, cv=skf, scoring='precision')
assess_cv(clf_svc.best_estimator_, 'SVM')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
SVM,0.888781,0.405139,0.400679,0.770827,0.411111


In [31]:
overview = pd.concat([overview, assess_cv(clf_svc.best_estimator_, 'SVM')])

### Naive Bayes


In [32]:
params = {
    'alpha': [1.0, 2.0, 5.0, 7, 10],
    'fit_prior': [True, False],
}

In [33]:
clf_nb = GridSearchCV(
    estimator=BernoulliNB(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [34]:
clf_nb.fit(X_train, y_train)

In [35]:
clf_nb.best_estimator_

In [36]:
pd.DataFrame(clf_nb.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_fit_prior,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008491,0.002801,0.003939,0.000854,1.0,True,"{'alpha': 1.0, 'fit_prior': True}",0.25,0.339623,0.309859,0.350877,0.278481,0.305768,0.037556,3
1,0.00763,0.00161,0.003827,0.00025,1.0,False,"{'alpha': 1.0, 'fit_prior': False}",0.263158,0.354839,0.285714,0.323529,0.318182,0.309084,0.03175,2
2,0.008156,0.001249,0.002926,0.000519,2.0,True,"{'alpha': 2.0, 'fit_prior': True}",0.257143,0.333333,0.333333,0.344828,0.285714,0.31087,0.03371,1
3,0.006969,0.001312,0.003719,0.001333,2.0,False,"{'alpha': 2.0, 'fit_prior': False}",0.24,0.338983,0.297297,0.322581,0.285714,0.296915,0.034039,4
4,0.007054,0.00108,0.002814,0.000411,5.0,True,"{'alpha': 5.0, 'fit_prior': True}",0.241379,0.25,0.305085,0.226415,0.297297,0.264035,0.031359,6
5,0.006387,0.00127,0.003416,0.000399,5.0,False,"{'alpha': 5.0, 'fit_prior': False}",0.208955,0.296296,0.307692,0.237288,0.311688,0.272384,0.041517,5
6,0.006721,0.000512,0.003512,0.000317,7.0,True,"{'alpha': 7, 'fit_prior': True}",0.222222,0.232558,0.301887,0.244898,0.307692,0.261851,0.035834,7
7,0.007413,0.001455,0.002917,0.000202,7.0,False,"{'alpha': 7, 'fit_prior': False}",0.20339,0.244898,0.295082,0.222222,0.28169,0.249456,0.034656,8
8,0.006926,0.000769,0.003002,0.000315,10.0,True,"{'alpha': 10, 'fit_prior': True}",0.222222,0.205128,0.25,0.255319,0.310345,0.248603,0.035903,10
9,0.006806,0.001116,0.002932,0.000518,10.0,False,"{'alpha': 10, 'fit_prior': False}",0.188679,0.238095,0.269231,0.25,0.3,0.249201,0.036793,9


In [37]:
print(pd.DataFrame(clf_nb.cv_results_)[['param_alpha', 'param_fit_prior', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{llrr}
\toprule
param_alpha & param_fit_prior & mean_test_score & rank_test_score \\
\midrule
1.000000 & True & 0.305768 & 3 \\
1.000000 & False & 0.309084 & 2 \\
2.000000 & True & 0.310870 & 1 \\
2.000000 & False & 0.296915 & 4 \\
5.000000 & True & 0.264035 & 6 \\
5.000000 & False & 0.272384 & 5 \\
7 & True & 0.261851 & 7 \\
7 & False & 0.249456 & 8 \\
10 & True & 0.248603 & 10 \\
10 & False & 0.249201 & 9 \\
\bottomrule
\end{tabular}



In [38]:
# scores = cross_val_score(clf_nb, X_train, y_train, cv=skf, scoring='f1')
assess_cv(clf_nb.best_estimator_, 'Naive Bayes')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Naive Bayes,0.768316,0.31087,0.217737,0.760116,0.555556


In [39]:
tt = pd.concat([assess_cv(clf_nb.best_estimator_, 'Bernoulli NB'), assess_cv(GaussianNB(), 'Gaussian NB')])

In [40]:
print(tt.to_latex())

\begin{tabular}{lrrrrr}
\toprule
 & Accuracy & F1 Score & Precision & ROC AUC & Recall \\
\midrule
Bernoulli NB & 0.768316 & 0.310870 & 0.217737 & 0.760116 & 0.555556 \\
Gaussian NB & 0.244092 & 0.176973 & 0.098413 & 0.530368 & 0.877778 \\
\bottomrule
\end{tabular}



In [41]:
tt

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Bernoulli NB,0.768316,0.31087,0.217737,0.760116,0.555556
Gaussian NB,0.244092,0.176973,0.098413,0.530368,0.877778


In [42]:
overview = pd.concat([overview, assess_cv(clf_nb, 'Naive Bayes')])

### Decision Tree


In [43]:
params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10]
}

In [44]:
clf_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [45]:

clf_dt.fit(X_train, y_train)

In [46]:
clf_dt.best_estimator_

In [47]:
pd.DataFrame(clf_dt.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_min_samples_split,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.077695,0.009318,0.002126,0.00019,gini,2,best,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.285714,0.473684,0.418605,0.5,0.473684,0.430337,0.077025,2
1,0.02247,0.003412,0.002826,0.001183,gini,2,random,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.341463,0.341463,0.458333,0.451613,0.372093,0.392993,0.051871,10
2,0.095666,0.017798,0.003165,0.001054,gini,5,best,"{'criterion': 'gini', 'min_samples_split': 5, ...",0.35,0.473684,0.243902,0.411765,0.5,0.39587,0.092031,9
3,0.01625,0.001504,0.002585,0.000555,gini,5,random,"{'criterion': 'gini', 'min_samples_split': 5, ...",0.307692,0.342857,0.45,0.277778,0.25641,0.326947,0.068057,17
4,0.084888,0.021339,0.002597,0.000209,gini,10,best,"{'criterion': 'gini', 'min_samples_split': 10,...",0.35,0.470588,0.418605,0.387097,0.4,0.405258,0.039641,7
5,0.018394,0.002828,0.002974,0.0009,gini,10,random,"{'criterion': 'gini', 'min_samples_split': 10,...",0.352941,0.388889,0.6,0.388889,0.263158,0.398775,0.110616,8
6,0.063903,0.003164,0.003123,0.000353,entropy,2,best,"{'criterion': 'entropy', 'min_samples_split': ...",0.457143,0.526316,0.5,0.322581,0.358974,0.433003,0.079312,1
7,0.019913,0.006363,0.00401,0.001094,entropy,2,random,"{'criterion': 'entropy', 'min_samples_split': ...",0.421053,0.457143,0.444444,0.25641,0.285714,0.372953,0.084505,15
8,0.06114,0.004329,0.00251,0.000763,entropy,5,best,"{'criterion': 'entropy', 'min_samples_split': ...",0.388889,0.526316,0.380952,0.375,0.4,0.414231,0.056664,5
9,0.014553,0.001152,0.002362,0.000296,entropy,5,random,"{'criterion': 'entropy', 'min_samples_split': ...",0.378378,0.35,0.4,0.444444,0.378378,0.39024,0.031413,11


In [48]:
print(pd.DataFrame(clf_dt.cv_results_)[['param_criterion', 'param_min_samples_split', 'param_splitter', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{lllrr}
\toprule
param_criterion & param_min_samples_split & param_splitter & mean_test_score & rank_test_score \\
\midrule
gini & 2 & best & 0.430337 & 2 \\
gini & 2 & random & 0.392993 & 10 \\
gini & 5 & best & 0.395870 & 9 \\
gini & 5 & random & 0.326947 & 17 \\
gini & 10 & best & 0.405258 & 7 \\
gini & 10 & random & 0.398775 & 8 \\
entropy & 2 & best & 0.433003 & 1 \\
entropy & 2 & random & 0.372953 & 15 \\
entropy & 5 & best & 0.414231 & 5 \\
entropy & 5 & random & 0.390240 & 11 \\
entropy & 10 & best & 0.376366 & 13 \\
entropy & 10 & random & 0.369487 & 16 \\
log_loss & 2 & best & 0.422616 & 4 \\
log_loss & 2 & random & 0.377928 & 12 \\
log_loss & 5 & best & 0.423981 & 3 \\
log_loss & 5 & random & 0.299430 & 18 \\
log_loss & 10 & best & 0.408290 & 6 \\
log_loss & 10 & random & 0.373639 & 14 \\
\bottomrule
\end{tabular}



In [49]:
assess_cv(clf_dt.best_estimator_, 'Decision Tree')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Decision Tree,0.892884,0.416512,0.443924,0.685755,0.388889


In [50]:
overview = pd.concat([overview, assess_cv(
    clf_dt.best_estimator_, 'Decision Tree')])

### K-Nearest Neighbors


In [51]:
params = {
    'n_neighbors': [1, 2, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [52]:
clf_kn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [53]:
clf_kn.fit(X_train, y_train)

In [54]:
clf_kn.best_estimator_

In [55]:
pd.DataFrame(clf_kn.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002875,0.000371,0.049843,0.079394,auto,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
11,0.014721,0.001438,0.090652,0.012097,ball_tree,1,distance,"{'algorithm': 'ball_tree', 'n_neighbors': 1, '...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
30,0.004373,0.002139,0.012109,0.003051,brute,1,uniform,"{'algorithm': 'brute', 'n_neighbors': 1, 'weig...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
31,0.003103,0.000666,0.010889,0.001422,brute,1,distance,"{'algorithm': 'brute', 'n_neighbors': 1, 'weig...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
10,0.014819,0.00283,0.087332,0.008724,ball_tree,1,uniform,"{'algorithm': 'ball_tree', 'n_neighbors': 1, '...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
1,0.002914,0.000201,0.012271,0.001628,auto,1,distance,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
21,0.022688,0.003593,0.083085,0.005694,kd_tree,1,distance,"{'algorithm': 'kd_tree', 'n_neighbors': 1, 'we...",0.235294,0.410256,0.27027,0.307692,0.363636,0.31743,0.062955,7
20,0.021393,0.001952,0.091874,0.003654,kd_tree,1,uniform,"{'algorithm': 'kd_tree', 'n_neighbors': 1, 'we...",0.235294,0.410256,0.27027,0.307692,0.363636,0.31743,0.062955,7
33,0.002303,0.0004,0.00995,0.000707,brute,2,distance,"{'algorithm': 'brute', 'n_neighbors': 2, 'weig...",0.235294,0.368421,0.277778,0.307692,0.375,0.312837,0.053334,9
3,0.003316,0.000874,0.01413,0.001774,auto,2,distance,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.235294,0.368421,0.277778,0.307692,0.375,0.312837,0.053334,9


In [56]:
print(pd.DataFrame(clf_kn.cv_results_).sort_values(by='rank_test_score')[['param_algorithm', 'param_n_neighbors', 'param_weights', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{lllrr}
\toprule
param_algorithm & param_n_neighbors & param_weights & mean_test_score & rank_test_score \\
\midrule
auto & 1 & uniform & 0.319703 & 1 \\
ball_tree & 1 & distance & 0.319703 & 1 \\
brute & 1 & uniform & 0.319703 & 1 \\
brute & 1 & distance & 0.319703 & 1 \\
ball_tree & 1 & uniform & 0.319703 & 1 \\
auto & 1 & distance & 0.319703 & 1 \\
kd_tree & 1 & distance & 0.317430 & 7 \\
kd_tree & 1 & uniform & 0.317430 & 7 \\
brute & 2 & distance & 0.312837 & 9 \\
auto & 2 & distance & 0.312837 & 9 \\
ball_tree & 2 & distance & 0.312837 & 9 \\
kd_tree & 2 & distance & 0.312837 & 9 \\
auto & 7 & distance & 0.231143 & 13 \\
brute & 7 & distance & 0.231143 & 13 \\
kd_tree & 7 & distance & 0.231143 & 13 \\
ball_tree & 7 & distance & 0.231143 & 13 \\
auto & 5 & distance & 0.223317 & 17 \\
ball_tree & 5 & distance & 0.223317 & 17 \\
kd_tree & 5 & distance & 0.223317 & 17 \\
brute & 5 & distance & 0.223317 & 17 \\
kd_tree & 10 & distance & 0.200369 & 21 \\
ball_tree & 10 &

In [57]:
assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
K-Nearest Neighbors,0.882612,0.319703,0.364536,0.621061,0.3


In [58]:
overview = pd.concat([overview, assess_cv(
    clf_kn.best_estimator_, 'K-Nearest Neighbors')])

In [59]:
overview

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.899096,0.461903,0.388889,0.412032,0.801161
SVM,0.888781,0.400679,0.411111,0.405139,0.770827
Naive Bayes,0.761121,0.205177,0.533333,0.29414,0.75915
Decision Tree,0.893931,0.421946,0.488889,0.428247,0.664012
K-Nearest Neighbors,0.882612,0.364536,0.3,0.319703,0.621061


In [60]:
overview.T

Unnamed: 0,Logistic Regression,SVM,Naive Bayes,Decision Tree,K-Nearest Neighbors
Accuracy,0.899096,0.888781,0.761121,0.893931,0.882612
Precision,0.461903,0.400679,0.205177,0.421946,0.364536
Recall,0.388889,0.411111,0.533333,0.488889,0.3
F1 Score,0.412032,0.405139,0.29414,0.428247,0.319703
ROC AUC,0.801161,0.770827,0.75915,0.664012,0.621061


In [61]:
print(overview.to_latex())

\begin{tabular}{lrrrrr}
\toprule
 & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
Logistic Regression & 0.899096 & 0.461903 & 0.388889 & 0.412032 & 0.801161 \\
SVM & 0.888781 & 0.400679 & 0.411111 & 0.405139 & 0.770827 \\
Naive Bayes & 0.761121 & 0.205177 & 0.533333 & 0.294140 & 0.759150 \\
Decision Tree & 0.893931 & 0.421946 & 0.488889 & 0.428247 & 0.664012 \\
K-Nearest Neighbors & 0.882612 & 0.364536 & 0.300000 & 0.319703 & 0.621061 \\
\bottomrule
\end{tabular}



## Evaluation

### Logistic Regression

In [62]:
pd.DataFrame(pf.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,...,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435
0,1,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Total_Children,Days_Since_Customer,Education_Basic,Education_Graduation,Education_Master,Education_PhD,A_Marital_Status_Married_Kidhome,A_Marital_Status_Married_Teenhome,A_Marital_Status_Single_Kidhome,...,A_Marital_Status_Married_Kidhome A_Marital_Sta...,A_Marital_Status_Married_Kidhome A_Marital_Sta...,A_Marital_Status_Married_Kidhome A_Marital_Sta...,A_Marital_Status_Married_Kidhome A_Marital_Sta...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Teenhome A_Marital_Sta...,A_Marital_Status_Single_Teenhome A_Marital_Sta...,A_Marital_Status_Single_Teenhome A_Marital_Sta...,A_Marital_Status_Single_Teenhome A_Marital_Sta...,A_Marital_Status_Together_Kidhome A_Marital_St...,A_Marital_Status_Together_Kidhome A_Marital_St...,A_Marital_Status_Together_Kidhome A_Marital_St...,A_Marital_Status_Together_Teenhome A_Marital_S...,A_Marital_Status_Together_Teenhome A_Marital_S...,A_Marital_Status_Widow_Kidhome A_Marital_Statu...


In [63]:
lr = clf_lr.best_estimator_

In [64]:
coefficients = pd.DataFrame(data=lr.coef_, columns=pf.get_feature_names_out())

In [65]:
top5 = coefficients.loc[0].sort_values(ascending=False).iloc[:5]

In [66]:
pd.DataFrame(data=top5.values, index=top5.index)

Unnamed: 0,0
MntFruits A_Marital_Status_Single_Kidhome,0.96431
MntMeatProducts A_Marital_Status_Married_Teenhome,0.909317
MntWines A_Marital_Status_Together_Teenhome,0.904126
MntWines NumWebPurchases,0.85196
Education_Master A_Marital_Status_Single_Kidhome,0.77263


### SVM

In [67]:
svm = clf_svc.best_estimator_

In [68]:
coefficients = pd.DataFrame(data=svm.coef_, columns=pf.get_feature_names_out())

In [69]:
top5 = coefficients.loc[0].sort_values(ascending=False).iloc[:5]

In [70]:
pd.DataFrame(top5)

Unnamed: 0,0
MntWines NumWebPurchases,0.904773
A_Marital_Status_Married_Kidhome A_Marital_Status_Married_Teenhome,0.804857
MntFruits A_Marital_Status_Single_Kidhome,0.803837
NumWebVisitsMonth A_Marital_Status_Single_Kidhome,0.76976
Recency Days_Since_Customer,0.733147


In [71]:
print(pd.DataFrame(top5).to_latex())

\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
MntWines NumWebPurchases & 0.904773 \\
A_Marital_Status_Married_Kidhome A_Marital_Status_Married_Teenhome & 0.804857 \\
MntFruits A_Marital_Status_Single_Kidhome & 0.803837 \\
NumWebVisitsMonth A_Marital_Status_Single_Kidhome & 0.769760 \\
Recency Days_Since_Customer & 0.733147 \\
\bottomrule
\end{tabular}



In [72]:
svecs = pd.DataFrame(sc.inverse_transform(svm.support_vectors_), columns=pf.get_feature_names_out())

In [73]:
print(svecs.mean())

1                                                                         1.000000
Year_Birth                                                             1970.030303
Income                                                                37390.267677
Recency                                                                  30.606061
MntWines                                                                101.590909
                                                                          ...     
A_Marital_Status_Together_Kidhome A_Marital_Status_Widow_Kidhome          0.000000
A_Marital_Status_Together_Kidhome A_Marital_Status_Widow_Teenhome         0.000000
A_Marital_Status_Together_Teenhome A_Marital_Status_Widow_Kidhome         0.000000
A_Marital_Status_Together_Teenhome A_Marital_Status_Widow_Teenhome        0.000000
A_Marital_Status_Widow_Kidhome A_Marital_Status_Widow_Teenhome            0.015152
Length: 436, dtype: float64


In [74]:
X_train.shape

(971, 436)

In [75]:
svecs.shape

(198, 436)

In [76]:
svm.n_support_

array([131,  67])

In [77]:
198/971

0.203913491246138

### Naive Bayes

In [78]:
nb = clf_nb.best_estimator_

In [79]:
df.value_counts('Response')

Response
0    1101
1     113
Name: count, dtype: int64

In [80]:
nb.class_log_prior_

array([-0.09726884, -2.3785168 ])

In [81]:
flp = pd.DataFrame(nb.feature_log_prob_, columns=pf.get_feature_names_out())

In [92]:
pd.DataFrame(flp.loc[0].sort_values(ascending=False))

Unnamed: 0,0
Year_Birth,-0.624380
Year_Birth Recency,-0.641402
NumCatalogPurchases Days_Since_Customer,-0.650023
NumCatalogPurchases,-0.650023
Recency,-0.650023
...,...
A_Marital_Status_Married_Kidhome A_Marital_Status_Single_Kidhome,-6.092440
A_Marital_Status_Married_Kidhome A_Marital_Status_Single_Teenhome,-6.092440
A_Marital_Status_Married_Kidhome A_Marital_Status_Together_Kidhome,-6.092440
A_Marital_Status_Married_Kidhome A_Marital_Status_Together_Teenhome,-6.092440


In [85]:
print(pd.DataFrame(flp.loc[0].sort_values(ascending=False).iloc[:5]).to_latex())

\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
Year_Birth & -0.624380 \\
Year_Birth Recency & -0.641402 \\
NumCatalogPurchases Days_Since_Customer & -0.650023 \\
NumCatalogPurchases & -0.650023 \\
Recency & -0.650023 \\
\bottomrule
\end{tabular}



In [83]:
print(pd.DataFrame(flp.loc[1].sort_values(ascending=False).iloc[:5]).to_latex())

\begin{tabular}{lr}
\toprule
 & 1 \\
\midrule
NumWebVisitsMonth Days_Since_Customer & -0.148846 \\
NumCatalogPurchases & -0.239230 \\
Year_Birth NumCatalogPurchases & -0.239230 \\
NumCatalogPurchases Days_Since_Customer & -0.239230 \\
NumCatalogPurchases NumWebVisitsMonth & -0.252835 \\
\bottomrule
\end{tabular}

