In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import *
from sklearn.model_selection import *

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [4]:
df = pd.read_parquet('./parquet/preprocessed.parquet')

Drop unnecessary columns


In [5]:
df = df.drop(['ID', 'Dt_Customer', 'Kidhome', 'Teenhome'],
             axis=1).reset_index(drop=True)

In [6]:
columns = df.columns
columns_to_drop = [col for col in columns if col.startswith('Marital_')]
df = df.drop(columns=columns_to_drop)

## Modelling


In [7]:
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.linear_model import *

In [8]:
X = df.drop('Response', axis=1)
y = df.Response

pd.set_option('display.max_columns', 50)
X

Unnamed: 0,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Total_Children,Days_Since_Customer,Education_Basic,Education_Graduation,Education_Master,Education_PhD,A_Marital_Status_Married_Kidhome,A_Marital_Status_Married_Teenhome,A_Marital_Status_Single_Kidhome,A_Marital_Status_Single_Teenhome,A_Marital_Status_Together_Kidhome,A_Marital_Status_Together_Teenhome,A_Marital_Status_Widow_Kidhome,A_Marital_Status_Widow_Teenhome
0,1961,57091.0,0,464,5,64,7,0,37,1,7,3,7,5,0,0,3544,False,True,False,False,0,0,0,0,0,0,0,0
1,1958,67267.0,0,134,11,59,15,2,30,1,3,2,5,2,0,1,3577,False,True,False,False,0,1,0,0,0,0,0,0
2,1967,32474.0,0,10,0,1,0,0,0,1,1,0,2,7,0,2,3401,False,True,False,False,0,0,0,0,1,1,0,0
3,1989,21474.0,0,6,16,24,11,0,34,2,3,1,2,7,0,1,3494,False,True,False,False,0,0,1,0,0,0,0,0
4,1967,44931.0,0,78,0,11,0,0,7,1,2,1,3,5,0,1,3692,False,True,False,False,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,1974,20130.0,99,0,6,3,7,6,12,1,1,0,3,8,0,0,3634,False,False,False,False,0,0,0,0,0,0,0,0
1210,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3404,False,True,False,False,0,0,0,0,0,0,0,1
1211,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3404,False,True,False,False,0,0,0,0,0,0,0,1
1212,1977,31056.0,99,5,10,13,3,8,16,1,1,0,3,8,0,1,4053,False,False,False,False,1,0,0,0,0,0,0,0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=12)

In [10]:
pf = PolynomialFeatures(interaction_only=True)
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

In [13]:
def assess_cv(model, name):
    accuracy = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='accuracy')
    precision = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='precision')
    recall = cross_val_score(model, X_train, y_train, cv=skf, scoring='recall')
    f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1')
    roc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')

    eval = {name: {} for n in range(5)}
    eval[name]['Accuracy'] = accuracy.mean()
    eval[name]['Precision'] = precision.mean()
    eval[name]['Recall'] = recall.mean()
    eval[name]['F1 Score'] = f1.mean()
    eval[name]['ROC AUC'] = roc.mean()
    return pd.DataFrame(eval).T

In [14]:
def eval(predictions):
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    auc = roc_auc_score(y_test, preds)
    print("Accuracy: %.4f" % acc)
    print("Precision: %.4f" % prec)
    print("Recall: %.4f" % rec)
    print("F1: %.4f" % f1)
    print("AUC: %.4f" % auc)

In [15]:
overview = pd.DataFrame({
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'ROC AUC': []
})

### Logistic Regression


In [16]:
params = {
    'max_iter': [50, 70, 100, 200, 500, 1000, 2000, 5000]
}

In [17]:
clf_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [18]:
clf_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [19]:
clf_lr.best_estimator_

In [20]:
pd.DataFrame(clf_lr.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.028834,0.001948,0.002399,0.00049,50,{'max_iter': 50},0.368421,0.451613,0.5,0.307692,0.432432,0.412032,0.067097,1
1,0.038751,0.003552,0.0024,0.00049,70,{'max_iter': 70},0.378378,0.4,0.411765,0.296296,0.4,0.377288,0.041907,8
2,0.039811,0.004346,0.002197,0.000501,100,{'max_iter': 100},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
3,0.037406,0.002823,0.002608,0.000378,200,{'max_iter': 200},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
4,0.037519,0.002739,0.002359,0.000373,500,{'max_iter': 500},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
5,0.039582,0.003995,0.002726,0.000509,1000,{'max_iter': 1000},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
6,0.039013,0.00544,0.002299,0.000401,2000,{'max_iter': 2000},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2
7,0.03919,0.003241,0.002203,0.000508,5000,{'max_iter': 5000},0.388889,0.4,0.411765,0.296296,0.4,0.37939,0.042172,2


In [21]:
print(pd.DataFrame(clf_lr.cv_results_)[['param_max_iter', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{lrr}
\toprule
param_max_iter & mean_test_score & rank_test_score \\
\midrule
50 & 0.412032 & 1 \\
70 & 0.377288 & 8 \\
100 & 0.379390 & 2 \\
200 & 0.379390 & 2 \\
500 & 0.379390 & 2 \\
1000 & 0.379390 & 2 \\
2000 & 0.379390 & 2 \\
5000 & 0.379390 & 2 \\
\bottomrule
\end{tabular}



In [22]:
# scores = cross_val_score(clf_lr.best_estimator_, X_train, y_train, cv=skf, scoring='roc_auc')
assess_cv(clf_lr.best_estimator_, 'Logistic Regression')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Logistic Regression,0.899096,0.412032,0.461903,0.801161,0.388889


In [23]:
overview = pd.concat([overview, assess_cv(
    clf_lr.best_estimator_, 'Logistic Regression')])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### SVM


In [24]:
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

In [25]:
clf_svc = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [26]:
clf_svc.fit(X_train, y_train)

In [27]:
clf_svc.best_estimator_

In [28]:
pd.DataFrame(clf_svc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0467,0.007026,0.005409,0.00037,scale,linear,"{'gamma': 'scale', 'kernel': 'linear'}",0.410256,0.432432,0.444444,0.294118,0.444444,0.405139,0.056898,1
1,0.028911,0.001819,0.007118,0.000594,scale,poly,"{'gamma': 'scale', 'kernel': 'poly'}",0.086957,0.0,0.076923,0.0,0.090909,0.050958,0.041856,4
2,0.031179,0.004204,0.017123,0.002511,scale,rbf,"{'gamma': 'scale', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
3,0.024945,0.003231,0.006291,0.000714,scale,sigmoid,"{'gamma': 'scale', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,0.043351,0.00948,0.005525,0.000819,auto,linear,"{'gamma': 'auto', 'kernel': 'linear'}",0.410256,0.432432,0.444444,0.294118,0.444444,0.405139,0.056898,1
5,0.024989,0.001911,0.007203,0.000255,auto,poly,"{'gamma': 'auto', 'kernel': 'poly'}",0.086957,0.0,0.083333,0.0,0.090909,0.05224,0.042721,3
6,0.025819,0.001399,0.014873,0.001015,auto,rbf,"{'gamma': 'auto', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
7,0.01891,0.001012,0.005371,0.000575,auto,sigmoid,"{'gamma': 'auto', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [29]:
print(pd.DataFrame(clf_svc.cv_results_)[['param_gamma', 'param_kernel', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{llrr}
\toprule
param_gamma & param_kernel & mean_test_score & rank_test_score \\
\midrule
scale & linear & 0.405139 & 1 \\
scale & poly & 0.050958 & 4 \\
scale & rbf & 0.000000 & 5 \\
scale & sigmoid & 0.000000 & 5 \\
auto & linear & 0.405139 & 1 \\
auto & poly & 0.052240 & 3 \\
auto & rbf & 0.000000 & 5 \\
auto & sigmoid & 0.000000 & 5 \\
\bottomrule
\end{tabular}



In [30]:
# scores = cross_val_score(clf_svc.best_estimator_, X_train, y_train, cv=skf, scoring='precision')
assess_cv(clf_svc.best_estimator_, 'SVM')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
SVM,0.888781,0.405139,0.400679,0.770827,0.411111


In [31]:
overview = pd.concat([overview, assess_cv(clf_svc.best_estimator_, 'SVM')])

### Naive Bayes


In [32]:
params = {
    'alpha': [1.0, 2.0, 5.0, 7, 10],
    'fit_prior': [True, False],
}

In [33]:
clf_nb = GridSearchCV(
    estimator=BernoulliNB(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [34]:
clf_nb.fit(X_train, y_train)

In [35]:
clf_nb.best_estimator_

In [36]:
pd.DataFrame(clf_nb.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_fit_prior,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008604,0.002252,0.00291,0.000795,1.0,True,"{'alpha': 1.0, 'fit_prior': True}",0.25,0.339623,0.309859,0.350877,0.278481,0.305768,0.037556,3
1,0.007903,0.001768,0.00332,0.000602,1.0,False,"{'alpha': 1.0, 'fit_prior': False}",0.263158,0.354839,0.285714,0.323529,0.318182,0.309084,0.03175,2
2,0.007548,0.001064,0.003622,0.000728,2.0,True,"{'alpha': 2.0, 'fit_prior': True}",0.257143,0.333333,0.333333,0.344828,0.285714,0.31087,0.03371,1
3,0.006556,0.001067,0.002811,0.000406,2.0,False,"{'alpha': 2.0, 'fit_prior': False}",0.24,0.338983,0.297297,0.322581,0.285714,0.296915,0.034039,4
4,0.00734,0.001003,0.003313,0.00041,5.0,True,"{'alpha': 5.0, 'fit_prior': True}",0.241379,0.25,0.305085,0.226415,0.297297,0.264035,0.031359,6
5,0.007785,0.000961,0.004519,0.000869,5.0,False,"{'alpha': 5.0, 'fit_prior': False}",0.208955,0.296296,0.307692,0.237288,0.311688,0.272384,0.041517,5
6,0.006016,0.00066,0.002702,0.000398,7.0,True,"{'alpha': 7, 'fit_prior': True}",0.222222,0.232558,0.301887,0.244898,0.307692,0.261851,0.035834,7
7,0.006913,0.001847,0.003438,0.000542,7.0,False,"{'alpha': 7, 'fit_prior': False}",0.20339,0.244898,0.295082,0.222222,0.28169,0.249456,0.034656,8
8,0.00671,0.001487,0.003104,0.000206,10.0,True,"{'alpha': 10, 'fit_prior': True}",0.222222,0.205128,0.25,0.255319,0.310345,0.248603,0.035903,10
9,0.005711,0.000616,0.003174,0.000214,10.0,False,"{'alpha': 10, 'fit_prior': False}",0.188679,0.238095,0.269231,0.25,0.3,0.249201,0.036793,9


In [37]:
print(pd.DataFrame(clf_nb.cv_results_)[['param_alpha', 'param_fit_prior', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{llrr}
\toprule
param_alpha & param_fit_prior & mean_test_score & rank_test_score \\
\midrule
1.000000 & True & 0.305768 & 3 \\
1.000000 & False & 0.309084 & 2 \\
2.000000 & True & 0.310870 & 1 \\
2.000000 & False & 0.296915 & 4 \\
5.000000 & True & 0.264035 & 6 \\
5.000000 & False & 0.272384 & 5 \\
7 & True & 0.261851 & 7 \\
7 & False & 0.249456 & 8 \\
10 & True & 0.248603 & 10 \\
10 & False & 0.249201 & 9 \\
\bottomrule
\end{tabular}



In [38]:
# scores = cross_val_score(clf_nb, X_train, y_train, cv=skf, scoring='f1')
assess_cv(clf_nb.best_estimator_, 'Naive Bayes')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Naive Bayes,0.768316,0.31087,0.217737,0.760116,0.555556


In [39]:
tt = pd.concat([assess_cv(clf_nb.best_estimator_, 'Bernoulli NB'), assess_cv(GaussianNB(), 'Gaussian NB')])

In [40]:
print(tt.to_latex())

\begin{tabular}{lrrrrr}
\toprule
 & Accuracy & F1 Score & Precision & ROC AUC & Recall \\
\midrule
Bernoulli NB & 0.768316 & 0.310870 & 0.217737 & 0.760116 & 0.555556 \\
Gaussian NB & 0.244092 & 0.176973 & 0.098413 & 0.530368 & 0.877778 \\
\bottomrule
\end{tabular}



In [41]:
tt

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Bernoulli NB,0.768316,0.31087,0.217737,0.760116,0.555556
Gaussian NB,0.244092,0.176973,0.098413,0.530368,0.877778


In [42]:
overview = pd.concat([overview, assess_cv(clf_nb, 'Naive Bayes')])

### Decision Tree


In [43]:
params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10]
}

In [44]:
clf_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [45]:

clf_dt.fit(X_train, y_train)

In [46]:
clf_dt.best_estimator_

In [47]:
pd.DataFrame(clf_dt.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_min_samples_split,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.089431,0.011931,0.002905,0.000501,gini,2,best,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.372093,0.5,0.292683,0.378378,0.3125,0.371131,0.07246,12
1,0.023107,0.003731,0.00274,0.000378,gini,2,random,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.4,0.352941,0.454545,0.333333,0.322581,0.37268,0.048777,11
2,0.087051,0.011699,0.003138,0.000587,gini,5,best,"{'criterion': 'gini', 'min_samples_split': 5, ...",0.35,0.484848,0.171429,0.388889,0.378378,0.354709,0.102253,17
3,0.021826,0.003036,0.003306,0.000619,gini,5,random,"{'criterion': 'gini', 'min_samples_split': 5, ...",0.410256,0.380952,0.45,0.258065,0.333333,0.366521,0.066263,14
4,0.087939,0.011521,0.002535,0.000485,gini,10,best,"{'criterion': 'gini', 'min_samples_split': 10,...",0.341463,0.411765,0.285714,0.5,0.484848,0.404758,0.082112,8
5,0.021433,0.002929,0.002895,0.000491,gini,10,random,"{'criterion': 'gini', 'min_samples_split': 10,...",0.30303,0.25,0.512821,0.344828,0.424242,0.366984,0.092534,13
6,0.094425,0.021916,0.004325,0.000884,entropy,2,best,"{'criterion': 'entropy', 'min_samples_split': ...",0.444444,0.526316,0.380952,0.375,0.342857,0.413914,0.06514,6
7,0.023802,0.004279,0.004015,0.000883,entropy,2,random,"{'criterion': 'entropy', 'min_samples_split': ...",0.318182,0.324324,0.423077,0.363636,0.30303,0.34645,0.043223,18
8,0.078825,0.001813,0.003638,0.00078,entropy,5,best,"{'criterion': 'entropy', 'min_samples_split': ...",0.4375,0.526316,0.390244,0.344828,0.4,0.419777,0.060902,5
9,0.021747,0.002811,0.003242,0.000467,entropy,5,random,"{'criterion': 'entropy', 'min_samples_split': ...",0.380952,0.410256,0.428571,0.4375,0.536585,0.438773,0.052592,2


In [48]:
print(pd.DataFrame(clf_dt.cv_results_)[['param_criterion', 'param_min_samples_split', 'param_splitter', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{lllrr}
\toprule
param_criterion & param_min_samples_split & param_splitter & mean_test_score & rank_test_score \\
\midrule
gini & 2 & best & 0.371131 & 12 \\
gini & 2 & random & 0.372680 & 11 \\
gini & 5 & best & 0.354709 & 17 \\
gini & 5 & random & 0.366521 & 14 \\
gini & 10 & best & 0.404758 & 8 \\
gini & 10 & random & 0.366984 & 13 \\
entropy & 2 & best & 0.413914 & 6 \\
entropy & 2 & random & 0.346450 & 18 \\
entropy & 5 & best & 0.419777 & 5 \\
entropy & 5 & random & 0.438773 & 2 \\
entropy & 10 & best & 0.387127 & 10 \\
entropy & 10 & random & 0.355047 & 16 \\
log_loss & 2 & best & 0.469467 & 1 \\
log_loss & 2 & random & 0.357019 & 15 \\
log_loss & 5 & best & 0.433705 & 3 \\
log_loss & 5 & random & 0.405963 & 7 \\
log_loss & 10 & best & 0.431012 & 4 \\
log_loss & 10 & random & 0.402344 & 9 \\
\bottomrule
\end{tabular}



In [49]:
assess_cv(clf_dt.best_estimator_, 'Decision Tree')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Decision Tree,0.889802,0.431554,0.445091,0.690812,0.422222


In [50]:
overview = pd.concat([overview, assess_cv(
    clf_dt.best_estimator_, 'Decision Tree')])

### K-Nearest Neighbors


In [51]:
params = {
    'n_neighbors': [1, 2, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [52]:
clf_kn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [53]:
clf_kn.fit(X_train, y_train)

In [54]:
clf_kn.best_estimator_

In [55]:
pd.DataFrame(clf_kn.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004222,0.000735,0.048089,0.074214,auto,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
11,0.016288,0.0015,0.090555,0.008335,ball_tree,1,distance,"{'algorithm': 'ball_tree', 'n_neighbors': 1, '...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
30,0.002748,0.000723,0.010769,0.001081,brute,1,uniform,"{'algorithm': 'brute', 'n_neighbors': 1, 'weig...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
31,0.003023,0.000647,0.011731,0.001422,brute,1,distance,"{'algorithm': 'brute', 'n_neighbors': 1, 'weig...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
10,0.018684,0.005375,0.100057,0.012486,ball_tree,1,uniform,"{'algorithm': 'ball_tree', 'n_neighbors': 1, '...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
1,0.002805,0.000751,0.01231,0.000282,auto,1,distance,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.235294,0.410256,0.27027,0.307692,0.375,0.319703,0.064762,1
21,0.022234,0.001816,0.088475,0.003772,kd_tree,1,distance,"{'algorithm': 'kd_tree', 'n_neighbors': 1, 'we...",0.235294,0.410256,0.27027,0.307692,0.363636,0.31743,0.062955,7
20,0.022958,0.00133,0.095305,0.003056,kd_tree,1,uniform,"{'algorithm': 'kd_tree', 'n_neighbors': 1, 'we...",0.235294,0.410256,0.27027,0.307692,0.363636,0.31743,0.062955,7
33,0.002902,0.000494,0.01179,0.001564,brute,2,distance,"{'algorithm': 'brute', 'n_neighbors': 2, 'weig...",0.235294,0.368421,0.277778,0.307692,0.375,0.312837,0.053334,9
3,0.002946,0.000672,0.011516,0.000727,auto,2,distance,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.235294,0.368421,0.277778,0.307692,0.375,0.312837,0.053334,9


In [56]:
print(pd.DataFrame(clf_kn.cv_results_).sort_values(by='rank_test_score')[['param_algorithm', 'param_n_neighbors', 'param_weights', 'mean_test_score', 'rank_test_score']].to_latex(index=False))

\begin{tabular}{lllrr}
\toprule
param_algorithm & param_n_neighbors & param_weights & mean_test_score & rank_test_score \\
\midrule
auto & 1 & uniform & 0.319703 & 1 \\
ball_tree & 1 & distance & 0.319703 & 1 \\
brute & 1 & uniform & 0.319703 & 1 \\
brute & 1 & distance & 0.319703 & 1 \\
ball_tree & 1 & uniform & 0.319703 & 1 \\
auto & 1 & distance & 0.319703 & 1 \\
kd_tree & 1 & distance & 0.317430 & 7 \\
kd_tree & 1 & uniform & 0.317430 & 7 \\
brute & 2 & distance & 0.312837 & 9 \\
auto & 2 & distance & 0.312837 & 9 \\
ball_tree & 2 & distance & 0.312837 & 9 \\
kd_tree & 2 & distance & 0.312837 & 9 \\
auto & 7 & distance & 0.231143 & 13 \\
brute & 7 & distance & 0.231143 & 13 \\
kd_tree & 7 & distance & 0.231143 & 13 \\
ball_tree & 7 & distance & 0.231143 & 13 \\
auto & 5 & distance & 0.223317 & 17 \\
ball_tree & 5 & distance & 0.223317 & 17 \\
kd_tree & 5 & distance & 0.223317 & 17 \\
brute & 5 & distance & 0.223317 & 17 \\
kd_tree & 10 & distance & 0.200369 & 21 \\
ball_tree & 10 &

In [57]:
assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
K-Nearest Neighbors,0.882612,0.319703,0.364536,0.621061,0.3


In [58]:
overview = pd.concat([overview, assess_cv(
    clf_kn.best_estimator_, 'K-Nearest Neighbors')])

In [59]:
overview

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.899096,0.461903,0.388889,0.412032,0.801161
SVM,0.888781,0.400679,0.411111,0.405139,0.770827
Naive Bayes,0.761121,0.205177,0.533333,0.29414,0.75915
Decision Tree,0.88875,0.443947,0.433333,0.432849,0.706535
K-Nearest Neighbors,0.882612,0.364536,0.3,0.319703,0.621061


In [60]:
overview.T

Unnamed: 0,Logistic Regression,SVM,Naive Bayes,Decision Tree,K-Nearest Neighbors
Accuracy,0.899096,0.888781,0.761121,0.88875,0.882612
Precision,0.461903,0.400679,0.205177,0.443947,0.364536
Recall,0.388889,0.411111,0.533333,0.433333,0.3
F1 Score,0.412032,0.405139,0.29414,0.432849,0.319703
ROC AUC,0.801161,0.770827,0.75915,0.706535,0.621061


In [61]:
print(overview.to_latex())

\begin{tabular}{lrrrrr}
\toprule
 & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
Logistic Regression & 0.899096 & 0.461903 & 0.388889 & 0.412032 & 0.801161 \\
SVM & 0.888781 & 0.400679 & 0.411111 & 0.405139 & 0.770827 \\
Naive Bayes & 0.761121 & 0.205177 & 0.533333 & 0.294140 & 0.759150 \\
Decision Tree & 0.888750 & 0.443947 & 0.433333 & 0.432849 & 0.706535 \\
K-Nearest Neighbors & 0.882612 & 0.364536 & 0.300000 & 0.319703 & 0.621061 \\
\bottomrule
\end{tabular}



## Evaluation

### Logistic Regression

In [62]:
pd.DataFrame(pf.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,...,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435
0,1,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Total_Children,Days_Since_Customer,Education_Basic,Education_Graduation,Education_Master,Education_PhD,A_Marital_Status_Married_Kidhome,A_Marital_Status_Married_Teenhome,A_Marital_Status_Single_Kidhome,...,A_Marital_Status_Married_Kidhome A_Marital_Sta...,A_Marital_Status_Married_Kidhome A_Marital_Sta...,A_Marital_Status_Married_Kidhome A_Marital_Sta...,A_Marital_Status_Married_Kidhome A_Marital_Sta...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Married_Teenhome A_Marital_St...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Kidhome A_Marital_Stat...,A_Marital_Status_Single_Teenhome A_Marital_Sta...,A_Marital_Status_Single_Teenhome A_Marital_Sta...,A_Marital_Status_Single_Teenhome A_Marital_Sta...,A_Marital_Status_Single_Teenhome A_Marital_Sta...,A_Marital_Status_Together_Kidhome A_Marital_St...,A_Marital_Status_Together_Kidhome A_Marital_St...,A_Marital_Status_Together_Kidhome A_Marital_St...,A_Marital_Status_Together_Teenhome A_Marital_S...,A_Marital_Status_Together_Teenhome A_Marital_S...,A_Marital_Status_Widow_Kidhome A_Marital_Statu...


In [63]:
lr = clf_lr.best_estimator_

In [64]:
coefficients = pd.DataFrame(data=lr.coef_, columns=pf.get_feature_names_out())

In [65]:
top5 = coefficients.loc[0].sort_values(ascending=False).iloc[:5]

In [66]:
pd.DataFrame(data=top5.values, index=top5.index)

Unnamed: 0,0
MntFruits A_Marital_Status_Single_Kidhome,0.96431
MntMeatProducts A_Marital_Status_Married_Teenhome,0.909317
MntWines A_Marital_Status_Together_Teenhome,0.904126
MntWines NumWebPurchases,0.85196
Education_Master A_Marital_Status_Single_Kidhome,0.77263


### SVM

In [77]:
svm = clf_svc.best_estimator_

In [78]:
coefficients = pd.DataFrame(data=svm.coef_, columns=pf.get_feature_names_out())

In [79]:
top5 = coefficients.loc[0].sort_values(ascending=False).iloc[:5]

In [80]:
pd.DataFrame(top5)

Unnamed: 0,0
MntWines NumWebPurchases,0.904773
A_Marital_Status_Married_Kidhome A_Marital_Status_Married_Teenhome,0.804857
MntFruits A_Marital_Status_Single_Kidhome,0.803837
NumWebVisitsMonth A_Marital_Status_Single_Kidhome,0.76976
Recency Days_Since_Customer,0.733147


In [81]:
print(pd.DataFrame(top5).to_latex())

\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
MntWines NumWebPurchases & 0.904773 \\
A_Marital_Status_Married_Kidhome A_Marital_Status_Married_Teenhome & 0.804857 \\
MntFruits A_Marital_Status_Single_Kidhome & 0.803837 \\
NumWebVisitsMonth A_Marital_Status_Single_Kidhome & 0.769760 \\
Recency Days_Since_Customer & 0.733147 \\
\bottomrule
\end{tabular}



In [82]:
svecs = pd.DataFrame(sc.inverse_transform(svm.support_vectors_), columns=pf.get_feature_names_out())

In [114]:
print(svecs.mean())

1                                                                         1.000000
Year_Birth                                                             1970.030303
Income                                                                37390.267677
Recency                                                                  30.606061
MntWines                                                                101.590909
                                                                          ...     
A_Marital_Status_Together_Kidhome A_Marital_Status_Widow_Kidhome          0.000000
A_Marital_Status_Together_Kidhome A_Marital_Status_Widow_Teenhome         0.000000
A_Marital_Status_Together_Teenhome A_Marital_Status_Widow_Kidhome         0.000000
A_Marital_Status_Together_Teenhome A_Marital_Status_Widow_Teenhome        0.000000
A_Marital_Status_Widow_Kidhome A_Marital_Status_Widow_Teenhome            0.015152
Length: 436, dtype: float64


In [122]:
X_train.shape

(971, 436)

In [126]:
svecs.shape

(198, 436)

In [123]:
svm.n_support_

array([131,  67])

In [127]:
198/971

0.203913491246138

### Naive Bayes

In [130]:
nb = clf_nb.best_estimator_

In [149]:
df.value_counts('Response')

Response
0    1101
1     113
Name: count, dtype: int64

In [135]:
nb.class_log_prior_

array([-0.09726884, -2.3785168 ])

In [152]:
flp = pd.DataFrame(nb.feature_log_prob_, columns=pf.get_feature_names_out())

In [161]:
print(pd.DataFrame(flp.loc[0].sort_values(ascending=False).iloc[:5]).to_latex())

\begin{tabular}{lr}
\toprule
 & 0 \\
\midrule
Year_Birth & -0.624380 \\
Year_Birth Recency & -0.641402 \\
NumCatalogPurchases Days_Since_Customer & -0.650023 \\
NumCatalogPurchases & -0.650023 \\
Recency & -0.650023 \\
\bottomrule
\end{tabular}

