In [251]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [252]:
from sklearn.preprocessing import *
from sklearn.model_selection import *

In [253]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [254]:
df = pd.read_parquet('./parquet/preprocessed.parquet')

Drop unnecessary columns


In [255]:
df = df.drop(['ID', 'Dt_Customer', 'Kidhome', 'Teenhome'],
             axis=1).reset_index(drop=True)

In [256]:
columns = df.columns
columns_to_drop = [col for col in columns if col.startswith('Marital_')]
df = df.drop(columns=columns_to_drop)

## Modelling


In [257]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [258]:
X = df.drop('Response', axis=1)
y = df.Response

pd.set_option('display.max_columns', 50)
X

Unnamed: 0,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Total_Children,Days_Since_Customer,Education_Basic,Education_Graduation,Education_Master,Education_PhD,A_Marital_Status_Married_Kidhome,A_Marital_Status_Married_Teenhome,A_Marital_Status_Single_Kidhome,A_Marital_Status_Single_Teenhome,A_Marital_Status_Together_Kidhome,A_Marital_Status_Together_Teenhome,A_Marital_Status_Widow_Kidhome,A_Marital_Status_Widow_Teenhome
0,1961,57091.0,0,464,5,64,7,0,37,1,7,3,7,5,0,0,3542,False,True,False,False,0,0,0,0,0,0,0,0
1,1958,67267.0,0,134,11,59,15,2,30,1,3,2,5,2,0,1,3575,False,True,False,False,0,1,0,0,0,0,0,0
2,1967,32474.0,0,10,0,1,0,0,0,1,1,0,2,7,0,2,3399,False,True,False,False,0,0,0,0,1,1,0,0
3,1989,21474.0,0,6,16,24,11,0,34,2,3,1,2,7,0,1,3492,False,True,False,False,0,0,1,0,0,0,0,0
4,1967,44931.0,0,78,0,11,0,0,7,1,2,1,3,5,0,1,3690,False,True,False,False,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,1974,20130.0,99,0,6,3,7,6,12,1,1,0,3,8,0,0,3632,False,False,False,False,0,0,0,0,0,0,0,0
1210,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3402,False,True,False,False,0,0,0,0,0,0,0,1
1211,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3402,False,True,False,False,0,0,0,0,0,0,0,1
1212,1977,31056.0,99,5,10,13,3,8,16,1,1,0,3,8,0,1,4051,False,False,False,False,1,0,0,0,0,0,0,0


In [259]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=12)

In [260]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [261]:
pf = PolynomialFeatures()
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

In [262]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

In [263]:
def assess_cv(model, name):
    accuracy = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='accuracy')
    precision = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='precision')
    recall = cross_val_score(model, X_train, y_train, cv=skf, scoring='recall')
    f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1')
    roc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')

    eval = {name: {} for n in range(5)}
    eval[name]['Accuracy'] = accuracy.mean()
    eval[name]['Precision'] = precision.mean()
    eval[name]['Recall'] = recall.mean()
    eval[name]['F1 Score'] = f1.mean()
    eval[name]['ROC AUC'] = roc.mean()
    return pd.DataFrame(eval).T

In [264]:
overview = pd.DataFrame({
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'ROC AUC': []
})

### Logistic Regression


In [265]:
params = {
    'max_iter': [100, 200, 500, 1000]
}

In [266]:
clf_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [267]:
clf_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [268]:
clf_lr.best_estimator_

In [269]:
pd.DataFrame(clf_lr.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.075731,0.007415,0.002397,0.000489,100,{'max_iter': 100},0.439024,0.432432,0.411765,0.384615,0.4,0.413567,0.020147,1
1,0.080148,0.004587,0.002399,0.000495,200,{'max_iter': 200},0.439024,0.432432,0.411765,0.384615,0.4,0.413567,0.020147,1
2,0.091846,0.02368,0.001999,3e-06,500,{'max_iter': 500},0.439024,0.432432,0.411765,0.384615,0.4,0.413567,0.020147,1
3,0.079039,0.002831,0.001998,4e-06,1000,{'max_iter': 1000},0.439024,0.432432,0.411765,0.384615,0.4,0.413567,0.020147,1


In [270]:
# scores = cross_val_score(clf_lr.best_estimator_, X_train, y_train, cv=skf, scoring='roc_auc')
assess_cv(clf_lr.best_estimator_, 'Logistic Regression')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Logistic Regression,0.899091,0.413567,0.474971,0.771246,0.388889


In [271]:
overview = pd.concat([overview, assess_cv(
    clf_lr.best_estimator_, 'Logistic Regression')])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### SVM


In [272]:
params = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

In [273]:
clf_svc = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [274]:
clf_svc.fit(X_train, y_train)

In [275]:
clf_svc.best_estimator_

In [276]:
pd.DataFrame(clf_svc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.032104,0.003201,0.00711,0.000477,scale,poly,"{'gamma': 'scale', 'kernel': 'poly'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,0.034833,0.001002,0.013625,0.000501,scale,rbf,"{'gamma': 'scale', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.024991,0.001359,0.004796,0.000392,scale,sigmoid,"{'gamma': 'scale', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.036982,0.001596,0.006603,0.000481,auto,poly,"{'gamma': 'auto', 'kernel': 'poly'}",0.153846,0.0,0.083333,0.0,0.166667,0.080769,0.071795,1
4,0.040093,0.008065,0.016011,2.4e-05,auto,rbf,"{'gamma': 'auto', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
5,0.018111,0.00142,0.004609,0.000492,auto,sigmoid,"{'gamma': 'auto', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [277]:
# scores = cross_val_score(clf_svc.best_estimator_, X_train, y_train, cv=skf, scoring='precision')
assess_cv(clf_svc.best_estimator_, 'SVM')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
SVM,0.8929,0.080769,0.15,0.730092,0.055556


In [278]:
overview = pd.concat([overview, assess_cv(clf_svc.best_estimator_, 'SVM')])

### Naive Bayes


In [279]:
clf_nb = GaussianNB()

In [280]:
# scores = cross_val_score(clf_nb, X_train, y_train, cv=skf, scoring='f1')
assess_cv(clf_nb, 'Naive Bayes')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Naive Bayes,0.29456,0.171189,0.096033,0.536592,0.788889


In [281]:
overview = pd.concat([overview, assess_cv(clf_nb, 'Naive Bayes')])

### Decision Tree


In [282]:
params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random']
}

In [283]:
clf_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [284]:

clf_dt.fit(X_train, y_train)

In [285]:
clf_dt.best_estimator_

In [286]:
pd.DataFrame(clf_dt.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.152597,0.029653,0.001804,0.000397,gini,best,"{'criterion': 'gini', 'splitter': 'best'}",0.411765,0.307692,0.341463,0.3125,0.285714,0.331827,0.043739,6
1,0.019131,0.00186,0.001995,1e-05,gini,random,"{'criterion': 'gini', 'splitter': 'random'}",0.390244,0.4,0.315789,0.294118,0.324324,0.344895,0.042289,5
2,0.093009,0.00443,0.001606,0.00049,entropy,best,"{'criterion': 'entropy', 'splitter': 'best'}",0.307692,0.540541,0.324324,0.526316,0.294118,0.398598,0.110595,3
3,0.014913,0.000493,0.002005,1.1e-05,entropy,random,"{'criterion': 'entropy', 'splitter': 'random'}",0.4,0.487805,0.451613,0.222222,0.242424,0.360813,0.108748,4
4,0.092581,0.003633,0.001799,0.000394,log_loss,best,"{'criterion': 'log_loss', 'splitter': 'best'}",0.307692,0.487805,0.45,0.529412,0.277778,0.410537,0.099861,1
5,0.01508,0.000493,0.001838,0.000424,log_loss,random,"{'criterion': 'log_loss', 'splitter': 'random'}",0.606061,0.390244,0.4,0.352941,0.294118,0.408673,0.105457,2


In [287]:
assess_cv(clf_dt.best_estimator_, 'Decision Tree')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Decision Tree,0.893941,0.376185,0.425072,0.640521,0.433333


In [288]:
overview = pd.concat([overview, assess_cv(
    clf_dt.best_estimator_, 'Decision Tree')])

### K-Nearest Neighbors


In [289]:
params = {
    'n_neighbors': [1, 2, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [290]:
clf_kn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [291]:
clf_kn.fit(X_train, y_train)

In [292]:
clf_kn.best_estimator_

In [293]:
pd.DataFrame(clf_kn.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002206,0.000403,0.011302,0.000873,auto,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.277778,0.352941,0.181818,0.296296,0.375,0.296767,0.067602,5
1,0.0026,0.00049,0.0126,0.001959,auto,1,distance,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.277778,0.352941,0.181818,0.296296,0.375,0.296767,0.067602,5
2,0.001601,0.000489,0.010103,0.000797,auto,2,uniform,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.095238,0.1,0.095238,0.1,0.272727,0.132641,0.070076,25
3,0.0018,0.0004,0.010707,0.000614,auto,2,distance,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.277778,0.352941,0.1875,0.296296,0.375,0.297903,0.06568,1
4,0.001408,0.000489,0.010794,0.000979,auto,5,uniform,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.0,0.090909,0.0,0.105263,0.2,0.079234,0.074772,29
5,0.002404,0.000487,0.012703,0.000742,auto,5,distance,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.230769,0.296296,0.1,0.190476,0.363636,0.236236,0.090025,13
6,0.002205,0.000745,0.010897,0.000801,auto,7,uniform,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,33
7,0.002001,3e-06,0.011803,0.001725,auto,7,distance,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.25,0.25,0.105263,0.1,0.2,0.181053,0.066603,21
8,0.001803,0.00075,0.011103,0.001116,auto,10,uniform,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,33
9,0.002003,5e-06,0.0119,0.001199,auto,10,distance,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.25,0.333333,0.105263,0.1,0.2,0.197719,0.088574,17


In [294]:
assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
K-Nearest Neighbors,0.883643,0.297903,0.348016,0.6289,0.266667


In [295]:
overview = pd.concat([overview, assess_cv(
    clf_kn.best_estimator_, 'K-Nearest Neighbors')])

In [296]:
overview

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.899091,0.474971,0.388889,0.413567,0.771246
SVM,0.8929,0.15,0.055556,0.080769,0.730092
Naive Bayes,0.29456,0.096033,0.788889,0.171189,0.536592
Decision Tree,0.88571,0.395556,0.4,0.424103,0.662937
K-Nearest Neighbors,0.883643,0.348016,0.266667,0.297903,0.6289
