In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import *
from sklearn.model_selection import *

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [4]:
df = pd.read_parquet('./parquet/preprocessed.parquet')

Drop unnecessary columns


In [5]:
df = df.drop(['ID', 'Dt_Customer', 'Kidhome', 'Teenhome'],
             axis=1).reset_index(drop=True)

In [6]:
columns = df.columns
columns_to_drop = [col for col in columns if col.startswith('Marital_')]
df = df.drop(columns=columns_to_drop)

## Modelling


In [7]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [8]:
X = df.drop('Response', axis=1)
y = df.Response

pd.set_option('display.max_columns', 50)
X

Unnamed: 0,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Total_Children,Days_Since_Customer,Education_Basic,Education_Graduation,Education_Master,Education_PhD,A_Marital_Status_Married_Kidhome,A_Marital_Status_Married_Teenhome,A_Marital_Status_Single_Kidhome,A_Marital_Status_Single_Teenhome,A_Marital_Status_Together_Kidhome,A_Marital_Status_Together_Teenhome,A_Marital_Status_Widow_Kidhome,A_Marital_Status_Widow_Teenhome
0,1961,57091.0,0,464,5,64,7,0,37,1,7,3,7,5,0,0,3542,False,True,False,False,0,0,0,0,0,0,0,0
1,1958,67267.0,0,134,11,59,15,2,30,1,3,2,5,2,0,1,3575,False,True,False,False,0,1,0,0,0,0,0,0
2,1967,32474.0,0,10,0,1,0,0,0,1,1,0,2,7,0,2,3399,False,True,False,False,0,0,0,0,1,1,0,0
3,1989,21474.0,0,6,16,24,11,0,34,2,3,1,2,7,0,1,3492,False,True,False,False,0,0,1,0,0,0,0,0
4,1967,44931.0,0,78,0,11,0,0,7,1,2,1,3,5,0,1,3690,False,True,False,False,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,1974,20130.0,99,0,6,3,7,6,12,1,1,0,3,8,0,0,3632,False,False,False,False,0,0,0,0,0,0,0,0
1210,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3402,False,True,False,False,0,0,0,0,0,0,0,1
1211,1973,42429.0,99,55,0,6,2,0,4,2,1,1,3,5,0,1,3402,False,True,False,False,0,0,0,0,0,0,0,1
1212,1977,31056.0,99,5,10,13,3,8,16,1,1,0,3,8,0,1,4051,False,False,False,False,1,0,0,0,0,0,0,0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=12)

In [10]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [11]:
pf = PolynomialFeatures()
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

In [13]:
def assess_cv(model, name):
    accuracy = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='accuracy')
    precision = cross_val_score(
        model, X_train, y_train, cv=skf, scoring='precision')
    recall = cross_val_score(model, X_train, y_train, cv=skf, scoring='recall')
    f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1')
    roc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')

    eval = {name: {} for n in range(5)}
    eval[name]['Accuracy'] = accuracy.mean()
    eval[name]['Precision'] = precision.mean()
    eval[name]['Recall'] = recall.mean()
    eval[name]['F1 Score'] = f1.mean()
    eval[name]['ROC AUC'] = roc.mean()
    return pd.DataFrame(eval).T

In [14]:
overview = pd.DataFrame({
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': [],
    'ROC AUC': []
})

### Logistic Regression


In [15]:
params = {
    'max_iter': [100, 200, 500, 1000]
}

In [16]:
clf_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [17]:
clf_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
clf_lr.best_estimator_

In [19]:
pd.DataFrame(clf_lr.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.113347,0.026037,0.002773,0.000765,100,{'max_iter': 100},0.439024,0.432432,0.411765,0.384615,0.4,0.413567,0.020147,1
1,0.089486,0.007531,0.002501,0.000446,200,{'max_iter': 200},0.439024,0.432432,0.411765,0.384615,0.4,0.413567,0.020147,1
2,0.081175,0.003571,0.001797,0.000399,500,{'max_iter': 500},0.439024,0.432432,0.411765,0.384615,0.4,0.413567,0.020147,1
3,0.086799,0.011396,0.002242,0.000471,1000,{'max_iter': 1000},0.439024,0.432432,0.411765,0.384615,0.4,0.413567,0.020147,1


In [20]:
# scores = cross_val_score(clf_lr.best_estimator_, X_train, y_train, cv=skf, scoring='roc_auc')
assess_cv(clf_lr.best_estimator_, 'Logistic Regression')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Logistic Regression,0.899091,0.413567,0.474971,0.771246,0.388889


In [21]:
overview = pd.concat([overview, assess_cv(
    clf_lr.best_estimator_, 'Logistic Regression')])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### SVM


In [22]:
params = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

In [23]:
clf_svc = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [24]:
clf_svc.fit(X_train, y_train)

In [25]:
clf_svc.best_estimator_

In [26]:
pd.DataFrame(clf_svc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.036921,0.012915,0.006748,0.000329,scale,poly,"{'gamma': 'scale', 'kernel': 'poly'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,0.031393,0.002672,0.014288,0.000301,scale,rbf,"{'gamma': 'scale', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.021961,0.002675,0.00468,0.000339,scale,sigmoid,"{'gamma': 'scale', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.032944,0.002184,0.006841,0.000435,auto,poly,"{'gamma': 'auto', 'kernel': 'poly'}",0.153846,0.0,0.083333,0.0,0.166667,0.080769,0.071795,1
4,0.036198,0.002137,0.01668,0.00049,auto,rbf,"{'gamma': 'auto', 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
5,0.015983,0.000498,0.004667,0.000512,auto,sigmoid,"{'gamma': 'auto', 'kernel': 'sigmoid'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [27]:
# scores = cross_val_score(clf_svc.best_estimator_, X_train, y_train, cv=skf, scoring='precision')
assess_cv(clf_svc.best_estimator_, 'SVM')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
SVM,0.8929,0.080769,0.15,0.730092,0.055556


In [28]:
overview = pd.concat([overview, assess_cv(clf_svc.best_estimator_, 'SVM')])

### Naive Bayes


In [29]:
clf_nb = GaussianNB()

In [30]:
# scores = cross_val_score(clf_nb, X_train, y_train, cv=skf, scoring='f1')
assess_cv(clf_nb, 'Naive Bayes')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Naive Bayes,0.29456,0.171189,0.096033,0.536592,0.788889


In [31]:
overview = pd.concat([overview, assess_cv(clf_nb, 'Naive Bayes')])

### Decision Tree


In [32]:
params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random']
}

In [33]:
clf_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [34]:

clf_dt.fit(X_train, y_train)

In [35]:
clf_dt.best_estimator_

In [36]:
pd.DataFrame(clf_dt.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.178863,0.042877,0.002013,1.6e-05,gini,best,"{'criterion': 'gini', 'splitter': 'best'}",0.388889,0.45,0.418605,0.137931,0.344828,0.34805,0.110634,4
1,0.02257,0.00169,0.001758,0.000456,gini,random,"{'criterion': 'gini', 'splitter': 'random'}",0.352941,0.326531,0.27027,0.388889,0.4,0.347726,0.046696,5
2,0.099031,0.00419,0.002016,2e-05,entropy,best,"{'criterion': 'entropy', 'splitter': 'best'}",0.324324,0.486486,0.368421,0.5625,0.263158,0.400978,0.108932,2
3,0.01483,0.000915,0.001928,0.000496,entropy,random,"{'criterion': 'entropy', 'splitter': 'random'}",0.4,0.421053,0.315789,0.470588,0.258065,0.373099,0.076216,3
4,0.095657,0.005496,0.001716,0.000395,log_loss,best,"{'criterion': 'log_loss', 'splitter': 'best'}",0.3,0.473684,0.368421,0.588235,0.333333,0.412735,0.10535,1
5,0.015892,0.000883,0.001959,0.000515,log_loss,random,"{'criterion': 'log_loss', 'splitter': 'random'}",0.421053,0.358974,0.341463,0.235294,0.344828,0.340322,0.059863,6


In [37]:
assess_cv(clf_dt.best_estimator_, 'Decision Tree')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
Decision Tree,0.884679,0.369142,0.360985,0.6859,0.411111


In [38]:
overview = pd.concat([overview, assess_cv(
    clf_dt.best_estimator_, 'Decision Tree')])

### K-Nearest Neighbors


In [39]:
params = {
    'n_neighbors': [1, 2, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [40]:
clf_kn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring='f1',
    cv=skf
)

In [41]:
clf_kn.fit(X_train, y_train)

In [42]:
clf_kn.best_estimator_

In [43]:
pd.DataFrame(clf_kn.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002117,0.000805,0.073203,0.125311,auto,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.277778,0.352941,0.181818,0.296296,0.375,0.296767,0.067602,5
1,0.002103,0.0002,0.01033,0.000594,auto,1,distance,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.277778,0.352941,0.181818,0.296296,0.375,0.296767,0.067602,5
2,0.002004,4e-06,0.009863,0.00095,auto,2,uniform,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.095238,0.1,0.095238,0.1,0.272727,0.132641,0.070076,25
3,0.001864,0.00045,0.010458,0.001294,auto,2,distance,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.277778,0.352941,0.1875,0.296296,0.375,0.297903,0.06568,1
4,0.001939,0.00043,0.009971,0.001132,auto,5,uniform,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.0,0.090909,0.0,0.105263,0.2,0.079234,0.074772,29
5,0.002105,0.0002,0.010973,0.000618,auto,5,distance,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.230769,0.296296,0.1,0.190476,0.363636,0.236236,0.090025,13
6,0.002005,6e-06,0.00994,0.000873,auto,7,uniform,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,33
7,0.002007,7e-06,0.010935,0.001291,auto,7,distance,"{'algorithm': 'auto', 'n_neighbors': 7, 'weigh...",0.25,0.25,0.105263,0.1,0.2,0.181053,0.066603,21
8,0.002209,0.00099,0.010847,0.001069,auto,10,uniform,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,33
9,0.001802,0.000422,0.010972,0.001209,auto,10,distance,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.25,0.333333,0.105263,0.1,0.2,0.197719,0.088574,17


In [44]:
assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')

Unnamed: 0,Accuracy,F1 Score,Precision,ROC AUC,Recall
K-Nearest Neighbors,0.883643,0.297903,0.348016,0.6289,0.266667


In [45]:
overview = pd.concat([overview, assess_cv(
    clf_kn.best_estimator_, 'K-Nearest Neighbors')])

In [46]:
overview

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.899091,0.474971,0.388889,0.413567,0.771246
SVM,0.8929,0.15,0.055556,0.080769,0.730092
Naive Bayes,0.29456,0.096033,0.788889,0.171189,0.536592
Decision Tree,0.87954,0.377647,0.377778,0.424394,0.675875
K-Nearest Neighbors,0.883643,0.348016,0.266667,0.297903,0.6289
