In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
from sklearn.preprocessing import *
from sklearn.model_selection import *

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [None]:
df = pd.read_parquet('./parquet/preprocessed.parquet')

Drop unnecessary columns


In [None]:
# Remove the columns that are not passed on to the models.
unused_columns = ['ID', 'Dt_Customer', 'Kidhome', 'Teenhome']
df = df.drop(columns=unused_columns).reset_index(drop=True)

In [None]:
# Collect every column whose name begins with 'Marital_' and drop them together.
columns = df.columns
columns_to_drop = list(columns[columns.str.startswith('Marital_')])
df = df.drop(columns=columns_to_drop)

## Modelling


In [None]:
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.linear_model import *

In [None]:
# 'Response' is the target; every remaining column is a predictor.
y = df['Response']
X = df.drop(columns=['Response'])

# Let wide frames show up to 50 columns before pandas truncates the display.
pd.set_option('display.max_columns', 50)
X

In [None]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=12,
)

In [None]:
# Fit the polynomial feature expansion (default settings) on the training split
# only, then apply the same fitted expansion to both splits.
pf = PolynomialFeatures().fit(X_train)
X_train, X_test = pf.transform(X_train), pf.transform(X_test)

In [None]:
# Learn the scaling statistics from the training split only and reuse them
# unchanged on the test split.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Shared 5-fold stratified splitter; the fixed seed keeps the folds identical
# for every model that is tuned or assessed with it.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=12,
)

In [None]:
def assess_cv(model, name):
    """Cross-validate ``model`` on the training split and summarise its mean scores.

    The model is evaluated on the notebook-level ``X_train`` / ``y_train`` using
    the shared ``skf`` splitter. All metrics are computed in a single
    ``cross_validate`` run, so each fold is fitted once and every metric is
    scored on that same fit.

    Parameters
    ----------
    model : estimator
        Unfitted (or clonable) scikit-learn classifier to evaluate.
    name : str
        Row label used for this model in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` with the columns 'Accuracy', 'Precision',
        'Recall', 'F1 Score' and 'ROC AUC', each holding the mean over the folds.
    """
    # Column label -> scikit-learn scorer name; insertion order fixes the column order.
    scoring = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1 Score': 'f1',
        'ROC AUC': 'roc_auc',
    }
    cv_results = cross_validate(
        model, X_train, y_train, cv=skf, scoring=scoring)

    # cross_validate exposes each metric's per-fold scores under 'test_<label>'.
    summary = {label: cv_results[f'test_{label}'].mean() for label in scoring}
    return pd.DataFrame({name: summary}).T

In [None]:
# Empty results table; one row per model is appended to it further down.
metric_columns = ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC')
overview = pd.DataFrame({metric: [] for metric in metric_columns})

### Logistic Regression


In [None]:
# Logistic regression grid: only the iteration cap is searched.
params = dict(max_iter=[50, 70, 100, 200, 500, 1000, 2000, 5000])

In [None]:
# Grid search over `params` for logistic regression, ranked by F1 on the shared folds.
clf_lr = GridSearchCV(LogisticRegression(), params, scoring='f1', cv=skf)

In [None]:
clf_lr.fit(X_train, y_train)

In [None]:
clf_lr.best_estimator_

In [None]:
pd.DataFrame(clf_lr.cv_results_)

In [None]:
# LaTeX export of the logistic regression grid-search results.
lr_cv_columns = ['param_max_iter', 'mean_test_score', 'rank_test_score']
lr_cv_table = pd.DataFrame(clf_lr.cv_results_)[lr_cv_columns]
print(lr_cv_table.to_latex(index=False))

In [None]:
# Cross-validated metric summary for the best logistic regression found by the grid search.
assess_cv(clf_lr.best_estimator_, 'Logistic Regression')

In [None]:
# Append the logistic regression row to the comparison table.
lr_summary = assess_cv(clf_lr.best_estimator_, 'Logistic Regression')
overview = pd.concat([overview, lr_summary])

### SVM


In [None]:
# SVM grid: every kernel is tried with both gamma settings.
params = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)

In [None]:
# Grid search over `params` for the support vector classifier, ranked by F1 on the shared folds.
clf_svc = GridSearchCV(SVC(), params, scoring='f1', cv=skf)

In [None]:
clf_svc.fit(X_train, y_train)

In [None]:
clf_svc.best_estimator_

In [None]:
pd.DataFrame(clf_svc.cv_results_)

In [None]:
# LaTeX export of the SVM grid-search results.
svc_cv_columns = ['param_gamma', 'param_kernel', 'mean_test_score', 'rank_test_score']
svc_cv_table = pd.DataFrame(clf_svc.cv_results_)[svc_cv_columns]
print(svc_cv_table.to_latex(index=False))

In [None]:
# Cross-validated metric summary for the best SVM found by the grid search.
assess_cv(clf_svc.best_estimator_, 'SVM')

In [None]:
# Append the SVM row to the comparison table.
svm_summary = assess_cv(clf_svc.best_estimator_, 'SVM')
overview = pd.concat([overview, svm_summary])

### Naive Bayes


In [None]:
# Naive Bayes grid: smoothing strength crossed with learned vs. uniform class priors.
params = dict(
    alpha=[1.0, 2.0, 5.0, 7, 10],
    fit_prior=[True, False],
)

In [None]:
# Grid search over `params` for Bernoulli naive Bayes, ranked by F1 on the shared folds.
clf_nb = GridSearchCV(BernoulliNB(), params, scoring='f1', cv=skf)

In [None]:
clf_nb.fit(X_train, y_train)

In [None]:
clf_nb.best_estimator_

In [None]:
pd.DataFrame(clf_nb.cv_results_)

In [None]:
# LaTeX export of the naive Bayes grid-search results.
nb_cv_columns = ['param_alpha', 'param_fit_prior', 'mean_test_score', 'rank_test_score']
nb_cv_table = pd.DataFrame(clf_nb.cv_results_)[nb_cv_columns]
print(nb_cv_table.to_latex(index=False))

In [None]:
# Cross-validated metric summary for the best Bernoulli naive Bayes found by the grid search.
assess_cv(clf_nb.best_estimator_, 'Naive Bayes')

In [None]:
# Side-by-side comparison of the tuned Bernoulli model and a default Gaussian model.
bernoulli_scores = assess_cv(clf_nb.best_estimator_, 'Bernoulli NB')
gaussian_scores = assess_cv(GaussianNB(), 'Gaussian NB')
tt = pd.concat([bernoulli_scores, gaussian_scores])

In [None]:
print(tt.to_latex())

In [None]:
tt

In [None]:
# Score the tuned estimator, as is done for every other model in this notebook.
# Passing the GridSearchCV wrapper itself would re-run the whole grid search
# inside each cross-validation fold and report nested-CV scores instead.
overview = pd.concat([overview, assess_cv(clf_nb.best_estimator_, 'Naive Bayes')])

### Decision Tree


In [None]:
# Decision tree grid: split criterion, split strategy and minimum samples per split.
params = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    splitter=['best', 'random'],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Grid search over `params` for the decision tree, ranked by F1 on the shared folds.
clf_dt = GridSearchCV(DecisionTreeClassifier(), params, scoring='f1', cv=skf)

In [None]:

clf_dt.fit(X_train, y_train)

In [None]:
clf_dt.best_estimator_

In [None]:
pd.DataFrame(clf_dt.cv_results_)

In [None]:
# LaTeX export of the decision tree grid-search results.
dt_cv_columns = [
    'param_criterion',
    'param_min_samples_split',
    'param_splitter',
    'mean_test_score',
    'rank_test_score',
]
dt_cv_table = pd.DataFrame(clf_dt.cv_results_)[dt_cv_columns]
print(dt_cv_table.to_latex(index=False))

In [None]:
assess_cv(clf_dt.best_estimator_, 'Decision Tree')

In [None]:
# Append the decision tree row to the comparison table.
dt_summary = assess_cv(clf_dt.best_estimator_, 'Decision Tree')
overview = pd.concat([overview, dt_summary])

### K-Nearest Neighbors


In [None]:
# k-NN grid: neighbourhood size, vote weighting and neighbour-search algorithm.
params = dict(
    n_neighbors=[1, 2, 5, 7, 10],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

In [None]:
# Grid search over `params` for k-nearest neighbours, ranked by F1 on the shared folds.
clf_kn = GridSearchCV(KNeighborsClassifier(), params, scoring='f1', cv=skf)

In [None]:
clf_kn.fit(X_train, y_train)

In [None]:
clf_kn.best_estimator_

In [None]:
pd.DataFrame(clf_kn.cv_results_).sort_values(by='rank_test_score')

In [None]:
# LaTeX export of the k-NN grid-search results, best-ranked configurations first.
knn_cv_columns = [
    'param_algorithm',
    'param_n_neighbors',
    'param_weights',
    'mean_test_score',
    'rank_test_score',
]
knn_ranked = pd.DataFrame(clf_kn.cv_results_).sort_values(by='rank_test_score')
print(knn_ranked[knn_cv_columns].to_latex(index=False))

In [None]:
assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')

In [None]:
# Append the k-nearest neighbours row to the comparison table.
knn_summary = assess_cv(clf_kn.best_estimator_, 'K-Nearest Neighbors')
overview = pd.concat([overview, knn_summary])

In [None]:
overview

In [None]:
overview.T

In [None]:
print(overview.to_latex())

## Evaluation

### Logistic Regression

In [None]:
pd.DataFrame(pf.get_feature_names_out()).T

In [None]:
lr = clf_lr.best_estimator_

In [None]:
# Label each fitted logistic regression coefficient with its polynomial feature name.
feature_names = pf.get_feature_names_out()
coefficients = pd.DataFrame(lr.coef_, columns=feature_names)

In [None]:
# Five largest (most positive) coefficients from the first row of the table.
first_row = coefficients.loc[0]
top5 = first_row.sort_values(ascending=False).head(5)

In [94]:
pd.DataFrame(data=top5.values, index=top5.index)

Unnamed: 0,0
MntFruits A_Marital_Status_Single_Kidhome,0.845264
MntWines NumWebPurchases,0.762868
NumWebVisitsMonth A_Marital_Status_Single_Kidhome,0.70227
MntSweetProducts A_Marital_Status_Together_Kidhome,0.678876
MntMeatProducts A_Marital_Status_Married_Teenhome,0.670549


### SVM

In [None]:
svm = clf_svc.best_estimator_

In [109]:
svm.support_vectors_

array([[ 0.        , -0.02639947,  0.03496485, ..., -0.1070436 ,
        -0.09672388, -0.15919544],
       [ 0.        ,  0.15348742,  0.56192764, ..., -0.1070436 ,
        -0.09672388, -0.15919544],
       [ 0.        , -0.02639947, -0.32773975, ..., -0.1070436 ,
        -0.09672388, -0.15919544],
       ...,
       [ 0.        ,  0.33337431, -0.56311788, ..., -0.1070436 ,
        -0.09672388, -0.15919544],
       [ 0.        ,  0.78309152, -1.62281185, ..., -0.1070436 ,
        -0.09672388, -0.15919544],
       [ 0.        , -1.19566422,  3.07138484, ..., -0.1070436 ,
        -0.09672388, -0.15919544]])

In [None]:
# Label each fitted SVM coefficient with its polynomial feature name.
# NOTE(review): scikit-learn documents SVC.coef_ as available only when
# kernel='linear', while the SVM grid in this notebook also tries 'poly', 'rbf'
# and 'sigmoid' -- confirm the selected kernel before relying on this cell.
feature_names = pf.get_feature_names_out()
coefficients = pd.DataFrame(svm.coef_, columns=feature_names)

In [None]:
# Five largest (most positive) coefficients from the first row of the table.
first_row = coefficients.loc[0]
top5 = first_row.sort_values(ascending=False).head(5)

In [95]:
pd.DataFrame(top5)

Unnamed: 0,0
MntFruits A_Marital_Status_Single_Kidhome,0.845264
MntWines NumWebPurchases,0.762868
NumWebVisitsMonth A_Marital_Status_Single_Kidhome,0.70227
MntSweetProducts A_Marital_Status_Together_Kidhome,0.678876
MntMeatProducts A_Marital_Status_Married_Teenhome,0.670549


In [None]:
print(pd.DataFrame(top5).to_latex())