In [1]:
import pandas as pd
import numpy as np
import pickle as pkl

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model  import LogisticRegression, PassiveAggressiveClassifier, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score

from statsmodels.stats.contingency_tables import mcnemar
from scikit_posthocs import posthoc_nemenyi_friedman, posthoc_wilcoxon

from tqdm import tqdm

models = [DummyClassifier, LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis, LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier, SVC, RandomForestClassifier, ExtraTreesClassifier, 
          GaussianNB, MLPClassifier, PassiveAggressiveClassifier]

RANDOM_SEED: int = 42

# Dataset

First we have to download dataset from Kaggle: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

The dataset contains transactions made by credit cards in September 2013 by European cardholders. 
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

Due to the fact we won't perform feature engineering within this project we don't care about understanding of feature nature. For these reasons anomymised (PCA transformed) features are totally fine to test our classification pipeline for tunning and choosing the best classification model.

In [2]:
df = pd.read_csv('data/winequality-red.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [12]:
X, y = df.drop('quality', axis=1), df.quality > 5

# Model evaluation

Lets define the common classification models from SK-Learn and split our dataset to train/val and test parts.

Train/val subset will be used for hyperparameter optmimization and test set for final evaluation.

Due to the fact we have unbalanced dataset, we want to apply stratification, so positive labels will be presented in both subsets in equal proportion.

Hyperparameter optimization is performed using cross-validation to have more robust evaluations.

In [13]:
print(f"Number of models: {len(models)}")
models

Number of models: 12


[sklearn.dummy.DummyClassifier,
 sklearn.discriminant_analysis.LinearDiscriminantAnalysis,
 sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis,
 sklearn.linear_model._logistic.LogisticRegression,
 sklearn.tree._classes.DecisionTreeClassifier,
 sklearn.neighbors._classification.KNeighborsClassifier,
 sklearn.svm._classes.SVC,
 sklearn.ensemble._forest.RandomForestClassifier,
 sklearn.ensemble._forest.ExtraTreesClassifier,
 sklearn.naive_bayes.GaussianNB,
 sklearn.neural_network._multilayer_perceptron.MLPClassifier,
 sklearn.linear_model._passive_aggressive.PassiveAggressiveClassifier]

In [14]:
# Stratification for balanced test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED, test_size=.2, stratify=y)

In [15]:
N_FOLDS: int = 5

In [16]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [17]:
parallelize = [RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier]

In [None]:
results = np.zeros((len(models), N_FOLDS))
cur_fold = 0
np.random.seed(RANDOM_SEED)

for train_index, test_index in skf.split(X_train, y_train):
    X_train_cv, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
    
    for i, clf in tqdm(enumerate(models)):
        if clf not in parallelize:
            clf = clf().fit(X_train_cv, y_train_cv)
        else:
             clf = clf(n_jobs=-1).fit(X_train_cv, y_train_cv)
        score = f1_score(y_val, clf.predict(X_val))
        results[i, cur_fold] = score
    cur_fold += 1
    

# Statistical testing



In [19]:
from scipy.stats import friedmanchisquare

In [20]:
result = friedmanchisquare(*results.tolist())
print(result)

FriedmanchisquareResult(statistic=44.96923076923079, pvalue=4.913003380500927e-06)


*   H0: both models have the same performance
*   H1: performances of the two models are not equal

In [21]:
p = result.pvalue
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

Different distributions (reject H0)


Here we can see that at least 1 model is statistically significantly different that all others

In [None]:
! pip3 install scikit-posthocs

In [25]:
results.mean(1)

array([0.53846785, 0.75842965, 0.74635644, 0.75351067, 0.75196464,
       0.68163869, 0.71190214, 0.80943059, 0.81464809, 0.72994333,
       0.74758059, 0.6983464 ])

In [28]:
results[7, :], results[8, :], results[1, :], results[3, :], results[4, :]

(array([0.82706767, 0.78228782, 0.79699248, 0.81850534, 0.82229965]),
 array([0.85818182, 0.76978417, 0.8       , 0.8172043 , 0.82807018]),
 array([0.78490566, 0.75655431, 0.75812274, 0.74074074, 0.75182482]),
 array([0.79104478, 0.75636364, 0.72857143, 0.73880597, 0.75276753]),
 array([0.79411765, 0.75601375, 0.69402985, 0.76388889, 0.75177305]))

Let's compare best results

In [31]:
result = friedmanchisquare(results[7, :], results[8, :], results[1, :], results[3, :], results[4, :])
result

FriedmanchisquareResult(statistic=15.200000000000003, pvalue=0.004303882327589247)

In [32]:
p = result.pvalue
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

Different distributions (reject H0)


In [33]:
next_mtr = np.array([results[7, :], results[8, :], results[1, :], results[3, :], results[4, :]])

In [34]:
posthoc_nemenyi_friedman(next_mtr.T)

Unnamed: 0,0,1,2,3,4
0,1.0,0.9,0.179466,0.1154,0.070476
1,0.9,1.0,0.1154,0.070476,0.040902
2,0.179466,0.1154,1.0,0.9,0.9
3,0.1154,0.070476,0.9,1.0,0.9
4,0.070476,0.040902,0.9,0.9,1.0


In [35]:
next_mtr

array([[0.82706767, 0.78228782, 0.79699248, 0.81850534, 0.82229965],
       [0.85818182, 0.76978417, 0.8       , 0.8172043 , 0.82807018],
       [0.78490566, 0.75655431, 0.75812274, 0.74074074, 0.75182482],
       [0.79104478, 0.75636364, 0.72857143, 0.73880597, 0.75276753],
       [0.79411765, 0.75601375, 0.69402985, 0.76388889, 0.75177305]])

We can see that samples 7 and 8 are rougly equal

In [36]:
models[7], models[8]

(sklearn.ensemble._forest.RandomForestClassifier,
 sklearn.ensemble._forest.ExtraTreesClassifier)

Let's compare them on a test set

In [37]:
clf1 = ExtraTreesClassifier(n_jobs=-1).fit(X_train, y_train)
clf2 = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)

In [38]:
y_pred_clf1 = clf1.predict(X_test)
y_pred_clf2 = clf2.predict(X_test)

In [39]:
f1_score(y_test, y_pred_clf1)

0.8150289017341041

In [40]:
f1_score(y_test, y_pred_clf2)

0.8173913043478261

In [41]:
A = ((y_pred_clf1 == y_test) & (y_pred_clf2 == y_test)).sum()
B = ((y_pred_clf1 != y_test) & (y_pred_clf2 == y_test)).sum()
C = ((y_pred_clf1 == y_test) & (y_pred_clf2 != y_test)).sum()
D = ((y_pred_clf1 != y_test) & (y_pred_clf2 != y_test)).sum()

In [42]:
B + C

19

In [43]:
mcnemar([[A, B], [C, D]]).pvalue

1.0

Because models are not so different, let's choose the one with the best F1-score. 

In [45]:
results_df = pd.DataFrame(results)

In [46]:
model_names = [x.__name__ for x in models]
results_df.index = model_names
results_df.columns = [f'fold_{i+1}' for i in range(N_FOLDS)]
results_df.to_csv('model_cmp_wine.csv')