In [108]:
import pandas as pd
import numpy as np
import pickle as pkl

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model  import LogisticRegression, PassiveAggressiveClassifier, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score

from statsmodels.stats.contingency_tables import mcnemar
from scikit_posthocs import posthoc_nemenyi_friedman, posthoc_wilcoxon

from tqdm import tqdm

# Candidate classifier classes (not instances); they are instantiated later.
# Order matters: later cells address rows of `results` by position in this list.
models = [
    DummyClassifier,
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    SVC,
    RandomForestClassifier,
    ExtraTreesClassifier,
    GaussianNB,
    MLPClassifier,
    PassiveAggressiveClassifier,
]

# Seed reused by the train/test split, the CV splitter and NumPy's global RNG.
RANDOM_SEED: int = 42

In [2]:
# Sanity check: number of candidate classifier classes in `models`.
# NOTE(review): the list defined above has 12 entries; re-run this cell if the saved output differs.
len(models)

13

In [3]:
# Load the dataset from the local data/ directory; the `Class` column is used as the target below.
df = pd.read_csv('data/creditcard.csv')
# Preview the first rows.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Features are every column except the target; the target is the `Class` column.
X = df.drop(columns='Class')
y = df['Class']

In [5]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in the train and test parts (it does not balance the classes).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=RANDOM_SEED,
)

In [6]:
# Number of CV folds: sizes the columns of `results` and names the fold columns of the results table.
N_FOLDS: int = 5

In [7]:
# Stratified, shuffled K-fold splitter used for the model comparison.
# The fold count comes from N_FOLDS so it always matches the width of `results`
# (a hard-coded 5 would silently diverge if N_FOLDS were changed).
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [26]:
# Estimator classes that the CV loop constructs with n_jobs=-1; every other class is built with no arguments.
parallelize = [RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier]

In [32]:
# Cross-validated F1 for every candidate model.
# results[i, k] holds the F1 of models[i] on the validation part of fold k.
results = np.zeros((len(models), N_FOLDS))
# Seed NumPy's global RNG up front: the models below are created without an explicit random_state.
np.random.seed(RANDOM_SEED)

# enumerate() supplies the fold index, so no hand-maintained counter is needed.
for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    X_train_cv, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # enumerate() has no length, so pass `total` explicitly to get a real progress bar.
    progress = tqdm(enumerate(models), total=len(models), desc=f'fold {fold + 1}')
    for i, model_cls in progress:
        # Only the classes listed in `parallelize` are given n_jobs=-1.
        init_kwargs = {'n_jobs': -1} if model_cls in parallelize else {}
        # Keep the class and the fitted instance under different names.
        estimator = model_cls(**init_kwargs).fit(X_train_cv, y_train_cv)
        results[i, fold] = f1_score(y_val, estimator.predict(X_val))
    

12it [01:38,  8.21s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
12it [01:12,  6.01s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
12it [01:23,  6.94s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessin

In [34]:
from scipy.stats import friedmanchisquare

In [38]:
# Friedman test across all models: each row of `results` (one model's per-fold F1 scores) is one sample.
friedmanchisquare(*results)

FriedmanchisquareResult(statistic=53.67937853107345, pvalue=1.3504298193596668e-07)

Here we can see that at least one model is statistically significantly different from the others.

In [39]:
! pip3 install scikit-posthocs


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-posthocs
  Downloading scikit_posthocs-0.7.0-py3-none-any.whl (38 kB)
Collecting statsmodels
  Downloading statsmodels-0.13.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m00:01[0m01[0m
Collecting patsy>=0.5.2
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.8/233.8 KB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: patsy, statsmodels, scikit-posthocs
Successfully installed patsy-0.5.3 scikit-posthocs-0.7.0 statsmodels-0.13.5


In [107]:
# F1 scores: one row per model (in `models` order), one column per CV fold.
results

array([[0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.8137931 , 0.83098592, 0.7972973 , 0.83870968, 0.79452055],
       [0.11774601, 0.11551724, 0.10500808, 0.124057  , 0.11083123],
       [0.66257669, 0.68874172, 0.725     , 0.7012987 , 0.67080745],
       [0.75324675, 0.78947368, 0.78431373, 0.70857143, 0.80519481],
       [0.09756098, 0.1627907 , 0.13953488, 0.13953488, 0.04938272],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.84285714, 0.84671533, 0.82758621, 0.84768212, 0.86111111],
       [0.84507042, 0.86330935, 0.85714286, 0.85333333, 0.85314685],
       [0.21029083, 0.25192802, 0.22171946, 0.25892857, 0.23614458],
       [0.55652174, 0.54545455, 0.24528302, 0.04878049, 0.23747681],
       [0.02409639, 0.        , 0.09638554, 0.        , 0.        ]])

In [56]:
# Per-fold F1 of the four strongest models, addressed by their position in `models`:
# 1 = LinearDiscriminantAnalysis, 4 = DecisionTreeClassifier,
# -4 = ExtraTreesClassifier, -5 = RandomForestClassifier.
results[1], results[4], results[-4], results[-5]

(array([0.8137931 , 0.83098592, 0.7972973 , 0.83870968, 0.79452055]),
 array([0.75324675, 0.78947368, 0.78431373, 0.70857143, 0.80519481]),
 array([0.84507042, 0.86330935, 0.85714286, 0.85333333, 0.85314685]),
 array([0.84285714, 0.84671533, 0.82758621, 0.84768212, 0.86111111]))

Let's compare the best results.

In [57]:
# Friedman test restricted to the four best-scoring models (rows 1, 4, -4 and -5 of `results`).
friedmanchisquare(results[1], results[4], results[-4], results[-5])

FriedmanchisquareResult(statistic=13.079999999999998, pvalue=0.004466756369997713)

In [60]:
# Copy the four best rows into their own (4, n_folds) matrix, keeping the order 1, 4, -4, -5.
next_mtr = results[[1, 4, -4, -5]]

In [61]:
# Pairwise Nemenyi post-hoc test. `next_mtr` has one row per model, so it is transposed to
# one column per model and one row per fold.
# NOTE(review): presumably this is the block-by-group layout the function expects; confirm in the scikit-posthocs docs.
posthoc_nemenyi_friedman(next_mtr.T)

Unnamed: 0,0,1,2,3
0,1.0,0.872678,0.068212,0.316064
1,0.872678,1.0,0.007913,0.068212
2,0.068212,0.007913,1.0,0.872678
3,0.316064,0.068212,0.872678,1.0


In [62]:
# Show the rows fed to the post-hoc test (order: results[1], results[4], results[-4], results[-5]).
next_mtr

array([[0.8137931 , 0.83098592, 0.7972973 , 0.83870968, 0.79452055],
       [0.75324675, 0.78947368, 0.78431373, 0.70857143, 0.80519481],
       [0.84507042, 0.86330935, 0.85714286, 0.85333333, 0.85314685],
       [0.84285714, 0.84671533, 0.82758621, 0.84768212, 0.86111111]])

We can see that the models at indices -4 and -5 perform roughly equally (their pairwise Nemenyi p-value is about 0.87).

In [64]:
# Look up which classifier classes sit at positions -4 and -5 of `models`.
models[-4], models[-5]

(sklearn.ensemble._forest.ExtraTreesClassifier,
 sklearn.ensemble._forest.RandomForestClassifier)

Let's compare them on the held-out test set.

In [65]:
# Refit both finalists on the full training split.
# A fixed random_state makes the fitted forests independent of how much of NumPy's
# global random stream earlier cells have consumed, so re-running this cell alone
# reproduces the same models.
clf1 = ExtraTreesClassifier(n_jobs=-1, random_state=RANDOM_SEED).fit(X_train, y_train)
clf2 = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_SEED).fit(X_train, y_train)

In [68]:
# Class predictions of both finalists on the held-out test split.
y_pred_clf1 = clf1.predict(X_test)
y_pred_clf2 = clf2.predict(X_test)

In [69]:
# Test-set F1 of clf1 (the ExtraTreesClassifier).
f1_score(y_test, y_pred_clf1)

0.8791208791208791

In [70]:
# Test-set F1 of clf2 (the RandomForestClassifier).
f1_score(y_test, y_pred_clf2)

0.8729281767955801

In [80]:
# Per-sample correctness of each finalist on the test split.
clf1_correct = y_pred_clf1 == y_test
clf2_correct = y_pred_clf2 == y_test

# Cells of the 2x2 agreement table:
A = (clf1_correct & clf2_correct).sum()    # both models correct
B = (~clf1_correct & clf2_correct).sum()   # only clf2 correct
C = (clf1_correct & ~clf2_correct).sum()   # only clf1 correct
D = (~clf1_correct & ~clf2_correct).sum()  # both models wrong

In [84]:
# McNemar test on the 2x2 table of per-sample correctness
# (A, D: both models agree; B, C: exactly one of the two models is correct).
mcnemar([[A, B], [C, D]]).pvalue

1.0

Because the two models are not significantly different (McNemar p-value = 1.0), let's choose the one with the better F1-score.

In [86]:
# Wrap the raw score matrix in a DataFrame so it can be labelled and exported.
results_df =  pd.DataFrame(results)

In [97]:
# Label rows with the classifier class names and columns with 1-based fold numbers.
model_names = [model.__name__ for model in models]
results_df.index = model_names
results_df.columns = [f'fold_{fold_no}' for fold_no in range(1, N_FOLDS + 1)]

In [99]:
# Write the per-fold F1 table to model_cmp.csv in the current working directory.
results_df.to_csv('model_cmp.csv')

In [104]:
# Persist the train/test split so other notebooks can reuse exactly the same partition.
split_files = {
    'data/X_train.pkl': X_train,
    'data/X_test.pkl': X_test,
    'data/y_train.pkl': y_train,
    'data/y_test.pkl': y_test,
}
for path, split in split_files.items():
    with open(path, 'wb') as f:
        pkl.dump(split, f)