In [83]:
import common_datasets.binary_classification as binclas
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

from mlscorecheck.auc import simplify_roc

from common_datasets.binary_classification import summary_pdf

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from scipy.stats import wilcoxon

In [84]:
def generate_random_classifier(random_state):
    """
    Draw a classifier class and a matching set of random hyperparameters.

    Args:
        random_state: random number source providing ``randint`` and ``rand``
            (the notebook passes a ``numpy.random.RandomState``).

    Returns:
        tuple: ``(classifier_class, params)`` - the classifier class (not an
        instance) and the keyword arguments to instantiate it with.
    """
    # The family is drawn first; only the selected branch consumes further
    # random numbers, so the sequence of draws depends on the family chosen.
    choice = random_state.randint(4)

    if choice == 0:
        chosen = RandomForestClassifier
        kwargs = {
            'max_depth': random_state.randint(2, 10),
            'random_state': 5,
        }
    elif choice == 1:
        chosen = DecisionTreeClassifier
        kwargs = {
            'max_depth': random_state.randint(2, 10),
            'random_state': 5,
        }
    elif choice == 2:
        chosen = SVC
        # probability=True is needed for predict_proba on the fitted SVC
        kwargs = {
            'probability': True,
            'C': random_state.rand()/2 + 0.001,
        }
    elif choice == 3:
        chosen = KNeighborsClassifier
        kwargs = {'n_neighbors': random_state.randint(2, 10)}

    return chosen, kwargs

In [85]:
# Data loaders of the datasets that satisfy the bounds given below.
datasets = binclas.get_filtered_data_loaders(
    n_col_bounds=(0, 50),
    n_bounds=(0, 2000),
    n_minority_bounds=(20, 1000),
    n_from_phenotypes=1,
    imbalance_ratio_bounds=(0.2, 20.0),
)

In [86]:
# Number of data loaders returned by the filter.
len(datasets)

28

In [87]:
# Names of the selected datasets, leaving out those whose name starts with
# 'led'. Each loader is invoked once per dataset (rather than once for the
# filter and once more for the value).
names = [
    name
    for name in (loader()['name'] for loader in datasets)
    if not name.startswith('led')
]

In [88]:
# Build the dataset summary table for the LaTeX export in a single chain.
# Using assign/rename on a fresh frame avoids assigning columns on frames
# obtained by column sub-selection (the pattern behind SettingWithCopyWarning).
tmp = (
    summary_pdf.loc[summary_pdf['name'].isin(names)]
    .reset_index(drop=True)
    .assign(
        # dataset name followed by its LaTeX citation command
        name_key=lambda df: [
            f'{name} \\cite{{{key}}}'
            for name, key in zip(df['name'], df['citation_key'])
        ],
        # the remaining records: total count minus the minority count
        n_majority=lambda df: df['n'] - df['n_minority'],
    )
    .loc[:, ['name_key', 'n', 'n_col', 'n_minority', 'n_majority', 'imbalance_ratio']]
    .rename(columns={
        'name_key': 'name',
        'n': 'size',
        'n_col': 'attr.',
        'n_minority': 'p',
        'n_majority': 'n',
        'imbalance_ratio': 'imb. ratio',
    })
)

In [89]:
# Number the rows from 1 for the printed table. The length is taken from the
# frame instead of being hard-coded, so the cell keeps working when the
# dataset selection changes.
tmp.index = list(range(1, len(tmp) + 1))
# Underscores in the generated LaTeX source are replaced by spaces
# (presumably to avoid unescaped '_' characters in the document).
print(tmp.to_latex(float_format="%.2f").replace('_', ' '))

\begin{tabular}{llrrrrr}
\toprule
 & name & size & attr. & p & n & imb. ratio \\
\midrule
1 & abalone9 18 \cite{keel} & 731 & 9 & 42 & 689 & 16.40 \\
2 & appendicitis \cite{keel} & 106 & 7 & 21 & 85 & 4.05 \\
3 & australian \cite{keel} & 690 & 16 & 307 & 383 & 1.25 \\
4 & bupa \cite{keel} & 345 & 6 & 145 & 200 & 1.38 \\
5 & CM1 \cite{krnn} & 498 & 21 & 49 & 449 & 9.16 \\
6 & crx \cite{keel} & 653 & 37 & 296 & 357 & 1.21 \\
7 & dermatology-6 \cite{keel} & 358 & 34 & 20 & 338 & 16.90 \\
8 & ecoli1 \cite{keel} & 336 & 7 & 77 & 259 & 3.36 \\
9 & glass0 \cite{keel} & 214 & 9 & 70 & 144 & 2.06 \\
10 & haberman \cite{keel} & 306 & 3 & 81 & 225 & 2.78 \\
11 & hepatitis \cite{krnn} & 155 & 19 & 32 & 123 & 3.84 \\
12 & ionosphere \cite{keel} & 351 & 33 & 126 & 225 & 1.79 \\
13 & iris0 \cite{keel} & 150 & 4 & 50 & 100 & 2.00 \\
14 & mammographic \cite{keel} & 830 & 5 & 403 & 427 & 1.06 \\
15 & monk-2 \cite{keel} & 432 & 6 & 204 & 228 & 1.12 \\
16 & new thyroid1 \cite{keel} & 215 & 5 & 35 & 180 & 

In [90]:
def acc_sens_spec_at_th(y_test, y_pred, th):
    """
    Accuracy, sensitivity and specificity of thresholded scores.

    A score is treated as a positive prediction when it is ``>= th``.

    Args:
        y_test: array of true labels; 1 marks positives, 0 marks negatives.
        y_pred: array of scores, same length as ``y_test``.
        th: decision threshold.

    Returns:
        tuple: ``(accuracy, sensitivity, specificity)``.
    """
    hits_on_positives = (y_pred >= th) & (y_test == 1)
    hits_on_negatives = (y_pred < th) & (y_test == 0)

    true_pos = np.sum(hits_on_positives)
    true_neg = np.sum(hits_on_negatives)

    # class sizes: the label sum is used as the number of positives
    n_pos = np.sum(y_test)
    n_neg = len(y_test) - np.sum(y_test)

    accuracy = (true_pos + true_neg)/(n_pos + n_neg)
    sensitivity = true_pos/n_pos
    specificity = true_neg/n_neg

    return accuracy, sensitivity, specificity

In [91]:
def calculate_scores(y_test, y_pred, y_train, random_state, label=''):
    """
    Scores at a training-derived threshold and at an accuracy-maximizing one.

    The first threshold is ``np.sum(y_train)/len(y_train)``. The second one is
    picked from the unique scores in ``y_pred`` plus ``-inf`` and ``inf``: the
    candidate giving the highest accuracy on ``(y_test, y_pred)`` is used, and
    ties are resolved by a random draw from ``random_state``.

    Args:
        y_test: true labels the scores are evaluated against.
        y_pred: scores belonging to ``y_test``.
        y_train: labels from which the first threshold is derived.
        random_state: random source providing ``choice`` (tie breaking).
        label: suffix appended to every key of the returned dictionary.

    Returns:
        dict: keys ``acc``, ``sens``, ``spec``, ``best_acc``, ``best_sens``
        and ``best_spec``, each followed by ``label``.
    """
    prior_th = np.sum(y_train)/len(y_train)
    at_prior = acc_sens_spec_at_th(y_test, y_pred, prior_th)

    # candidate thresholds: the unique scores first, then the two infinities
    candidates = np.hstack([np.unique(y_pred), np.array([-np.inf, np.inf])])
    accuracies = [acc_sens_spec_at_th(y_test, y_pred, cand)[0]
                  for cand in candidates]

    # the running maximum starts from 0, as in a manual search loop
    top_acc = max(0, *accuracies)
    tied = [cand for cand, value in zip(candidates, accuracies)
            if value == top_acc]

    # exactly one draw from the random state: the tie breaker
    chosen_th = random_state.choice(tied)
    at_best = acc_sens_spec_at_th(y_test, y_pred, chosen_th)

    return {
        f'acc{label}': at_prior[0],
        f'sens{label}': at_prior[1],
        f'spec{label}': at_prior[2],
        f'best_acc{label}': at_best[0],
        f'best_sens{label}': at_best[1],
        f'best_spec{label}': at_best[2]
    }

In [92]:
N_SAMPLES = 10_000

random_state = np.random.RandomState(5)
dropped = 0
results = []

# Collect N_SAMPLES evaluation records, one per randomly drawn
# (dataset, label orientation, classifier) combination. Every random decision
# is drawn from `random_state`, so the order of the statements consuming it
# determines the generated sequence.
while len(results) < N_SAMPLES:
    record = {}

    # NOTE(review): this samples from every loader in `datasets`, including any
    # dataset whose name starts with 'led', which the `names` list built
    # earlier excludes - confirm that this is intended.
    loader = random_state.choice(datasets)
    dataset = loader()

    X = dataset['data']
    y = dataset['target']
    name = dataset['name']

    record['dataset'] = name

    # with probability 1/2 the roles of the two classes are swapped
    # (assumes 0/1 labels)
    if random_state.randint(2) == 0:
        y = 1 - y

    # stratified 80/20 split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5, stratify=y)

    # the scaler is fitted on the training part only
    ss = StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)

    classifier = generate_random_classifier(random_state)

    record['classifier'] = classifier[0].__name__
    record['classifier_params'] = str(classifier[1])

    classifier_obj = classifier[0](**classifier[1])

    classifier_obj.fit(X_train, y_train)

    # skip SVC models whose fit status flags an unsuccessful fit
    if classifier[0].__name__ == 'SVC' and classifier_obj.fit_status_ == 1:
        print('failed SVC')
        continue

    record |= {
        'p': np.sum(y_test),
        'n': len(y_test) - np.sum(y_test),
        'p_train': np.sum(y_train),
        'n_train': len(y_train) - np.sum(y_train)
    }

    # scores of the class in the second column of predict_proba
    y_pred = classifier_obj.predict_proba(X_test)[:, 1]
    y_pred_train = classifier_obj.predict_proba(X_train)[:, 1]

    auc = roc_auc_score(y_test, y_pred)
    auc_train = roc_auc_score(y_train, y_pred_train)

    # Samples with a test AUC below 0.5 are discarded. The check is made
    # before the ROC curves are computed so that no work is spent on records
    # that are thrown away; no random numbers are drawn in between, hence the
    # generated sequence is unaffected.
    if auc < 0.5:
        dropped += 1
        continue

    record |= {'auc': auc, 'auc_train': auc_train}

    fpr, tpr, thresholds = simplify_roc(*roc_curve(y_test, y_pred))
    fpr_train, tpr_train, thresholds_train = simplify_roc(*roc_curve(y_train, y_pred_train))

    # the curves are stored as strings of Python lists
    record |= {
        'fprs': str(fpr.tolist()),
        'tprs': str(tpr.tolist()),
        'thresholds': str(thresholds.tolist()),
        'fprs_train': str(fpr_train.tolist()),
        'tprs_train': str(tpr_train.tolist()),
        'thresholds_train': str(thresholds_train.tolist()),
    }

    # test scores first, then training scores: both calls draw from
    # `random_state`, so their order must not be changed
    record |= calculate_scores(y_test, y_pred, y_train, random_state)
    record |= calculate_scores(y_train, y_pred_train, y_train, random_state, '_train')

    results.append(record)

In [93]:
# Number of samples discarded by the loop because their test AUC was below 0.5.
dropped

50

In [94]:
# One row per collected record; the columns follow the key order of the records.
data = pd.DataFrame(results)

In [95]:
# Show all fields of a single record (the second row).
data.iloc[1]

dataset                                               dermatology-6
classifier                                   RandomForestClassifier
classifier_params               {'max_depth': 3, 'random_state': 5}
p                                                                68
n                                                                 4
p_train                                                         270
n_train                                                          16
auc                                                             1.0
auc_train                                                       1.0
fprs                                                [0.0, 0.0, 1.0]
tprs                                                [0.0, 1.0, 1.0]
thresholds           [inf, 0.9064396244236703, 0.16932266702168314]
fprs_train                                          [0.0, 0.0, 1.0]
tprs_train                                          [0.0, 1.0, 1.0]
thresholds_train      [inf, 0.8652149609988417, 

In [96]:
# Preview the first rows of the collected results.
data.head()

Unnamed: 0,dataset,classifier,classifier_params,p,n,p_train,n_train,auc,auc_train,fprs,...,spec,best_acc,best_sens,best_spec,acc_train,sens_train,spec_train,best_acc_train,best_sens_train,best_spec_train
0,bupa,KNeighborsClassifier,{'n_neighbors': 7},40,29,160,116,0.743966,0.799865,"[0.0, 0.0, 0.10344827586206896, 0.275862068965...",...,0.724138,0.724638,0.875,0.517241,0.666667,0.525,0.862069,0.757246,0.8625,0.612069
1,dermatology-6,RandomForestClassifier,"{'max_depth': 3, 'random_state': 5}",68,4,270,16,1.0,1.0,"[0.0, 0.0, 1.0]",...,1.0,1.0,1.0,1.0,0.968531,0.966667,1.0,1.0,1.0,1.0
2,glass0,KNeighborsClassifier,{'n_neighbors': 8},29,14,115,56,0.818966,0.925621,"[0.0, 0.0, 0.07142857142857142, 0.214285714285...",...,0.785714,0.813953,0.862069,0.714286,0.777778,0.704348,0.928571,0.847953,0.817391,0.910714
3,new_thyroid1,RandomForestClassifier,"{'max_depth': 9, 'random_state': 5}",36,7,144,28,1.0,1.0,"[0.0, 0.0, 1.0]",...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,iris0,DecisionTreeClassifier,"{'max_depth': 7, 'random_state': 5}",10,20,40,80,1.0,1.0,"[0.0, 0.0, 1.0]",...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [97]:
# Write the results to CSV without the DataFrame index.
data.to_csv('raw-single4.csv', index=False)