In [77]:
import common_datasets.binary_classification as binclas
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

from mlscorecheck.auc import simplify_roc, average_n_roc_curves

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from scipy.stats import wilcoxon

In [78]:
def generate_random_classifier(random_state):
    mode = random_state.randint(4)
    if mode == 0:
        classifier = RandomForestClassifier
        params = {'max_depth': random_state.randint(2, 10),
                  'random_state': 5}
    if mode == 1:
        classifier = DecisionTreeClassifier
        params = {'max_depth': random_state.randint(2, 10),
                  'random_state': 5}
    if mode == 2:
        classifier = SVC
        params = {'probability': True, 'C': random_state.rand()/2 + 0.001, 'random_state': 5}
    if mode == 3:
        classifier = KNeighborsClassifier
        params = {'n_neighbors': random_state.randint(2, 10)}
    
    return (classifier, params)

In [79]:
datasets = binclas.get_filtered_data_loaders(n_col_bounds=(0, 50), n_bounds=(0, 2000), n_minority_bounds=(20, 1000), n_from_phenotypes=1, imbalance_ratio_bounds=(0.2, 20.0))

In [80]:
len(datasets)

28

In [81]:
names = [dataset()['name'] for dataset in datasets if not dataset()['name'].startswith('led')]

In [82]:
from common_datasets.binary_classification import summary_pdf

In [83]:
tmp = summary_pdf[summary_pdf['name'].isin(names)].reset_index(drop=True)
tmp = tmp[['name', 'n_col', 'n', 'n_minority', 'imbalance_ratio', 'citation_key']]
tmp['name_key'] = tmp.apply(lambda row: f'{row["name"]} \\cite{{{row["citation_key"]}}}', axis=1)
tmp = tmp[['name_key', 'n', 'n_col', 'n_minority', 'imbalance_ratio']]
tmp.columns = ['name', 'size', 'attr.', 'p', 'imb. ratio']
tmp['n'] = tmp['size'] - tmp['p']
tmp = tmp[['name', 'size', 'attr.', 'p', 'n', 'imb. ratio']]

In [84]:
tmp.index = [idx for idx in range(1, 28)]
print(tmp.to_latex(float_format="%.2f").replace('_', ' '))

\begin{tabular}{llrrrrr}
\toprule
 & name & size & attr. & p & n & imb. ratio \\
\midrule
1 & abalone9 18 \cite{keel} & 731 & 9 & 42 & 689 & 16.40 \\
2 & appendicitis \cite{keel} & 106 & 7 & 21 & 85 & 4.05 \\
3 & australian \cite{keel} & 690 & 16 & 307 & 383 & 1.25 \\
4 & bupa \cite{keel} & 345 & 6 & 145 & 200 & 1.38 \\
5 & CM1 \cite{krnn} & 498 & 21 & 49 & 449 & 9.16 \\
6 & crx \cite{keel} & 653 & 37 & 296 & 357 & 1.21 \\
7 & dermatology-6 \cite{keel} & 358 & 34 & 20 & 338 & 16.90 \\
8 & ecoli1 \cite{keel} & 336 & 7 & 77 & 259 & 3.36 \\
9 & glass0 \cite{keel} & 214 & 9 & 70 & 144 & 2.06 \\
10 & haberman \cite{keel} & 306 & 3 & 81 & 225 & 2.78 \\
11 & hepatitis \cite{krnn} & 155 & 19 & 32 & 123 & 3.84 \\
12 & ionosphere \cite{keel} & 351 & 33 & 126 & 225 & 1.79 \\
13 & iris0 \cite{keel} & 150 & 4 & 50 & 100 & 2.00 \\
14 & mammographic \cite{keel} & 830 & 5 & 403 & 427 & 1.06 \\
15 & monk-2 \cite{keel} & 432 & 6 & 204 & 228 & 1.12 \\
16 & new thyroid1 \cite{keel} & 215 & 5 & 35 & 180 & 

In [85]:
def acc_sens_spec_at_th(y_test, y_pred, th):
    tp = np.sum((y_pred >= th) & (y_test == 1))
    tn = np.sum((y_pred < th) & (y_test == 0))
    p = np.sum(y_test)
    n = len(y_test) - np.sum(y_test)

    return (tp + tn)/(p + n), tp/p, tn/n

In [86]:
def calculate_scores(y_test, y_pred, y_train, random_state, label=''):
    threshold = np.sum(y_train)/len(y_train)

    acc, sens, spec = acc_sens_spec_at_th(y_test, y_pred, threshold)

    best_ths = []
    best_acc = 0
    for th in np.hstack([np.unique(y_pred), np.array([-np.inf, np.inf])]):
        acc_tmp, _, _ = acc_sens_spec_at_th(y_test, y_pred, th)

        if acc_tmp > best_acc:
            best_acc = acc_tmp
            best_ths = [th]
        elif acc_tmp == best_acc:
            best_ths.append(th)

    best_th = random_state.choice(best_ths)

    best_acc, best_sens, best_spec = acc_sens_spec_at_th(y_test, y_pred, best_th)

    return {
        f'acc{label}': acc,
        f'sens{label}': sens,
        f'spec{label}': spec,
        f'threshold{label}': threshold,
        f'best_acc{label}': best_acc,
        f'best_sens{label}': best_sens,
        f'best_spec{label}': best_spec,
        f'best_threshold{label}': best_th
    }

In [87]:
def best_aggregated_scores(y_trues, y_preds, random_state, label=''):
    thresholds = np.unique(np.hstack(y_trues))

    best_ths = []
    best_acc = 0
    for th in thresholds.tolist() + [np.inf, -np.inf]:
        acc_tmp = np.mean([acc_sens_spec_at_th(y_true, y_pred, th)[0] for y_true, y_pred in zip(y_trues, y_preds)])

        if acc_tmp > best_acc:
            best_acc = acc_tmp
            best_ths = [th]
        elif acc_tmp == best_acc:
            best_ths.append(th)

    best_th = random_state.choice(best_ths)

    scores = [acc_sens_spec_at_th(y_true, y_pred, best_th) for y_true, y_pred in zip(y_trues, y_preds)]

    best_acc = np.mean([item[0] for item in scores])
    best_sens = np.mean([item[1] for item in scores])
    best_spec = np.mean([item[2] for item in scores])

    return {
        f'best_acc{label}': best_acc,
        f'best_sens{label}': best_sens,
        f'best_spec{label}': best_spec
    }

In [88]:
def process_results(fold_results, random_state):
    results = {}

    results = results | best_aggregated_scores(
        [record['y_test'] for record in fold_results],
        [record['y_pred'] for record in fold_results],
        random_state
    )

    results = results | best_aggregated_scores(
        [record['y_train'] for record in fold_results],
        [record['y_pred_train'] for record in fold_results],
        random_state,
        label='_train'
    )

    results = results | {
        'acc': np.mean([record['acc'] for record in fold_results]),
        'sens': np.mean([record['sens'] for record in fold_results]),
        'spec': np.mean([record['spec'] for record in fold_results]),
        'auc': np.mean([record['auc'] for record in fold_results]),
        'acc_train': np.mean([record['acc_train'] for record in fold_results]),
        'sens_train': np.mean([record['sens_train'] for record in fold_results]),
        'spec_train': np.mean([record['spec_train'] for record in fold_results]),
        'auc_train': np.mean([record['auc_train'] for record in fold_results])
    }

    roc = []
    roc_train = []

    for record in fold_results:
        fprs, tprs, thresholds = simplify_roc(*roc_curve(record['y_test'], record['y_pred']))
        roc.append((fprs, tprs))
        fprs_train, tprs_train, thresholds_train = simplify_roc(*roc_curve(record['y_train'], record['y_pred_train']))
        roc_train.append((fprs_train, tprs_train))

    fprs, tprs = average_n_roc_curves(roc)
    fprs_train, tprs_train = average_n_roc_curves(roc_train)

    avg_n_nodes = np.mean([len(curve[0]) for curve in roc])
    avg_n_nodes_train = np.mean([len(curve[0]) for curve in roc_train])
    n_nodes = len(fprs)
    n_nodes_train = len(tprs)

    results = results | {
        'n_nodes': n_nodes, 
        'n_nodes_train': n_nodes_train,
        'avg_n_nodes': avg_n_nodes,
        'avg_n_nodes_train': avg_n_nodes_train,
        'roc_fprs': str(fprs.tolist()),
        'roc_tprs': str(tprs.tolist()),
        'roc_fprs_train': str(fprs_train.tolist()),
        'roc_tprs_train': str(tprs_train.tolist())
    }

    return results


In [89]:
N_SAMPLES = 10_000

results = []
random_state = np.random.RandomState(5)
dropped = 0

while len(results) < N_SAMPLES:
    result = {}

    loader = random_state.choice(datasets)
    dataset = loader()

    X = dataset['data']
    y = dataset['target']
    name = dataset['name']

    if random_state.randint(2) == 0:
        y = 1 - y

    p_total = np.sum(y)
    n_total = len(y) - p_total

    k = random_state.randint(2, 11)
    while k > p_total:
        k = random_state.randint(2, 11)

    result = result | {'p': p_total, 'n': n_total, 'k': k, 'dataset': name}
    
    fold_results = []

    classifier = generate_random_classifier(random_state)

    result = result | {'classifier': classifier[0].__name__, 'classifier_params': str(classifier[1])}

    classifier_obj = classifier[0](**classifier[1])
    
    for train, test in StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state).split(X, y):
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]

        ss = StandardScaler()
        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)

        classifier_obj.fit(X_train, y_train)

        y_pred = classifier_obj.predict_proba(X_test)[:, 1]
        y_pred_train = classifier_obj.predict_proba(X_train)[:, 1]

        scores = calculate_scores(y_test, y_pred, y_train, random_state)
        scores_train = calculate_scores(y_train, y_pred_train, y_train, random_state, '_train')

        auc = roc_auc_score(y_test, y_pred)
        auc_train = roc_auc_score(y_train, y_pred_train)

        fold_results.append(scores | scores_train | {'auc': auc, 'auc_train': auc_train, 
                'y_pred': y_pred, 'y_test': y_test, 
                'y_pred_train': y_pred_train, 'y_train': y_train})

    if np.mean([record['auc'] for record in fold_results]) < 0.5:
        dropped += 1
        continue

    result = result | process_results(fold_results, random_state)
    
    results.append(result)


In [90]:
dropped

0

In [91]:
data = pd.DataFrame.from_dict(results)

In [92]:
data.head()

Unnamed: 0,p,n,k,dataset,classifier,classifier_params,best_acc,best_sens,best_spec,best_acc_train,...,spec_train,auc_train,n_nodes,n_nodes_train,avg_n_nodes,avg_n_nodes_train,roc_fprs,roc_tprs,roc_fprs_train,roc_tprs_train
0,200,145,8,bupa,SVC,"{'probability': True, 'C': 0.4603054539689608,...",0.579744,1.0,0.0,0.579711,...,0.721183,0.824108,59,59,19.875,100.5,"[0.0, 0.0, 0.05555555555555555, 0.055555555555...","[0.0, 0.15000000000000002, 0.15000000000000002...","[0.0, 0.0, 0.007874015748031496, 0.00787401574...","[0.0, 0.07, 0.07, 0.09714285714285714, 0.09714..."
1,145,200,2,bupa,RandomForestClassifier,"{'max_depth': 4, 'random_state': 5}",0.579715,0.0,1.0,0.579715,...,0.915,0.977109,117,117,72.5,22.0,"[0.0, 0.0, 0.01, 0.01, 0.02, 0.02, 0.03, 0.03,...","[0.0, 0.0763888888888889, 0.0763888888888889, ...","[0.0, 0.0, 0.01, 0.01, 0.02, 0.02, 0.03, 0.03,...","[0.0, 0.6213850837138508, 0.6213850837138508, ..."
2,126,225,8,ionosphere,DecisionTreeClassifier,"{'max_depth': 5, 'random_state': 5}",0.871829,0.785937,0.919643,0.96337,...,0.998731,0.97076,12,12,4.25,5.125,"[0.0, 0.0, 0.03571428571428571, 0.035714285714...","[0.0, 0.10833333333333334, 0.4628348214285714,...","[0.0, 0.0, 0.005076142131979695, 0.05583756345...","[0.0, 0.8979524979524979, 0.9133378988391678, ..."
3,145,200,2,bupa,SVC,"{'probability': True, 'C': 0.08889747830955769...",0.579715,0.0,1.0,0.579715,...,0.75,0.846906,122,122,78.0,54.0,"[0.0, 0.0, 0.01, 0.01, 0.02, 0.02, 0.04, 0.04,...","[0.0, 0.1036910197869102, 0.1036910197869102, ...","[0.0, 0.0, 0.01, 0.01, 0.02, 0.02, 0.03, 0.03,...","[0.0, 0.08285768645357686, 0.08285768645357686..."
4,429,1055,8,yeast1,SVC,"{'probability': True, 'C': 0.2553153590001828,...",0.71092,0.0,1.0,0.710917,...,0.850236,0.813447,247,247,61.0,384.25,"[0.0, 0.0, 0.007575757575757576, 0.00757575757...","[0.0, 0.039526554856743536, 0.0395265548567435...","[0.0, 0.0, 0.0010822510822510823, 0.0010822510...","[0.0, 0.02564184397163121, 0.02564184397163121..."


In [93]:
data.iloc[0]

p                                                                  200
n                                                                  145
k                                                                    8
dataset                                                           bupa
classifier                                                         SVC
classifier_params    {'probability': True, 'C': 0.4603054539689608,...
best_acc                                                      0.579744
best_sens                                                          1.0
best_spec                                                          0.0
best_acc_train                                                0.579711
best_sens_train                                                    1.0
best_spec_train                                                    0.0
acc                                                           0.684131
sens                                                             0.705
spec  

In [94]:
data.to_csv('raw-aggregated4.csv')