In [23]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

from mlscorecheck.auc import simplify_roc, average_n_roc_curves

import numpy as np
import pandas as pd

import tqdm

from config import generate_random_classifier, datasets

In [24]:
# Number of accepted experiments to generate; the main sampling loop keeps
# running until this many result records have been collected.
N_SAMPLES = 50_000
# Alternative, smaller sample size left disabled:
#N_SAMPLES = 2_400
# CSV file the aggregated results are written to at the end of the notebook.
output_file = 'raw-aggregated-50k.csv'

In [25]:
# Number of dataset loader functions available for random sampling.
len(datasets)

31

In [26]:
# Show the loader functions themselves; each one is called to obtain a dataset.
datasets

[<function common_datasets.binary_classification._binary_classification_part0.load_abalone9_18()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_appendicitis()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_australian()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_bupa()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_cm1()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_crx()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_dermatology_6()>,
 <function common_datasets.binary_classification._binary_classification_part0.load_ecoli1()>,
 <function common_datasets.binary_classification._binary_classification_part0.load_glass0()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_haberman()>,
 <function common_datasets.binary_classifica

In [27]:
# Names of all datasets, obtained by invoking every loader once and reading its 'name' entry.
names = [load_dataset()['name'] for load_dataset in datasets]

In [28]:
from common_datasets.binary_classification import summary_pdf

In [29]:
# Build the dataset summary table: keep only the datasets used in this notebook,
# append the LaTeX citation to each name, rename the columns to the table's
# notation and derive the majority-class count `n` as size minus `p`.
tmp = (
    summary_pdf[summary_pdf['name'].isin(names)]
    .reset_index(drop=True)
    [['name', 'n_col', 'n', 'n_minority', 'imbalance_ratio', 'citation_key']]
    .assign(
        name_key=lambda frame: frame.apply(
            lambda row: f'{row["name"]} \\cite{{{row["citation_key"]}}}', axis=1
        )
    )
    [['name_key', 'n', 'n_col', 'n_minority', 'imbalance_ratio']]
    .rename(columns={
        'name_key': 'name',
        'n': 'size',
        'n_col': 'attr.',
        'n_minority': 'p',
        'imbalance_ratio': 'imb. ratio',
    })
    .assign(n=lambda frame: frame['size'] - frame['p'])
    [['name', 'size', 'attr.', 'p', 'n', 'imb. ratio']]
)

In [30]:
# Number the rows from 1 and print the table as LaTeX; underscores are replaced
# by spaces throughout the generated text.
tmp.index = list(range(1, len(tmp) + 1))
print(
    tmp.to_latex(float_format="%.2f")
       .replace('_', ' ')
)

\begin{tabular}{llrrrrr}
\toprule
 & name & size & attr. & p & n & imb. ratio \\
\midrule
1 & abalone9 18 \cite{keel} & 731 & 9 & 42 & 689 & 16.40 \\
2 & appendicitis \cite{keel} & 106 & 7 & 21 & 85 & 4.05 \\
3 & australian \cite{keel} & 690 & 16 & 307 & 383 & 1.25 \\
4 & bupa \cite{keel} & 345 & 6 & 145 & 200 & 1.38 \\
5 & CM1 \cite{krnn} & 498 & 21 & 49 & 449 & 9.16 \\
6 & crx \cite{keel} & 653 & 37 & 296 & 357 & 1.21 \\
7 & dermatology-6 \cite{keel} & 358 & 34 & 20 & 338 & 16.90 \\
8 & ecoli1 \cite{keel} & 336 & 7 & 77 & 259 & 3.36 \\
9 & glass0 \cite{keel} & 214 & 9 & 70 & 144 & 2.06 \\
10 & haberman \cite{keel} & 306 & 3 & 81 & 225 & 2.78 \\
11 & hepatitis \cite{krnn} & 155 & 19 & 32 & 123 & 3.84 \\
12 & hypothyroid \cite{krnn} & 3163 & 25 & 151 & 3012 & 19.95 \\
13 & ionosphere \cite{keel} & 351 & 33 & 126 & 225 & 1.79 \\
14 & iris0 \cite{keel} & 150 & 4 & 50 & 100 & 2.00 \\
15 & KC1 \cite{krnn} & 2109 & 21 & 326 & 1783 & 5.47 \\
16 & mammographic \cite{keel} & 830 & 5 & 403 & 42

In [31]:
def acc_sens_spec_at_th(y_test, y_pred, th):
    """Compute accuracy, sensitivity and specificity at a given score threshold.

    A sample is predicted positive when its score is at least ``th`` and
    negative when its score is strictly below ``th``.

    Args:
        y_test: array of true labels; positives are the entries equal to 1 and
            negatives the entries equal to 0 (the positive count is taken as
            the sum of the array).
        y_pred: array of scores aligned with ``y_test``.
        th: the decision threshold.

    Returns:
        tuple: ``(accuracy, sensitivity, specificity)``.
    """
    predicted_positive = y_pred >= th
    predicted_negative = y_pred < th

    true_positives = np.sum(predicted_positive & (y_test == 1))
    true_negatives = np.sum(predicted_negative & (y_test == 0))

    n_positive = np.sum(y_test)
    n_negative = len(y_test) - n_positive

    accuracy = (true_positives + true_negatives)/(n_positive + n_negative)
    sensitivity = true_positives/n_positive
    specificity = true_negatives/n_negative

    return accuracy, sensitivity, specificity

In [32]:
def calculate_scores(y_test, y_pred, y_train, random_state, label=''):
    """Score one set of predictions at the prior threshold and at the best threshold.

    The prior threshold is the mean of ``y_train``. The best threshold is the
    one maximising accuracy on ``(y_test, y_pred)`` among the unique scores
    plus ``-inf`` and ``inf``; ties are broken by ``random_state.choice``.

    Args:
        y_test: array of true labels the scores are evaluated against.
        y_pred: array of scores aligned with ``y_test``.
        y_train: array of training labels, used only for the prior threshold.
        random_state: object providing ``choice`` (e.g. ``np.random.RandomState``).
        label: suffix appended to every key of the returned dictionary.

    Returns:
        dict: accuracy, sensitivity, specificity and threshold for both the
        prior threshold (``acc``, ``sens``, ``spec``, ``threshold``) and the
        best threshold (``best_*``), each key suffixed with ``label``.
    """
    threshold = np.sum(y_train)/len(y_train)
    acc, sens, spec = acc_sens_spec_at_th(y_test, y_pred, threshold)

    # candidates are visited in ascending score order, then -inf, then inf;
    # this order determines the order of the tied thresholds handed to choice
    candidates = np.hstack([np.unique(y_pred), np.array([-np.inf, np.inf])])

    top_accuracy = 0
    tied_thresholds = []
    for candidate in candidates:
        candidate_accuracy = acc_sens_spec_at_th(y_test, y_pred, candidate)[0]

        if candidate_accuracy > top_accuracy:
            top_accuracy, tied_thresholds = candidate_accuracy, [candidate]
        elif candidate_accuracy == top_accuracy:
            tied_thresholds.append(candidate)

    best_th = random_state.choice(tied_thresholds)
    best_acc, best_sens, best_spec = acc_sens_spec_at_th(y_test, y_pred, best_th)

    scores = {}
    scores[f'acc{label}'] = acc
    scores[f'sens{label}'] = sens
    scores[f'spec{label}'] = spec
    scores[f'threshold{label}'] = threshold
    scores[f'best_acc{label}'] = best_acc
    scores[f'best_sens{label}'] = best_sens
    scores[f'best_spec{label}'] = best_spec
    scores[f'best_threshold{label}'] = best_th
    return scores

In [33]:
def best_aggregated_scores(y_trues, y_preds, random_state, label=''):
    """Find one threshold shared by all folds that maximises the mean fold accuracy.

    Candidate thresholds are the unique scores pooled over all folds followed
    by ``inf`` and ``-inf``. Ties in mean accuracy are broken by
    ``random_state.choice``. The scores of every fold are then evaluated at
    the selected threshold and averaged.

    Args:
        y_trues: list of true-label arrays, one per fold.
        y_preds: list of score arrays, one per fold, aligned with ``y_trues``.
        random_state: object providing ``choice`` (e.g. ``np.random.RandomState``).
        label: suffix appended to every key of the returned dictionary.

    Returns:
        dict: the fold-averaged ``best_acc``, ``best_sens`` and ``best_spec``
        at the selected threshold, each key suffixed with ``label``.
    """
    folds = list(zip(y_trues, y_preds))
    candidates = np.unique(np.hstack(y_preds)).tolist() + [np.inf, -np.inf]

    top_accuracy = 0
    tied_thresholds = []
    for candidate in candidates:
        fold_accuracies = [acc_sens_spec_at_th(y_true, y_pred, candidate)[0]
                           for y_true, y_pred in folds]
        mean_accuracy = np.mean(fold_accuracies)

        if mean_accuracy > top_accuracy:
            top_accuracy, tied_thresholds = mean_accuracy, [candidate]
        elif mean_accuracy == top_accuracy:
            tied_thresholds.append(candidate)

    best_th = random_state.choice(tied_thresholds)

    # average each score separately over the folds at the selected threshold
    accs, senss, specs = [], [], []
    for y_true, y_pred in folds:
        fold_acc, fold_sens, fold_spec = acc_sens_spec_at_th(y_true, y_pred, best_th)
        accs.append(fold_acc)
        senss.append(fold_sens)
        specs.append(fold_spec)

    return {
        f'best_acc{label}': np.mean(accs),
        f'best_sens{label}': np.mean(senss),
        f'best_spec{label}': np.mean(specs)
    }

In [34]:
def process_results(fold_results, random_state):
    """Aggregate the per-fold records of one experiment into a single record.

    The record combines, in this order: the best shared-threshold scores on
    the test and training folds, the fold means of the stored scores and AUCs,
    and the averaged ROC curves of the test and training folds together with
    their node counts.

    Args:
        fold_results: list of per-fold dicts holding the labels and scores
            (``y_test``, ``y_pred``, ``y_train``, ``y_pred_train``) and the
            fold-level ``acc``/``sens``/``spec``/``auc`` values with and
            without the ``_train`` suffix.
        random_state: random state passed on to ``best_aggregated_scores``
            and ``average_n_roc_curves``.

    Returns:
        dict: the aggregated record; the averaged ROC curves are stored as the
        string form of their coordinate lists.
    """
    def per_fold(key):
        # collect one field from every fold record
        return [record[key] for record in fold_results]

    results = dict(best_aggregated_scores(
        per_fold('y_test'),
        per_fold('y_pred'),
        random_state
    ))
    results.update(best_aggregated_scores(
        per_fold('y_train'),
        per_fold('y_pred_train'),
        random_state,
        label='_train'
    ))

    for metric in ('acc', 'sens', 'spec', 'auc',
                   'acc_train', 'sens_train', 'spec_train', 'auc_train'):
        results[metric] = np.mean(per_fold(metric))

    roc, roc_train = [], []
    for record in fold_results:
        test_fprs, test_tprs, _ = simplify_roc(*roc_curve(record['y_test'], record['y_pred']))
        roc.append((test_fprs, test_tprs))
        train_fprs, train_tprs, _ = simplify_roc(*roc_curve(record['y_train'], record['y_pred_train']))
        roc_train.append((train_fprs, train_tprs))

    # the test curves are averaged before the training curves, so the shared
    # random state is consumed in a fixed order
    fprs, tprs = average_n_roc_curves(roc, random_state)
    fprs_train, tprs_train = average_n_roc_curves(roc_train, random_state)

    results.update({
        'n_nodes': len(fprs),
        'n_nodes_train': len(fprs_train),
        'avg_n_nodes': np.mean([len(curve[0]) for curve in roc]),
        'avg_n_nodes_train': np.mean([len(curve[0]) for curve in roc_train]),
        'fprs': str(fprs.tolist()),
        'tprs': str(tprs.tolist()),
        'fprs_train': str(fprs_train.tolist()),
        'tprs_train': str(tprs_train.tolist())
    })

    return results


In [35]:
# Main sampling loop: repeatedly draw a random dataset, fold count and
# classifier, run a stratified k-fold evaluation and store one aggregated
# record per accepted experiment until N_SAMPLES records exist.
results = []
# A single seeded random state drives every random decision below (dataset
# choice, label flip, shuffling, fold count, classifier generation, fold
# splitting and tie-breaking), so the order of the draws matters.
random_state = np.random.RandomState(5)
# Counts the experiments discarded for having a mean test AUC below 0.5.
dropped = 0

with tqdm.tqdm(total=N_SAMPLES) as bar:
    while len(results) < N_SAMPLES:
        result = {}

        # pick one loader at random and load its dataset
        loader = random_state.choice(datasets)
        dataset = loader()

        X = dataset['data']
        y = dataset['target']
        name = dataset['name']

        # with probability 1/2 swap the two classes
        # (assumes y holds 0/1 labels - TODO confirm)
        if random_state.randint(2) == 0:
            y = 1 - y

        # `mask` is a random permutation of the row indices, not a boolean mask
        mask = np.arange(len(y))
        random_state.shuffle(mask)

        X = X[mask]
        y = y[mask]

        p_total = np.sum(y)
        n_total = len(y) - p_total

        # draw the number of folds from 2..10 and redraw until it exceeds
        # neither class count
        k = random_state.randint(2, 11)
        while k > p_total or k > n_total:
            k = random_state.randint(2, 11)

        result = result | {'p': p_total, 'n': n_total, 'k': k, 'dataset': name}
        
        fold_results = []

        # the class counts are scaled by (1 - 1/k), i.e. the approximate
        # per-class size of a training split; the return value is indexed as
        # (classifier class, keyword parameters) below
        classifier = generate_random_classifier(random_state, int(p_total*(1 - 1/k)), int(n_total*(1 - 1/k)))

        bar.set_description("%25s %20s" % (classifier[0].__name__, dataset['name']))
        bar.refresh()

        result = result | {'classifier': classifier[0].__name__, 'classifier_params': str(classifier[1])}

        # one classifier instance is created here and refitted on every fold
        classifier_obj = classifier[0](**classifier[1])
        
        for train, test in StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state).split(X, y):
            X_train = X[train]
            X_test = X[test]
            y_train = y[train]
            y_test = y[test]

            # the scaler is fitted on the training split only and then
            # applied to the test split
            ss = StandardScaler()
            X_train = ss.fit_transform(X_train)
            X_test = ss.transform(X_test)

            classifier_obj.fit(X_train, y_train)

            # column 1 of predict_proba is used as the score
            # (presumably the probability of class 1 - TODO confirm)
            y_pred = classifier_obj.predict_proba(X_test)[:, 1]
            y_pred_train = classifier_obj.predict_proba(X_train)[:, 1]

            # both calls derive the prior threshold from y_train
            scores = calculate_scores(y_test, y_pred, y_train, random_state)
            scores_train = calculate_scores(y_train, y_pred_train, y_train, random_state, '_train')

            auc = roc_auc_score(y_test, y_pred)
            auc_train = roc_auc_score(y_train, y_pred_train)

            # keep the raw labels and scores as well; process_results needs them
            fold_results.append(scores | scores_train | {'auc': auc, 'auc_train': auc_train, 
                    'y_pred': y_pred, 'y_test': y_test, 
                    'y_pred_train': y_pred_train, 'y_train': y_train})

        # discard experiments whose mean test AUC is below 0.5; they do not
        # count towards N_SAMPLES and do not advance the progress bar
        if np.mean([record['auc'] for record in fold_results]) < 0.5:
            dropped += 1
            continue

        result = result | process_results(fold_results, random_state)
        
        results.append(result)
        bar.update(1)

            XGBClassifier                 wdbc: 100%|██████████| 50000/50000 [10:46:03<00:00,  1.29it/s]   


In [36]:
# Number of experiments discarded because their mean test AUC was below 0.5.
dropped

8

In [37]:
# Turn the list of per-experiment records into a table, one row per experiment.
data = pd.DataFrame(results)

In [38]:
# Preview the first rows of the aggregated results table.
data.head()

Unnamed: 0,p,n,k,dataset,classifier,classifier_params,best_acc,best_sens,best_spec,best_acc_train,...,spec_train,auc_train,n_nodes,n_nodes_train,avg_n_nodes,avg_n_nodes_train,fprs,tprs,fprs_train,tprs_train
0,200,145,9,bupa,DecisionTreeClassifier,"{'max_depth': 76, 'random_state': 5}",0.634578,0.705753,0.537582,1.0,...,1.0,1.0,10,3,3.0,3.0,"[0.0, 0.3891389826132473, 0.4026173392062139, ...","[0.0, 0.5909090909090909, 0.6086956521739131, ...","[0.0, 0.0, 1.0]","[0.0, 1.0, 1.0]"
1,144,70,3,glass0,RandomForestClassifier,"{'max_depth': 2, 'random_state': 5}",0.831834,0.8125,0.871377,0.86211,...,0.9428,0.929068,35,49,21.666667,29.0,"[0.0, 0.0, 0.01449275362318847, 0.014492753623...","[0.0, 0.3125, 0.3125, 0.5, 0.5, 0.5625, 0.5625...","[0.0, 0.0, 0.007092198581560294, 0.00709219858...","[0.0, 0.45833333333333326, 0.45833333333333326..."
2,77,259,7,ecoli1,KNeighborsClassifier,{'n_neighbors': 70},0.877976,0.636364,0.949807,0.875992,...,0.848134,0.939335,19,75,9.571429,27.571429,"[0.0, 0.0, 0.0038610038610038533, 0.0193050193...","[0.0, 0.18181818181818188, 0.2727272727272727,...","[0.0, 0.0, 0.0003217503217503026, 0.0008273579...","[0.0, 0.10606060606060608, 0.13636363636363635..."
3,259,77,2,ecoli1,SVC,"{'probability': True, 'C': 0.19005782496750956...",0.889881,0.896064,0.870108,0.910714,...,0.896086,0.960603,53,43,28.0,23.0,"[0.0, 0.0, 0.02564102564102564, 0.025641025641...","[0.0, 0.4860763267740012, 0.4860763267740012, ...","[0.0, 0.0, 0.02564102564102564, 0.025641025641...","[0.0, 0.657304710793083, 0.657304710793083, 0...."
4,81,225,3,haberman,XGBClassifier,"{'random_state': 5, 'max_depth': 4}",0.735294,0.0,1.0,0.970588,...,0.904444,0.996626,56,16,39.0,10.666667,"[0.0, 0.022222222222222143, 0.0222222222222221...","[0.0, 0.0, 0.03703703703703698, 0.037037037037...","[0.0, 0.0, 0.0022222222222222365, 0.0022222222...","[0.0, 0.7962962962962963, 0.7962962962962963, ..."


In [39]:
# Inspect all fields of the first experiment record.
data.iloc[0]

p                                                                  200
n                                                                  145
k                                                                    9
dataset                                                           bupa
classifier                                      DecisionTreeClassifier
classifier_params                 {'max_depth': 76, 'random_state': 5}
best_acc                                                      0.634578
best_sens                                                     0.705753
best_spec                                                     0.537582
best_acc_train                                                     1.0
best_sens_train                                                    1.0
best_spec_train                                                    1.0
acc                                                           0.634578
sens                                                          0.705753
spec  

In [40]:
# Write the aggregated results to the configured CSV file, without the row index.
data.to_csv(output_file, index=False)