In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

from mlscorecheck.auc import simplify_roc, average_n_roc_curves

import numpy as np
import pandas as pd

import tqdm

from config import generate_random_classifier, datasets

In [10]:
# Number of accepted experiment records to generate.
# NOTE(review): the output file name carries a "20k" tag although only 2_400
# records are generated with this setting — confirm the name is intentional.
N_SAMPLES = 2_400
output_file = 'raw-aggregated-20k-b.csv'

In [11]:
# Sanity check: how many dataset loaders were imported from `config`.
len(datasets)

3

In [12]:
# Show the loader callables themselves (each one is invoked later to obtain a dataset).
datasets

[<function common_datasets.binary_classification._binary_classification_part1.load_hypothyroid()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_kc1()>,
 <function common_datasets.binary_classification._binary_classification_part1.load_segment0()>]

In [13]:
# Invoke every configured loader once and keep only the dataset's 'name' entry.
names = [loader()['name'] for loader in datasets]

In [14]:
from common_datasets.binary_classification import summary_pdf

In [15]:
# Build the dataset summary table in a single chain:
#   1. keep only the rows of the datasets used here,
#   2. append a LaTeX \cite{...} reference to each dataset name,
#   3. rename the columns to their table headers,
#   4. derive the 'n' column as size minus p.
tmp = (
    summary_pdf[summary_pdf['name'].isin(names)]
    .reset_index(drop=True)
    .loc[:, ['name', 'n_col', 'n', 'n_minority', 'imbalance_ratio', 'citation_key']]
    .assign(
        name_key=lambda frame: frame.apply(
            lambda row: f'{row["name"]} \\cite{{{row["citation_key"]}}}', axis=1
        )
    )
    .loc[:, ['name_key', 'n', 'n_col', 'n_minority', 'imbalance_ratio']]
    .rename(columns={
        'name_key': 'name',
        'n': 'size',
        'n_col': 'attr.',
        'n_minority': 'p',
        'imbalance_ratio': 'imb. ratio',
    })
    .assign(n=lambda frame: frame['size'] - frame['p'])
    .loc[:, ['name', 'size', 'attr.', 'p', 'n', 'imb. ratio']]
)

In [16]:
# Number the rows starting from 1, then print the table as LaTeX with two-decimal
# floats; every underscore in the generated text is replaced by a space.
tmp.index = list(range(1, len(tmp) + 1))
print(
    tmp.to_latex(float_format="%.2f")
       .replace('_', ' ')
)

\begin{tabular}{llrrrrr}
\toprule
 & name & size & attr. & p & n & imb. ratio \\
\midrule
1 & hypothyroid \cite{krnn} & 3163 & 25 & 151 & 3012 & 19.95 \\
2 & KC1 \cite{krnn} & 2109 & 21 & 326 & 1783 & 5.47 \\
3 & segment0 \cite{keel} & 2308 & 19 & 329 & 1979 & 6.02 \\
\bottomrule
\end{tabular}



In [17]:
def acc_sens_spec_at_th(y_test, y_pred, th):
    """
    Accuracy, sensitivity and specificity of scores cut at a threshold.

    A sample counts as predicted positive when its score is >= ``th`` and as
    predicted negative when its score is < ``th``.

    Args:
        y_test: array of true labels; entries equal to 1 are the positives,
            entries equal to 0 the negatives
        y_pred: array of predicted scores, aligned element-wise with ``y_test``
        th: the decision threshold

    Returns:
        tuple: (accuracy, sensitivity, specificity)
    """
    correct_pos = (y_test == 1) & (y_pred >= th)
    correct_neg = (y_test == 0) & (y_pred < th)
    tp, tn = np.sum(correct_pos), np.sum(correct_neg)

    # the positive count is the label sum; everything else is negative
    p = np.sum(y_test)
    n = len(y_test) - p
    total = p + n

    return (tp + tn)/total, tp/p, tn/n

In [18]:
def calculate_scores(y_test, y_pred, y_train, random_state, label=''):
    """
    Score one evaluation set at two thresholds: the fraction of positives in
    the training labels, and a threshold maximizing the accuracy.

    Args:
        y_test: true labels of the evaluated samples
        y_pred: predicted scores of the evaluated samples
        y_train: training labels; sum/len of these is the first threshold
        random_state: object providing ``choice``; picks one threshold when
            several candidates reach the same best accuracy
        label (str): suffix appended to every key of the returned dictionary

    Returns:
        dict: acc, sens, spec, threshold and best_acc, best_sens, best_spec,
        best_threshold, each key suffixed with ``label``
    """
    threshold = np.sum(y_train)/len(y_train)
    acc, sens, spec = acc_sens_spec_at_th(y_test, y_pred, threshold)

    # candidate thresholds: each distinct score, followed by -inf and +inf
    candidates = np.hstack([np.unique(y_pred), np.array([-np.inf, np.inf])])

    tied_ths = []
    top_acc = 0
    for candidate in candidates:
        candidate_acc = acc_sens_spec_at_th(y_test, y_pred, candidate)[0]

        if candidate_acc == top_acc:
            # same accuracy as the current best: remember as an alternative
            tied_ths.append(candidate)
        elif candidate_acc > top_acc:
            # strictly better: restart the list of alternatives
            top_acc = candidate_acc
            tied_ths = [candidate]

    best_th = random_state.choice(tied_ths)
    best_acc, best_sens, best_spec = acc_sens_spec_at_th(y_test, y_pred, best_th)

    return {
        f'acc{label}': acc,
        f'sens{label}': sens,
        f'spec{label}': spec,
        f'threshold{label}': threshold,
        f'best_acc{label}': best_acc,
        f'best_sens{label}': best_sens,
        f'best_spec{label}': best_spec,
        f'best_threshold{label}': best_th
    }

In [19]:
def best_aggregated_scores(y_trues, y_preds, random_state, label=''):
    """
    Find one threshold shared by all folds that maximizes the mean fold
    accuracy, and report the mean scores at that threshold.

    Args:
        y_trues (list): true label arrays, one per fold
        y_preds (list): predicted score arrays, one per fold
        random_state: object providing ``choice``; picks one threshold when
            several candidates reach the same best mean accuracy
        label (str): suffix appended to every key of the returned dictionary

    Returns:
        dict: best_acc, best_sens and best_spec (means over the folds), each
        key suffixed with ``label``
    """
    folds = list(zip(y_trues, y_preds))

    # candidates: all distinct scores over all folds, followed by +inf and -inf
    candidates = np.unique(np.hstack(y_preds)).tolist() + [np.inf, -np.inf]

    tied_ths = []
    top_acc = 0
    for candidate in candidates:
        mean_acc = np.mean([acc_sens_spec_at_th(y_true, y_pred, candidate)[0]
                            for y_true, y_pred in folds])

        if mean_acc == top_acc:
            tied_ths.append(candidate)
        elif mean_acc > top_acc:
            top_acc = mean_acc
            tied_ths = [candidate]

    best_th = random_state.choice(tied_ths)

    # re-evaluate every fold at the selected threshold and average per score
    per_fold = [acc_sens_spec_at_th(y_true, y_pred, best_th) for y_true, y_pred in folds]
    best_acc, best_sens, best_spec = (
        np.mean([fold_scores[position] for fold_scores in per_fold])
        for position in range(3)
    )

    return {
        f'best_acc{label}': best_acc,
        f'best_sens{label}': best_sens,
        f'best_spec{label}': best_spec
    }

In [20]:
def process_results(fold_results, random_state):
    """
    Aggregate the per-fold records of one cross-validation run into one record.

    Args:
        fold_results (list(dict)): one dictionary per fold holding the scores
            (acc, sens, spec, auc and their ``_train`` counterparts) and the
            raw arrays y_test, y_pred, y_train, y_pred_train
        random_state: random state forwarded to ``best_aggregated_scores`` and
            ``average_n_roc_curves``

    Returns:
        dict: best aggregated scores (test, then train), fold means of the
        scores, node counts of the averaged and per-fold simplified ROC
        curves, and the averaged ROC curves serialized as strings
    """
    def column(key):
        # the values stored under `key`, in fold order
        return [record[key] for record in fold_results]

    best_test = best_aggregated_scores(column('y_test'), column('y_pred'), random_state)
    best_train = best_aggregated_scores(
        column('y_train'), column('y_pred_train'), random_state, label='_train'
    )

    mean_keys = ('acc', 'sens', 'spec', 'auc',
                 'acc_train', 'sens_train', 'spec_train', 'auc_train')
    fold_means = {key: np.mean(column(key)) for key in mean_keys}

    # simplified ROC curve of every fold, test and train side by side
    roc = []
    roc_train = []
    for record in fold_results:
        fold_fprs, fold_tprs, _ = simplify_roc(*roc_curve(record['y_test'], record['y_pred']))
        roc.append((fold_fprs, fold_tprs))
        fold_fprs_train, fold_tprs_train, _ = simplify_roc(
            *roc_curve(record['y_train'], record['y_pred_train'])
        )
        roc_train.append((fold_fprs_train, fold_tprs_train))

    fprs, tprs = average_n_roc_curves(roc, random_state)
    fprs_train, tprs_train = average_n_roc_curves(roc_train, random_state)

    roc_summary = {
        'n_nodes': len(fprs),
        'n_nodes_train': len(fprs_train),
        'avg_n_nodes': np.mean([len(curve[0]) for curve in roc]),
        'avg_n_nodes_train': np.mean([len(curve[0]) for curve in roc_train]),
        'fprs': str(fprs.tolist()),
        'tprs': str(tprs.tolist()),
        'fprs_train': str(fprs_train.tolist()),
        'tprs_train': str(tprs_train.tolist())
    }

    return best_test | best_train | fold_means | roc_summary


In [21]:
# Experiment driver: keeps sampling random (dataset, label polarity, fold count,
# classifier) configurations until N_SAMPLES records have been accepted.
# All randomness is drawn from the single RandomState below, so the outcome
# depends on the exact order of the draws.
results = []
random_state = np.random.RandomState(5)
dropped = 0  # number of configurations rejected by the mean-AUC filter

with tqdm.tqdm(total=N_SAMPLES) as bar:
    while len(results) < N_SAMPLES:
        result = {}

        # Pick one loader at random and call it to obtain the dataset dictionary.
        loader = random_state.choice(datasets)
        dataset = loader()

        X = dataset['data']
        y = dataset['target']
        name = dataset['name']

        # With probability 1/2 swap the two classes (assumes 0/1 labels — TODO confirm).
        if random_state.randint(2) == 0:
            y = 1 - y

        # Despite its name, `mask` is a random permutation of the row indices.
        mask = np.arange(len(y))
        random_state.shuffle(mask)

        X = X[mask]
        y = y[mask]

        p_total = np.sum(y)
        n_total = len(y) - p_total

        # Draw the fold count from 2..10 and redraw until neither class is
        # smaller than the number of folds.
        k = random_state.randint(2, 11)
        while k > p_total or k > n_total:
            k = random_state.randint(2, 11)

        result = result | {'p': p_total, 'n': n_total, 'k': k, 'dataset': name}
        
        fold_results = []

        # generate_random_classifier comes from the local `config` module; its result
        # is used here as a (class, keyword-arguments) pair. The two integers passed
        # are p_total and n_total scaled by (1 - 1/k) and truncated — presumably the
        # approximate class counts of one training fold; confirm against `config`.
        classifier = generate_random_classifier(random_state, int(p_total*(1 - 1/k)), int(n_total*(1 - 1/k)))

        bar.set_description("%25s %20s" % (classifier[0].__name__, dataset['name']))
        bar.refresh()

        result = result | {'classifier': classifier[0].__name__, 'classifier_params': str(classifier[1])}

        # One classifier object per configuration; it is re-fitted on every fold.
        classifier_obj = classifier[0](**classifier[1])
        
        for train, test in StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state).split(X, y):
            X_train = X[train]
            X_test = X[test]
            y_train = y[train]
            y_test = y[test]

            # The scaler is fitted on the training part only and then applied to the test part.
            ss = StandardScaler()
            X_train = ss.fit_transform(X_train)
            X_test = ss.transform(X_test)

            classifier_obj.fit(X_train, y_train)

            # Column 1 of predict_proba is used as the score of class 1.
            y_pred = classifier_obj.predict_proba(X_test)[:, 1]
            y_pred_train = classifier_obj.predict_proba(X_train)[:, 1]

            # Test and train scores both use the threshold derived from y_train.
            scores = calculate_scores(y_test, y_pred, y_train, random_state)
            scores_train = calculate_scores(y_train, y_pred_train, y_train, random_state, '_train')

            auc = roc_auc_score(y_test, y_pred)
            auc_train = roc_auc_score(y_train, y_pred_train)

            # Keep the raw labels and scores too: process_results needs them for the
            # aggregated thresholds and the ROC curves.
            fold_results.append(scores | scores_train | {'auc': auc, 'auc_train': auc_train, 
                    'y_pred': y_pred, 'y_test': y_test, 
                    'y_pred_train': y_pred_train, 'y_train': y_train})

        # Reject configurations whose mean test AUC is below 0.5; the progress bar is
        # not advanced, and the random draws already consumed are not replayed.
        if np.mean([record['auc'] for record in fold_results]) < 0.5:
            dropped += 1
            continue

        result = result | process_results(fold_results, random_state)
        
        results.append(result)
        bar.update(1)

   RandomForestClassifier                  KC1: 100%|██████████| 2400/2400 [1:13:58<00:00,  1.85s/it]


In [22]:
# Number of configurations that were rejected because their mean test AUC was below 0.5.
dropped

0

In [23]:
# One row per accepted configuration; `results` is a list of flat dictionaries.
data = pd.DataFrame(results)

In [24]:
# Preview the first rows of the collected results.
data.head()

Unnamed: 0,p,n,k,dataset,classifier,classifier_params,best_acc,best_sens,best_spec,best_acc_train,...,spec_train,auc_train,n_nodes,n_nodes_train,avg_n_nodes,avg_n_nodes_train,fprs,tprs,fprs_train,tprs_train
0,329,1979,8,segment0,RandomForestClassifier,"{'max_depth': 302, 'random_state': 5}",0.998698,0.990854,1.0,1.0,...,0.998556,1.0,5,3,3.75,3.0,"[0.0, 0.0, 0.0030364372469635637, 0.0030364372...","[0.0, 0.975609756097561, 0.975609756097561, 1....","[0.0, 0.0, 1.0]","[0.0, 1.0, 1.0]"
1,3012,151,6,hypothyroid,KNeighborsClassifier,{'n_neighbors': 81},0.980398,0.994024,0.708718,0.980651,...,0.950974,0.983162,41,92,15.166667,31.0,"[0.0, 0.006474543707973068, 0.0075438596491227...","[0.0, 0.6713147410358565, 0.6912350597609562, ...","[0.0, 0.0, 0.0001165117745428601, 0.0002770955...","[0.0, 0.6804780876494024, 0.6872509960159362, ..."
2,1979,329,9,segment0,XGBClassifier,"{'random_state': 5, 'max_depth': 4}",0.997837,0.99899,0.990908,1.0,...,1.0,1.0,7,3,3.888889,3.0,"[0.0, 0.0, 0.02702702702702703, 0.027027027027...","[0.0, 0.9474747474747475, 0.9474747474747475, ...","[0.0, 0.0, 1.0]","[0.0, 1.0, 1.0]"
3,151,3012,2,hypothyroid,DecisionTreeClassifier,"{'max_depth': 474, 'random_state': 5}",0.987038,0.861228,0.99336,0.999684,...,0.999668,0.999998,4,4,3.5,3.5,"[0.0, 0.006640106241699867, 0.0079681274900398...","[0.0, 0.8612280701754386, 0.8613512055539919, ...","[0.0, 0.0, 0.0006640106241699867, 1.0]","[0.0, 0.993421052631579, 1.0, 1.0]"
4,1783,326,7,KC1,XGBClassifier,"{'random_state': 5, 'max_depth': 4}",0.863426,0.965795,0.303225,0.9789,...,0.949898,0.983133,174,181,63.571429,60.571429,"[0.0, 0.0, 0.02127659574468085, 0.021276595744...","[0.0, 0.27932684884977615, 0.2837936246705082,...","[0.0, 0.0, 0.0035714285714285713, 0.0035714285...","[0.0, 0.3798920814392207, 0.40171949684907193,..."


In [25]:
# Inspect the first record field by field.
data.iloc[0]

p                                                                  329
n                                                                 1979
k                                                                    8
dataset                                                       segment0
classifier                                      RandomForestClassifier
classifier_params                {'max_depth': 302, 'random_state': 5}
best_acc                                                      0.998698
best_sens                                                     0.990854
best_spec                                                          1.0
best_acc_train                                                     1.0
best_sens_train                                                    1.0
best_spec_train                                                    1.0
acc                                                           0.991768
sens                                                          0.996951
spec  

In [26]:
# Write the collected results to `output_file` as CSV, without the row index.
data.to_csv(output_file, index=False)