In [7]:
import common_datasets.binary_classification as binclas
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

from mlscorecheck.auc import simplify_roc

from common_datasets.binary_classification import summary_pdf

import numpy as np
import pandas as pd

import tqdm

from config import generate_random_classifier, datasets

In [8]:
# Number of experiment records to collect; the larger 20k setting is kept below for reference.
#N_SAMPLES = 20_000
N_SAMPLES = 2400
# CSV file the collected records are written to.
# NOTE(review): the name still says "20k" although N_SAMPLES is 2400 — confirm this is intended.
output_file = 'raw-single-20k-b.csv'

In [9]:
# Number of dataset loaders configured for this experiment.
len(datasets)

3

In [10]:
# Display name of every configured dataset; each loader is called once to read its 'name' entry.
names = [loader()['name'] for loader in datasets]

In [11]:
# Summary rows of the datasets used in this experiment, in the order they appear in summary_pdf.
summary_rows = summary_pdf[summary_pdf['name'].isin(names)].reset_index(drop=True)

# Build the table in one step from the fresh selection. Assigning new columns into a
# column-subset of a frame can trigger SettingWithCopyWarning on pandas versions without
# copy-on-write, and a row-wise apply(axis=1) does not return a Series when nothing matches.
tmp = pd.DataFrame({
    # dataset name followed by its LaTeX citation command
    'name': [
        f'{name} \\cite{{{key}}}'
        for name, key in zip(summary_rows['name'], summary_rows['citation_key'])
    ],
    'size': summary_rows['n'],
    'attr.': summary_rows['n_col'],
    'p': summary_rows['n_minority'],
    # remaining samples once the minority class is taken out of the total
    'n': summary_rows['n'] - summary_rows['n_minority'],
    'imb. ratio': summary_rows['imbalance_ratio'],
})

In [12]:
# Number the table rows starting from 1 instead of 0.
tmp.index = list(range(1, len(tmp) + 1))

# Render with two-decimal floats; underscores are swapped for spaces in the whole LaTeX string.
latex_table = tmp.to_latex(float_format="%.2f")
print(latex_table.replace('_', ' '))

\begin{tabular}{llrrrrr}
\toprule
 & name & size & attr. & p & n & imb. ratio \\
\midrule
1 & hypothyroid \cite{krnn} & 3163 & 25 & 151 & 3012 & 19.95 \\
2 & KC1 \cite{krnn} & 2109 & 21 & 326 & 1783 & 5.47 \\
3 & segment0 \cite{keel} & 2308 & 19 & 329 & 1979 & 6.02 \\
\bottomrule
\end{tabular}



In [13]:
def acc_sens_spec_at_th(y_test, y_pred, th):
    """
    Accuracy, sensitivity and specificity of scores binarized at a threshold.

    A sample counts as predicted positive when its score is >= ``th``.

    Args:
        y_test: array of true labels (1 marks positives, 0 marks negatives)
        y_pred: array of scores aligned with ``y_test``
        th: decision threshold

    Returns:
        tuple: (accuracy, sensitivity, specificity)
    """
    is_positive = y_test == 1
    is_negative = y_test == 0

    n_positive = np.sum(y_test)
    n_negative = len(y_test) - n_positive

    # correctly accepted positives and correctly rejected negatives
    hits = np.sum(is_positive & (y_pred >= th))
    rejections = np.sum(is_negative & (y_pred < th))

    accuracy = (hits + rejections) / (n_positive + n_negative)
    sensitivity = hits / n_positive
    specificity = rejections / n_negative

    return accuracy, sensitivity, specificity

In [14]:
def calculate_scores(y_test, y_pred, y_train, random_state, label=''):
    """
    Scores at the training-prevalence threshold and at an accuracy-maximizing threshold.

    Args:
        y_test: true labels of the evaluated set
        y_pred: scores of the evaluated set
        y_train: training labels; sum / length of these is the default threshold
        random_state: object whose ``choice`` method breaks ties between
            equally accurate thresholds
        label: suffix appended to every key of the returned dict

    Returns:
        dict: ``acc``, ``sens``, ``spec`` at the default threshold and
        ``best_acc``, ``best_sens``, ``best_spec`` at the selected best
        threshold, each key suffixed with ``label``
    """
    prevalence_th = np.sum(y_train) / len(y_train)
    at_prevalence = acc_sens_spec_at_th(y_test, y_pred, prevalence_th)

    # candidate thresholds: every distinct score plus the two extremes
    candidates = np.hstack([np.unique(y_pred), np.array([-np.inf, np.inf])])
    accuracies = [acc_sens_spec_at_th(y_test, y_pred, th)[0] for th in candidates]

    # keep every threshold that reaches the top accuracy, in candidate order
    top_accuracy = max(accuracies)
    best_ths = [th for th, acc_value in zip(candidates, accuracies)
                if acc_value == top_accuracy]

    # ties are resolved by a random draw from the supplied random state
    chosen_th = random_state.choice(best_ths)
    at_best = acc_sens_spec_at_th(y_test, y_pred, chosen_th)

    return {
        f'acc{label}': at_prevalence[0],
        f'sens{label}': at_prevalence[1],
        f'spec{label}': at_prevalence[2],
        f'best_acc{label}': at_best[0],
        f'best_sens{label}': at_best[1],
        f'best_spec{label}': at_best[2]
    }

In [15]:
# Collect N_SAMPLES experiment records. Each iteration picks a dataset, optionally swaps
# the class labels, fits a randomly configured classifier on a stratified 80/20 split and
# records AUC, ROC nodes and threshold-based scores for both the test and the training set.
#
# One seeded RandomState drives every random decision below (dataset choice, label swap,
# split, classifier generation, tie-breaking in calculate_scores), so the order of the
# calls that consume it determines the generated records.
random_state = np.random.RandomState(5)
# number of iterations discarded because the test AUC was below 0.5
dropped = 0
results = []

with tqdm.tqdm(total=N_SAMPLES) as bar:
    # loop until enough records are kept; skipped iterations do not advance the bar
    while len(results) < N_SAMPLES:
        record = {}

        # each entry of `datasets` is a callable returning a dict with 'data', 'target' and 'name'
        loader = random_state.choice(datasets)
        dataset = loader()

        X = dataset['data']
        y = dataset['target']
        name = dataset['name']

        record['dataset'] = name

        # with probability 1/2 swap the roles of the two classes
        # (assumes y is a 0/1 array — TODO confirm)
        if random_state.randint(2) == 0:
            y = 1 - y

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)

        # the scaler is fitted on the training part only and then applied to the test part
        ss = StandardScaler()
        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)

        # `classifier` is indexed as (classifier class, keyword arguments)
        classifier = generate_random_classifier(random_state, p=np.sum(y_train), n=np.sum(1 - y_train))

        bar.set_description("%25s %20s" % (classifier[0].__name__, dataset['name']))
        bar.refresh()

        record['classifier'] = classifier[0].__name__
        record['classifier_params'] = str(classifier[1])

        classifier_obj = classifier[0](**classifier[1])

        classifier_obj.fit(X_train, y_train)

        # an SVC whose fit_status_ is 1 is treated as a failed fit and skipped;
        # this skip is not counted in `dropped`
        if classifier[0].__name__ == 'SVC' and classifier_obj.fit_status_ == 1:
            print('failed SVC')
            continue

        # class counts of the test and training parts
        record = record | {
            'p': np.sum(y_test), 
            'n': len(y_test) - np.sum(y_test), 
            'p_train': np.sum(y_train), 
            'n_train': len(y_train) - np.sum(y_train)
        }

        # scores of the class in column 1 of predict_proba
        y_pred = classifier_obj.predict_proba(X_test)[:, 1]
        y_pred_train = classifier_obj.predict_proba(X_train)[:, 1]

        auc = roc_auc_score(y_test, y_pred)
        auc_train = roc_auc_score(y_train, y_pred_train)

        record = record | {'auc': auc, 'auc_train': auc_train}

        fpr, tpr, thresholds = simplify_roc(*roc_curve(y_test, y_pred))
        fpr_train, tpr_train, thresholds_train = simplify_roc(*roc_curve(y_train, y_pred_train))

        # ROC nodes are stored as stringified lists, presumably so each fits into one CSV cell
        record = record | {
            'fprs': str(fpr.tolist()), 
            'tprs': str(tpr.tolist()),
            'thresholds': str(thresholds.tolist()),
            'n_nodes': len(fpr),
            'fprs_train': str(fpr_train.tolist()), 
            'tprs_train': str(tpr_train.tolist()),
            'thresholds_train': str(thresholds_train.tolist()),
            'n_nodes_train': len(fpr_train)
            }

        # records with a test AUC below 0.5 are discarded and counted
        # NOTE(review): the ROC curves above are computed before this check, so that work
        # is thrown away for dropped records
        if auc < 0.5:
            dropped += 1
            continue

        # both calls use y_train for the default threshold; the second evaluates the training set
        record = record | calculate_scores(y_test, y_pred, y_train, random_state)
        record = record | calculate_scores(y_train, y_pred_train, y_train, random_state, '_train')

        results.append(record)
        bar.update(1)


   RandomForestClassifier             segment0: 100%|██████████| 2400/2400 [12:42<00:00,  3.15it/s]


In [16]:
# Count of iterations discarded because the test AUC was below 0.5.
dropped

0

In [17]:
# One row per collected record; the columns follow the key order of the record dicts.
data = pd.DataFrame(results)

In [18]:
# Inspect a single record (the second row) to check the stored fields.
data.iloc[1]

dataset                                                            KC1
classifier                                        KNeighborsClassifier
classifier_params                                 {'n_neighbors': 484}
p                                                                   65
n                                                                  357
p_train                                                            261
n_train                                                           1426
auc                                                           0.775738
auc_train                                                     0.799978
fprs                 [0.0, 0.0, 0.011204481792717087, 0.01680672268...
tprs                 [0.0, 0.046153846153846156, 0.0461538461538461...
thresholds           [inf, 0.3574380165289256, 0.34710743801652894,...
n_nodes                                                             80
fprs_train           [0.0, 0.0, 0.0021037868162692847, 0.0035063113...
tprs_t

In [19]:
# Preview the first rows of the result table.
data.head()

Unnamed: 0,dataset,classifier,classifier_params,p,n,p_train,n_train,auc,auc_train,fprs,...,spec,best_acc,best_sens,best_spec,acc_train,sens_train,spec_train,best_acc_train,best_sens_train,best_spec_train
0,segment0,SVC,"{'probability': True, 'C': 0.16616016339367984...",66,396,263,1583,0.999847,0.999536,"[0.0, 0.0, 0.010101010101010102, 0.01010101010...",...,0.994949,0.997835,0.984848,1.0,0.991874,0.984791,0.993051,0.995125,0.969582,0.999368
1,KC1,KNeighborsClassifier,{'n_neighbors': 484},65,357,261,1426,0.775738,0.799978,"[0.0, 0.0, 0.011204481792717087, 0.01680672268...",...,0.736695,0.853081,0.046154,1.0,0.73029,0.685824,0.738429,0.857143,0.206897,0.976157
2,segment0,RandomForestClassifier,"{'max_depth': 1, 'random_state': 5}",396,66,1583,263,0.982266,0.987365,"[0.0, 0.0, 0.015151515151515152, 0.01515151515...",...,1.0,0.980519,0.989899,0.924242,0.578007,0.507896,1.0,0.985915,0.993051,0.942966
3,hypothyroid,XGBClassifier,"{'random_state': 5, 'max_depth': 5}",30,603,121,2409,0.995412,0.999995,"[0.0, 0.0, 0.001658374792703151, 0.00165837479...",...,0.991708,0.995261,0.966667,0.996683,0.992885,1.0,0.992528,0.999605,1.0,0.999585
4,KC1,KNeighborsClassifier,{'n_neighbors': 8},357,65,1426,261,0.75251,0.880647,"[0.0, 0.16923076923076924, 0.4461538461538462,...",...,0.553846,0.85782,0.997199,0.092308,0.815056,0.820477,0.785441,0.866034,0.950912,0.402299


In [20]:
# Write the records to output_file as CSV, without the row index.
data.to_csv(output_file, index=False)