In [59]:
import common_datasets.binary_classification as binclas
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

from mlscorecheck.auc import simplify_roc

from common_datasets.binary_classification import summary_pdf

import numpy as np
import pandas as pd

import tqdm

from config import generate_random_classifier, datasets

In [60]:
# Number of accepted experiment records the sampling loop collects before it stops.
N_SAMPLES = 50_000
#N_SAMPLES = 2400
# CSV file the collected records are written to in the last cell of the notebook.
output_file = 'raw-single-50k-rs5-syn.csv'

In [61]:
# Number of dataset loaders exposed by the local `config` module.
len(datasets)

31

In [62]:
# Every entry of `datasets` is a loader: call it and keep only the dataset's name.
names = [load_dataset()['name'] for load_dataset in datasets]

In [63]:
# Summary rows of the datasets used in this experiment, renumbered from zero.
tmp = summary_pdf.loc[summary_pdf['name'].isin(names)].reset_index(drop=True)

# Build the publication table in one step: the name carries its LaTeX citation,
# `p` is the minority-class count and `n` the remaining (majority) count.
tmp = pd.DataFrame({
    'name': [
        f'{dataset_name} \\cite{{{cite_key}}}'
        for dataset_name, cite_key in zip(tmp['name'], tmp['citation_key'])
    ],
    'size': tmp['n'],
    'attr.': tmp['n_col'],
    'p': tmp['n_minority'],
    'n': tmp['n'] - tmp['n_minority'],
    'imb. ratio': tmp['imbalance_ratio'],
})

In [64]:
# Number the rows from 1 for the printed table.
tmp.index = list(range(1, len(tmp) + 1))

# Emit the LaTeX source; underscores are replaced by spaces in the whole output.
latex_table = tmp.to_latex(float_format="%.2f")
print(latex_table.replace('_', ' '))

\begin{tabular}{llrrrrr}
\toprule
 & name & size & attr. & p & n & imb. ratio \\
\midrule
1 & abalone9 18 \cite{keel} & 731 & 9 & 42 & 689 & 16.40 \\
2 & appendicitis \cite{keel} & 106 & 7 & 21 & 85 & 4.05 \\
3 & australian \cite{keel} & 690 & 16 & 307 & 383 & 1.25 \\
4 & bupa \cite{keel} & 345 & 6 & 145 & 200 & 1.38 \\
5 & CM1 \cite{krnn} & 498 & 21 & 49 & 449 & 9.16 \\
6 & crx \cite{keel} & 653 & 37 & 296 & 357 & 1.21 \\
7 & dermatology-6 \cite{keel} & 358 & 34 & 20 & 338 & 16.90 \\
8 & ecoli1 \cite{keel} & 336 & 7 & 77 & 259 & 3.36 \\
9 & glass0 \cite{keel} & 214 & 9 & 70 & 144 & 2.06 \\
10 & haberman \cite{keel} & 306 & 3 & 81 & 225 & 2.78 \\
11 & hepatitis \cite{krnn} & 155 & 19 & 32 & 123 & 3.84 \\
12 & hypothyroid \cite{krnn} & 3163 & 25 & 151 & 3012 & 19.95 \\
13 & ionosphere \cite{keel} & 351 & 33 & 126 & 225 & 1.79 \\
14 & iris0 \cite{keel} & 150 & 4 & 50 & 100 & 2.00 \\
15 & KC1 \cite{krnn} & 2109 & 21 & 326 & 1783 & 5.47 \\
16 & mammographic \cite{keel} & 830 & 5 & 403 & 42

In [65]:
def acc_sens_spec_at_th(y_test, y_pred, th):
    """Accuracy, sensitivity and specificity of thresholded scores.

    A sample is predicted positive when its score is ``>= th`` and negative
    when its score is ``< th``.

    Parameters
    ----------
    y_test : numpy.ndarray
        Ground-truth labels, 1 for positive and 0 for negative.
    y_pred : numpy.ndarray
        Scores aligned with ``y_test``.
    th : float
        Decision threshold.

    Returns
    -------
    tuple
        ``(accuracy, sensitivity, specificity)``.
    """
    true_positives = np.sum((y_test == 1) & (y_pred >= th))
    true_negatives = np.sum((y_test == 0) & (y_pred < th))

    positives = np.sum(y_test)
    negatives = len(y_test) - positives

    accuracy = (true_positives + true_negatives) / (positives + negatives)
    sensitivity = true_positives / positives
    specificity = true_negatives / negatives

    return accuracy, sensitivity, specificity

In [66]:
def calculate_scores(y_test, y_pred, y_train, random_state, label=''):
    """Score predictions at the prevalence threshold and at the best-accuracy threshold.

    Two operating points are evaluated on ``(y_test, y_pred)``:

    * the threshold equal to the positive rate of ``y_train``;
    * the threshold that maximises accuracy on ``(y_test, y_pred)``. The
      candidates are the unique scores plus ``-inf`` and ``inf``, and ties
      are broken by a single ``random_state.choice`` draw.

    Parameters
    ----------
    y_test : numpy.ndarray
        Ground-truth labels (1 positive, 0 negative) of the evaluated samples.
    y_pred : numpy.ndarray
        Scores aligned with ``y_test``.
    y_train : numpy.ndarray
        Training labels; only their positive rate is used.
    random_state : numpy.random.RandomState
        Generator used to pick one of the tied best thresholds.
    label : str
        Suffix appended to every key of the returned dictionary.

    Returns
    -------
    dict
        Keys ``acc``, ``sens``, ``spec``, ``best_acc``, ``best_sens`` and
        ``best_spec``, each suffixed with ``label``.
    """
    prevalence_th = np.sum(y_train) / len(y_train)
    acc, sens, spec = acc_sens_spec_at_th(y_test, y_pred, prevalence_th)

    # First pass: accuracy of every candidate threshold, in candidate order.
    candidates = np.hstack([np.unique(y_pred), np.array([-np.inf, np.inf])])
    accuracies = [
        acc_sens_spec_at_th(y_test, y_pred, candidate)[0]
        for candidate in candidates
    ]

    # Second pass: keep every candidate reaching the top accuracy. The search
    # starts from 0, so thresholds with zero accuracy tie if nothing beats it.
    top_accuracy = max([0, *accuracies])
    tied_thresholds = [
        candidate
        for candidate, accuracy in zip(candidates, accuracies)
        if accuracy == top_accuracy
    ]

    best_th = random_state.choice(tied_thresholds)
    best_acc, best_sens, best_spec = acc_sens_spec_at_th(y_test, y_pred, best_th)

    names = ('acc', 'sens', 'spec', 'best_acc', 'best_sens', 'best_spec')
    values = (acc, sens, spec, best_acc, best_sens, best_spec)

    return {f'{name}{label}': value for name, value in zip(names, values)}

In [67]:
import sklearn.datasets
def synthetic_dataset(random_state):
    """Generate a random binary classification problem with ``make_classification``.

    The number of features, informative and redundant features, clusters per
    class, class balance and sample count are all drawn from ``random_state``.
    The same generator is then handed to ``make_classification``.

    Parameters
    ----------
    random_state : numpy.random.RandomState
        Source of every random draw; it is advanced by this call.

    Returns
    -------
    dict
        ``{'data': X, 'target': y, 'name': 'synthetic'}``.
    """
    n_features = random_state.randint(2, 20)
    n_informative = random_state.randint(1, n_features + 1)

    # Redundant features are drawn only if non-informative columns remain.
    spare_features = n_features - n_informative
    if spare_features > 0:
        n_redundant = random_state.randint(1, spare_features + 1)
    else:
        n_redundant = 0

    # At most 2 ** n_informative / 2 clusters per class (upper bound is exclusive).
    n_clusters_per_class = random_state.randint(1, 2 ** (n_informative - 1) + 1)

    # Share given to the first entry of `weights`, uniform in [0.1, 0.9).
    first_share = random_state.random_sample() * 0.8 + 0.1

    # Drawn last so the sequence of random draws stays the same.
    n_samples = random_state.randint(100, 2000)

    settings = dict(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=n_clusters_per_class,
        weights=(first_share, 1 - first_share),
        flip_y=0.01,
        class_sep=1.0,
        hypercube=True,
        shift=0.0,
        scale=1.0,
        shuffle=True,
        random_state=random_state,
    )
    features, labels = sklearn.datasets.make_classification(**settings)

    return {'data': features, 'target': labels, 'name': 'synthetic'}


In [68]:
# Main experiment: repeatedly generate a dataset, fit a randomly configured
# classifier and record its ROC/AUC and threshold-based scores, until
# N_SAMPLES records have been accepted.
#
# One seeded generator drives every random draw below, so the results depend
# on the exact order of the statements that consume it.
random_state = np.random.RandomState(5)
dropped = 0  # models discarded because their test AUC was below 0.5
results = []  # accepted records, one dict per fitted model

with tqdm.tqdm(total=N_SAMPLES) as bar:
    while len(results) < N_SAMPLES:
        record = {}

        # NOTE(review): `loader` is unused while the real-dataset path below is
        # commented out, but the draw still advances `random_state`; removing
        # it would change every subsequent result.
        loader = random_state.choice(datasets)
        #dataset = loader()
        dataset = synthetic_dataset(random_state)

        X = dataset['data']
        y = dataset['target']
        name = dataset['name']

        record['dataset'] = name

        # Swap the two class labels with probability 1/2.
        if random_state.randint(2) == 0:
            y = 1 - y

        # Stratified 80/20 train/test split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)

        # The scaler is fitted on the training part only and reused for the test part.
        ss = StandardScaler()
        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)

        # `classifier` is used as a pair: [0] is the estimator class and
        # [1] the keyword arguments it is constructed with.
        classifier = generate_random_classifier(random_state, p=np.sum(y_train), n=np.sum(1 - y_train))

        bar.set_description("%25s %20s" % (classifier[0].__name__, dataset['name']))
        bar.refresh()

        record['classifier'] = classifier[0].__name__
        record['classifier_params'] = str(classifier[1])

        classifier_obj = classifier[0](**classifier[1])

        classifier_obj.fit(X_train, y_train)

        # An SVC reporting fit_status_ == 1 is skipped; it is not counted in `dropped`.
        if classifier[0].__name__ == 'SVC' and classifier_obj.fit_status_ == 1:
            print('failed SVC')
            continue

        # Positive/negative counts of the test and training parts.
        record = record | {
            'p': np.sum(y_test), 
            'n': len(y_test) - np.sum(y_test), 
            'p_train': np.sum(y_train), 
            'n_train': len(y_train) - np.sum(y_train)
        }

        # Scores are the predicted probabilities of class 1.
        y_pred = classifier_obj.predict_proba(X_test)[:, 1]
        y_pred_train = classifier_obj.predict_proba(X_train)[:, 1]

        auc = roc_auc_score(y_test, y_pred)
        auc_train = roc_auc_score(y_train, y_pred_train)

        record = record | {'auc': auc, 'auc_train': auc_train}

        # ROC curves are passed through mlscorecheck's `simplify_roc` before storing.
        fpr, tpr, thresholds = simplify_roc(*roc_curve(y_test, y_pred))
        fpr_train, tpr_train, thresholds_train = simplify_roc(*roc_curve(y_train, y_pred_train))

        # Curve nodes are stored as stringified lists so each fits one CSV cell.
        record = record | {
            'fprs': str(fpr.tolist()), 
            'tprs': str(tpr.tolist()),
            'thresholds': str(thresholds.tolist()),
            'n_nodes': len(fpr),
            'fprs_train': str(fpr_train.tolist()), 
            'tprs_train': str(tpr_train.tolist()),
            'thresholds_train': str(thresholds_train.tolist()),
            'n_nodes_train': len(fpr_train)
            }

        # Models with a test AUC below 0.5 are counted and discarded; the
        # progress bar only advances for accepted records.
        if auc < 0.5:
            dropped += 1
            continue

        # Both score sets use the training prevalence as the fixed threshold;
        # the second call evaluates the training predictions ('_train' suffix).
        record = record | calculate_scores(y_test, y_pred, y_train, random_state)
        record = record | calculate_scores(y_train, y_pred_train, y_train, random_state, '_train')

        results.append(record)
        bar.update(1)


   DecisionTreeClassifier            synthetic: 100%|██████████| 50000/50000 [1:40:34<00:00,  8.29it/s]  


In [69]:
# Number of fitted models discarded by the sampling loop for a test AUC below 0.5.
dropped

6946

In [70]:
# One row per accepted record; the columns are the keys of the record dicts.
data = pd.DataFrame(results)

In [71]:
# Inspect one collected record (the second row) with all of its fields.
data.iloc[1]

dataset                                        synthetic
classifier                        DecisionTreeClassifier
classifier_params    {'max_depth': 1, 'random_state': 5}
p                                                     55
n                                                     14
p_train                                              217
n_train                                               57
auc                                                  1.0
auc_train                                            1.0
fprs                                     [0.0, 0.0, 1.0]
tprs                                     [0.0, 1.0, 1.0]
thresholds                               [inf, 1.0, 0.0]
n_nodes                                                3
fprs_train                               [0.0, 0.0, 1.0]
tprs_train                               [0.0, 1.0, 1.0]
thresholds_train                         [inf, 1.0, 0.0]
n_nodes_train                                          3
acc                            

In [72]:
# Preview the first rows of the results table.
data.head()

Unnamed: 0,dataset,classifier,classifier_params,p,n,p_train,n_train,auc,auc_train,fprs,...,spec,best_acc,best_sens,best_spec,acc_train,sens_train,spec_train,best_acc_train,best_sens_train,best_spec_train
0,synthetic,DecisionTreeClassifier,"{'max_depth': 6, 'random_state': 5}",152,153,610,609,0.513867,0.773231,"[0.0, 0.0457516339869281, 0.0718954248366013, ...",...,0.411765,0.521311,0.197368,0.843137,0.694011,0.777049,0.610837,0.694011,0.781967,0.605911
1,synthetic,DecisionTreeClassifier,"{'max_depth': 1, 'random_state': 5}",55,14,217,57,1.0,1.0,"[0.0, 0.0, 1.0]",...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,synthetic,KNeighborsClassifier,{'n_neighbors': 7},14,11,54,44,0.766234,0.768308,"[0.0, 0.09090909090909091, 0.09090909090909091...",...,0.636364,0.72,0.785714,0.636364,0.683673,0.944444,0.363636,0.714286,0.722222,0.704545
3,synthetic,XGBClassifier,"{'random_state': 5, 'max_depth': 2}",182,46,727,184,0.688605,0.954279,"[0.0, 0.0, 0.021739130434782608, 0.02173913043...",...,0.586957,0.811404,0.978022,0.152174,0.834248,0.811554,0.923913,0.919868,0.977992,0.690217
4,synthetic,RandomForestClassifier,"{'max_depth': 7, 'random_state': 5}",316,84,1263,335,0.809637,0.97173,"[0.0, 0.0, 0.011904761904761904, 0.01190476190...",...,0.690476,0.8525,0.96519,0.428571,0.889237,0.876485,0.937313,0.932416,0.973872,0.776119


In [73]:
# Write the results table to `output_file` as CSV, without the index column.
data.to_csv(output_file, index=False)