In [21]:
import common_datasets.binary_classification as binclas
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from scipy.stats import wilcoxon

In [22]:
def generate_random_classifier(random_state):
    """
    Draw a random classifier class together with random hyperparameters.

    The classifier family is selected by ``random_state.randint(4)``; the
    family-specific hyperparameters are then drawn from the same generator.

    Args:
        random_state: random generator exposing ``randint`` and ``rand``
            (the notebook passes a ``np.random.RandomState``)

    Returns:
        tuple: ``(classifier_class, params)`` - the uninstantiated
        classifier class and the keyword arguments for its constructor
    """
    mode = random_state.randint(4)

    if mode == 0:
        classifier = RandomForestClassifier
        params = {'max_depth': random_state.randint(3, 10),
                  'random_state': 5}
    elif mode == 1:
        classifier = DecisionTreeClassifier
        params = {'max_depth': random_state.randint(3, 10),
                  'random_state': 5}
    elif mode == 2:
        classifier = SVC
        # probability=True makes SVC shuffle the data internally for its
        # probability estimates; without a fixed seed the scores differ
        # from run to run, so it is seeded like the tree-based models.
        params = {'probability': True,
                  'C': random_state.rand()*2 + 0.001,
                  'random_state': 5}
    elif mode == 3:
        classifier = KNeighborsClassifier
        params = {'n_neighbors': random_state.randint(1, 10)}

    return (classifier, params)

In [23]:
# Select the benchmark datasets. The result is a collection of loader
# callables; calling a loader returns a dict with (at least) the keys
# 'name', 'data' and 'target'.
# NOTE(review): the keyword names suggest bounds on the number of columns,
# records, minority records and the imbalance ratio - confirm the exact
# semantics (inclusive/exclusive) against the common_datasets docs.
datasets = binclas.get_filtered_data_loaders(
    n_col_bounds=(0, 50),
    n_bounds=(0, 2000),
    n_minority_bounds=(20, 1000),
    n_from_phenotypes=1,
    imbalance_ratio_bounds=(0.2, 20.0),
)

In [24]:
# Number of dataset loaders selected by the filter.
len(datasets)

28

In [25]:
# Names of the datasets listed in the summary table; datasets whose name
# starts with 'led' are left out. Every loader is called exactly once
# (the name is read from the loaded dataset and then filtered).
# NOTE(review): this only filters the *names*; the `datasets` list itself
# still contains the 'led*' loaders.
names = [
    name
    for name in (loader()['name'] for loader in datasets)
    if not name.startswith('led')
]

In [26]:
from common_datasets.binary_classification import summary_pdf

In [27]:
# Build the dataset summary table for the paper in one chain, so that no
# column is ever assigned onto a frame obtained by column selection (which
# can trigger pandas' SettingWithCopyWarning).
#
# Resulting columns:
#   name       - dataset name followed by its LaTeX \cite{...} key
#   size       - total number of records
#   attr.      - number of attributes
#   p          - number of minority records
#   n          - size - p
#   imb. ratio - imbalance ratio as given in summary_pdf
tmp = (
    summary_pdf
    .loc[summary_pdf['name'].isin(names),
         ['name', 'n_col', 'n', 'n_minority', 'imbalance_ratio', 'citation_key']]
    .reset_index(drop=True)
    .assign(
        name=lambda df: [
            f'{name} \\cite{{{key}}}'
            for name, key in zip(df['name'], df['citation_key'])
        ],
        n_majority=lambda df: df['n'] - df['n_minority'],
    )
    # the mapping is applied per label, so 'n' -> 'size' and
    # 'n_majority' -> 'n' do not interfere with each other
    .rename(columns={'n': 'size',
                     'n_col': 'attr.',
                     'n_minority': 'p',
                     'n_majority': 'n',
                     'imbalance_ratio': 'imb. ratio'})
    .loc[:, ['name', 'size', 'attr.', 'p', 'n', 'imb. ratio']]
)

In [28]:
# Number the rows from 1 for the printed table. The index is derived from
# the table length, so the cell keeps working when the dataset selection
# changes (a hard-coded upper bound fails with a length mismatch).
tmp.index = list(range(1, len(tmp) + 1))

# print() is intentional: the raw LaTeX source is what gets copied into the
# paper. Underscores are replaced by spaces because '_' is special in LaTeX.
print(tmp.to_latex(float_format="%.2f").replace('_', ' '))

\begin{tabular}{llrrrrr}
\toprule
 & name & size & attr. & p & n & imb. ratio \\
\midrule
1 & abalone9 18 \cite{keel} & 731 & 9 & 42 & 689 & 16.40 \\
2 & appendicitis \cite{keel} & 106 & 7 & 21 & 85 & 4.05 \\
3 & australian \cite{keel} & 690 & 16 & 307 & 383 & 1.25 \\
4 & bupa \cite{keel} & 345 & 6 & 145 & 200 & 1.38 \\
5 & CM1 \cite{krnn} & 498 & 21 & 49 & 449 & 9.16 \\
6 & crx \cite{keel} & 653 & 37 & 296 & 357 & 1.21 \\
7 & dermatology-6 \cite{keel} & 358 & 34 & 20 & 338 & 16.90 \\
8 & ecoli1 \cite{keel} & 336 & 7 & 77 & 259 & 3.36 \\
9 & glass0 \cite{keel} & 214 & 9 & 70 & 144 & 2.06 \\
10 & haberman \cite{keel} & 306 & 3 & 81 & 225 & 2.78 \\
11 & hepatitis \cite{krnn} & 155 & 19 & 32 & 123 & 3.84 \\
12 & ionosphere \cite{keel} & 351 & 33 & 126 & 225 & 1.79 \\
13 & iris0 \cite{keel} & 150 & 4 & 50 & 100 & 2.00 \\
14 & mammographic \cite{keel} & 830 & 5 & 403 & 427 & 1.06 \\
15 & monk-2 \cite{keel} & 432 & 6 & 204 & 228 & 1.12 \\
16 & new thyroid1 \cite{keel} & 215 & 5 & 35 & 180 & 

In [29]:
def _confusion_counts(y_score, y_true, th):
    """
    Count true positives and true negatives at a decision threshold.

    Scores greater than or equal to ``th`` are labelled positive.

    Args:
        y_score: array of predicted scores
        y_true: array of true labels
            (assumes 0/1 encoding with 1 = positive - TODO confirm)
        th: the decision threshold

    Returns:
        tuple: ``(tp, tn)`` counts
    """
    tp = np.sum((y_score >= th) & (y_true == 1))
    tn = np.sum((y_score < th) & (y_true == 0))
    return tp, tn


# Number of random (dataset, classifier, threshold) experiments to run.
n_experiments = 10000

results = []
random_state = np.random.RandomState(5)

# NOTE(review): sampling uses the full `datasets` list, whereas the summary
# table is built from `names`, which excludes the 'led*' datasets - confirm
# that this difference is intended.
for _ in range(n_experiments):
    loader = random_state.choice(datasets)
    dataset = loader()

    X = dataset['data']
    y = dataset['target']
    name = dataset['name']

    # The split seed is fixed, so the same dataset is always split the
    # same way; only the classifier and the threshold vary between draws.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
    classifier = generate_random_classifier(random_state)

    classifier_obj = classifier[0](**classifier[1])

    classifier_obj.fit(X_train, y_train)
    # score = predicted probability in column 1 of predict_proba
    y_pred = classifier_obj.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_pred)

    # Class counts of the test set do not depend on the threshold, so they
    # are computed once per experiment.
    p = np.sum(y_test)
    n = len(y_test) - p

    # Metrics at a randomly drawn threshold.
    threshold = random_state.random()

    tp, tn = _confusion_counts(y_pred, y_test, threshold)

    acc = (tp + tn) / (p + n)
    sens = tp / p
    spec = tn / n

    # Scan every distinct score plus -inf/+inf for the threshold with the
    # highest accuracy. The comparison is strict, so on ties the first
    # candidate in this order is kept.
    best_th = -1
    best_acc = 0
    for th in np.hstack([np.unique(y_pred), np.array([-np.inf, np.inf])]):
        tp, tn = _confusion_counts(y_pred, y_test, th)

        acc_tmp = (tp + tn) / (p + n)

        if acc_tmp > best_acc:
            best_acc = acc_tmp
            best_th = th

    # Metrics at the accuracy-maximizing threshold.
    tp, tn = _confusion_counts(y_pred, y_test, best_th)

    best_acc = (tp + tn) / (p + n)
    best_sens = tp / p
    best_spec = tn / n

    results.append((name, acc, sens, spec, auc, best_acc, best_sens, best_spec, threshold, best_th, p, n))

KeyboardInterrupt: 

In [10]:
# Turn the collected experiment tuples into a table, one row per experiment;
# the column order matches the order of the tuple fields.
data = pd.DataFrame(
    results,
    columns=[
        'dataset',
        'acc', 'sens', 'spec', 'auc',
        'best_acc', 'best_sens', 'best_spec',
        'threshold', 'best_threshold',
        'p', 'n',
    ],
)

In [11]:
# Persist the raw experiment results (without the row index) for later
# analysis.
data.to_csv(path_or_buf='raw-single.csv', index=False)