In [27]:
from mlscorecheck.auc import acc_from, auc_from
import common_datasets.binary_classification as binclas
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd

In [28]:
classifiers = [
    (RandomForestClassifier, {'max_depth': 5, 'random_state': 5}),
    (DecisionTreeClassifier, {'max_depth': 5, 'random_state': 5}),
    (SVC, {'probability': True, 'C': 0.1}),
    (KNeighborsClassifier, {'n_neighbors': 11})
]

In [29]:
datasets = binclas.get_filtered_data_loaders(n_col_bounds=(0, 50), n_bounds=(0, 1000), n_minority_bounds=(20, 1000), n_from_phenotypes=1, imbalance_ratio_bounds=(0.5, 2.0))

In [30]:
results = []

for loader in datasets:
    dataset = loader()
    X = dataset['data']
    y = dataset['target']
    name = dataset['name']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
    classifier = classifiers[np.random.randint(len(classifiers))]

    classifier_obj = classifier[0](**classifier[1])

    classifier_obj.fit(X_train, y_train)
    y_pred = classifier_obj.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_pred)

    threshold = np.random.random()

    tp = np.sum((y_pred >= threshold) & (y_test == 1))
    tn = np.sum((y_pred < threshold) & (y_test == 0))
    p = np.sum(y_test)
    n = len(y_test) - np.sum(y_test)

    acc = np.round((tp + tn) / (p + n), 4)
    sens = np.round((tp) / (p), 4)
    spec = np.round((tn) / (n), 4)

    scores = {
        'acc': acc,
        'sens': sens,
        'spec': spec
    }

    auc0_int = auc_from(scores=scores, eps=1e-4, p=p, n=n, lower='min', upper='max')
    auc0 = (auc0_int[0] + auc0_int[1]) / 2.0

    auc1_int = auc_from(scores=scores, eps=1e-4, p=p, n=n, lower='cmin', upper='max')
    auc1 = (auc1_int[0] + auc1_int[1]) / 2.0


    results.append((auc, acc, sens, spec, threshold, auc0_int, auc0, auc1_int, auc1))

In [32]:
data = pd.DataFrame(results, columns=['auc', 'acc', 'sens', 'spec', 'threshold', 'auc0_int', 'auc0', 'auc1_int', 'auc1'])

In [34]:
np.mean(np.abs((data['auc'] - data['auc0']))), np.mean(np.abs((data['auc'] - data['auc1']))),

(0.12551182373807993, 0.05224014298541613)

In [31]:
auc_from(
    scores={'sens': 0.95, 'spec': 0.95},
    eps=1e-4,
    p=500,
    n=1000,
    lower='cmin',
    upper='max-acc'
)

(0.9049999999999999, 0.9943974775)