In [1]:
! pip3 install  hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 6.0 MB/s eta 0:00:01
[?25hCollecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 31.4 MB/s eta 0:00:01
Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7


In [1]:
import pickle as pkl
import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
RANDOM_SEED: int = 42
np.random.seed(RANDOM_SEED)

In [51]:
# Loading train/test datasets and results df
results_df_path = 'model_cmp.csv'
df = pd.read_csv('data/creditcard.csv')
X,y = df.drop('Class', axis=1), df.Class
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED, test_size=.2, stratify=y)


In [52]:
clf_baseline = ExtraTreesClassifier(n_jobs=-1).fit(X_train, y_train)
y_pred_baseline = clf_baseline.predict(X_test)
f1_baseline = f1_score(y_test, y_pred_baseline)
f1_baseline

0.8913043478260869

In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [6]:
def objective_cv(params):
    f1_scores = []
    for train_index, test_index in skf.split(X_train, y_train):
        X_train_cv, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        model = ExtraTreesClassifier(random_state=42, n_jobs=-1, **params).fit(X_train_cv, y_train_cv)
        score = f1_score(y_val, model.predict(X_val))
        f1_scores.append(score)
    return {'loss': -np.mean(f1_scores), 'status': STATUS_OK}

In [7]:
def objective_fast(params):
    X_train_cv, X_val, y_train_cv, y_val =  train_test_split(X_train, y_train, test_size=.2, random_state=RANDOM_SEED)
    model = ExtraTreesClassifier(random_state=42, n_jobs=-1, **params).fit(X_train_cv, y_train_cv)
    score = f1_score(y_val, model.predict(X_val))
    return {'loss': -score, 'status': STATUS_OK}

In [8]:
from hyperopt import rand

In [9]:
def run_experiment(search_space, budget:int = 10, use_cv: bool =True, method: str = 'random') -> float:
    if use_cv:
        objective = objective_cv
    else:
        objective = objective_fast
    
    if method == 'random':
        method = rand.suggest
    elif method == 'tpe':
        method = tpe.suggest
    
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=method,
        max_evals=budget)
    
    return space_eval(search_space, best_params), objective(space_eval(search_space, best_params))
    
    

In [18]:
search_space={
    'max_depth': hp.randint('max_depth', 1, 200),           
    'min_samples_split':hp.randint('min_samples_split', 2, 5),   
    'min_samples_leaf':hp.randint('min_samples_leaf', 1, 20),
    'criterion':hp.choice('criterion', ['gini', 'entropy']),
    'max_features':hp.choice('max_features', ['sqrt', 'log2', 0.9]),
    'bootstrap':hp.choice('bootstrap', [True, False]),
    "class_weight": hp.choice('class_weight', ['balanced', 'balanced_subsample'])
}
algorithm=tpe.suggest

In [133]:
params, _ = run_experiment(search_space, 5, use_cv=False, method='tpe')
params

100%|██████████| 5/5 [00:18<00:00,  3.68s/trial, best loss: -0.75]              


{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 191,
 'max_features': 0.9,
 'min_samples_leaf': 19,
 'min_samples_split': 3}

In [134]:
clf = ExtraTreesClassifier(**params, random_state=42, n_jobs=-1).fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred)
f1

0.8148148148148148

In [58]:
def optmimize(datasets, f1_baseline, budget_list, opt_method_list, use_cv=True, early_stopping=True):
    X_train, X_test, y_train, y_test = datasets
    final_params = {}
    final_scores = {}
    try:
        for budget in budget_list:
            for opt_method in opt_method_list:
                print((budget, opt_method))
                # tune model
                params, _ = run_experiment(search_space, budget, use_cv=use_cv, method=opt_method)
                final_params[(budget, opt_method)] = params

                # evaluate model
                clf = ExtraTreesClassifier(random_state=42, **params, n_jobs=-1).fit(X_train, y_train)
                y_pred_opt = clf.predict(X_test)
                f1 = f1_score(y_test, y_pred_opt)
                final_scores[(budget, opt_method)] = f1
                print(f"F1-score: {f1}")

                # early stopping condition, because we are looking for the smallest time budget
                if early_stopping and f1 > f1_baseline:
                    A = ((y_pred_baseline == y_test) & (y_pred_opt == y_test)).sum()
                    B = ((y_pred_baseline != y_test) & (y_pred_opt == y_test)).sum()
                    C = ((y_pred_baseline == y_test) & (y_pred_opt != y_test)).sum()
                    D = ((y_pred_baseline != y_test) & (y_pred_opt != y_test)).sum()
                    result = mcnemar([[A, B], [C, D]])
                    alpha = 0.05
                    if B + C > 20 and result.pvalue < alpha:
                        print(f"Model with time budget {budget} and {opt_method} optimization algo beat the baseline!")
                        return (budget, opt_method), final_params, final_scores
                    else:
                        print("F1 is better, but not statistically")
    except KeyboardInterrupt:
        print("Interrupted.")
        
    print("No model outperformed the baseline")
    return None, final_params, final_scores

In [29]:
final_params = {}
final_scores = {}

In [53]:
datasets = X_train, X_test, y_train, y_test
best, params, scores = optmimize(
    datasets, f1_baseline, budget_list=[10, 20, 50, 100, 200, 500, 1000, 2000, 10000], 
    opt_method_list=['tpe'], use_cv=False
)
final_params.update(params)
final_scores.update(scores)

(10, 'tpe')
100%|██████████| 10/10 [00:55<00:00,  5.55s/trial, best loss: -0.7681159420289856]
F1-score: 0.8431372549019608
(20, 'tpe')
100%|██████████| 20/20 [02:32<00:00,  7.61s/trial, best loss: -0.7999999999999999]
F1-score: 0.888888888888889
(50, 'tpe')
100%|██████████| 50/50 [06:05<00:00,  7.31s/trial, best loss: -0.819672131147541] 
F1-score: 0.88268156424581
(100, 'tpe')
 15%|█▌        | 15/100 [01:32<08:44,  6.17s/trial, best loss: -0.7851851851851852]
Interrupted.
No model outperformed the baseline


In [57]:
datasets = X_train, X_test, y_train, y_test
best, params, scores = optmimize(
    datasets, f1_baseline, budget_list=[100, 200, 500, 1000, 2000, 10000], 
    opt_method_list=['tpe'], use_cv=False
)
final_params.update(params)
final_scores.update(scores)

(100, 'tpe')
100%|██████████| 100/100 [15:33<00:00,  9.33s/trial, best loss: -0.819672131147541]
F1-score: 0.88268156424581
(200, 'tpe')
100%|██████████| 200/200 [34:48<00:00, 10.44s/trial, best loss: -0.8153846153846154]
F1-score: 0.8936170212765957
F1 is better, but not statistically
(500, 'tpe')
 45%|████▌     | 225/500 [40:08<49:04, 10.71s/trial, best loss: -0.8153846153846154]  
Interrupted.
No model outperformed the baseline


In [65]:
datasets = X_train, X_test, y_train, y_test
best, params, scores = optmimize(
    datasets, f1_baseline, budget_list=[500, 1000, 2000, 10000], 
    opt_method_list=['tpe'], use_cv=False
)
final_params.update(params)
final_scores.update(scores)

(500, 'tpe')
100%|██████████| 500/500 [1:30:26<00:00, 10.85s/trial, best loss: -0.8292682926829268]
F1-score: 0.8777777777777778
(1000, 'tpe')
100%|██████████| 1000/1000 [2:33:06<00:00,  9.19s/trial, best loss: -0.8292682926829268] 
F1-score: 0.8764044943820225
(2000, 'tpe')
 60%|█████▉    | 1198/2000 [3:18:57<2:13:11,  9.96s/trial, best loss: -0.8153846153846154]
Interrupted.
No model outperformed the baseline


In [66]:
final_params, final_scores

({(10, 'tpe'): {'bootstrap': False,
   'class_weight': 'balanced_subsample',
   'criterion': 'gini',
   'max_depth': 133,
   'max_features': 0.9,
   'min_samples_leaf': 6,
   'min_samples_split': 3},
  (20, 'tpe'): {'bootstrap': False,
   'class_weight': 'balanced_subsample',
   'criterion': 'entropy',
   'max_depth': 59,
   'max_features': 0.9,
   'min_samples_leaf': 3,
   'min_samples_split': 3},
  (50, 'tpe'): {'bootstrap': False,
   'class_weight': 'balanced_subsample',
   'criterion': 'entropy',
   'max_depth': 26,
   'max_features': 0.9,
   'min_samples_leaf': 1,
   'min_samples_split': 2},
  (100, 'tpe'): {'bootstrap': False,
   'class_weight': 'balanced_subsample',
   'criterion': 'entropy',
   'max_depth': 86,
   'max_features': 0.9,
   'min_samples_leaf': 1,
   'min_samples_split': 2},
  (200, 'tpe'): {'bootstrap': False,
   'class_weight': 'balanced',
   'criterion': 'gini',
   'max_depth': 129,
   'max_features': 0.9,
   'min_samples_leaf': 3,
   'min_samples_split': 2},
  

In [67]:
params = final_params[(200, 'tpe')]

clf = ExtraTreesClassifier(**params, random_state=42, n_jobs=-1).fit(X_train, y_train)
y_pred_opt = clf.predict(X_test)
metric = f1_score(y_test, y_pred_opt)
metric

0.8936170212765957

In [38]:
f1_baseline

0.8913043478260869

In [60]:
from statsmodels.stats.contingency_tables import mcnemar

In [81]:
A = ((y_pred_baseline == y_test) & (y_pred_opt == y_test)).sum()
B = ((y_pred_baseline != y_test) & (y_pred_opt == y_test)).sum()
C = ((y_pred_baseline == y_test) & (y_pred_opt != y_test)).sum()
D = ((y_pred_baseline != y_test) & (y_pred_opt != y_test)).sum()

contingency_table_df=pd.DataFrame(data={"nr_correct_clf1":["Yes/Yes","No/Yes"], "nr_incorrect_cl1":["Yes/No","No/No"]}, index=["nr_correct_clf2","nr_incorrect_clf2"])
contingency_table_df.iloc[0,0]=A
contingency_table_df.iloc[0,1]=B
contingency_table_df.iloc[1,0]=C
contingency_table_df.iloc[1,1]=D
contingency_table_df

Unnamed: 0,nr_correct_clf1,nr_incorrect_cl1
nr_correct_clf2,56940,2
nr_incorrect_clf2,2,18


In [82]:
B + C

4

After calculating the Contingency Table, we need to define your hypothesis:
*   H0: both models have the same performance
*   H1: performances of the two models are not equal

But we can't use McNemar

Lets compare on crossvalidation

In [77]:
from tqdm import tqdm

params_list = [final_params[(20, 'tpe')], final_params[(200, 'tpe')], {}]
N_FOLDS: int = 5
    
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

results = np.zeros((len(params_list), N_FOLDS))
cur_fold = 0
np.random.seed(RANDOM_SEED)

for train_index, test_index in skf.split(X, y):
    X_train_cv, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train_cv, y_val = y.iloc[train_index], y.iloc[test_index]
    
    for i, params in tqdm(enumerate(params_list)):
        clf = ExtraTreesClassifier(**params, random_state=42, n_jobs=-1).fit(X_train_cv, y_train_cv)
        score = f1_score(y_val, clf.predict(X_val))
        results[i, cur_fold] = score
    cur_fold += 1

3it [00:37, 12.63s/it]
3it [00:36, 12.18s/it]
3it [00:33, 11.17s/it]
3it [00:35, 11.93s/it]
3it [00:35, 11.89s/it]


In [78]:
results = pd.DataFrame(results)
results

Unnamed: 0,0,1,2,3,4
0,0.808511,0.896175,0.891304,0.864865,0.87234
1,0.814815,0.908108,0.896175,0.864865,0.877005
2,0.839779,0.88764,0.870056,0.853933,0.861878


In [79]:
ranks = pd.DataFrame()
ranks['r1'] = results.loc[:, 0].rank(ascending=False)
ranks['r2'] = results.loc[:, 1].rank(ascending=False)
ranks['r3'] = results.loc[:, 2].rank(ascending=False)
ranks['r4'] = results.loc[:, 3].rank(ascending=False)
ranks['r5'] = results.loc[:, 4].rank(ascending=False)
ranks['mean'] = ranks.mean(axis=1)
ranks

Unnamed: 0,r1,r2,r3,r4,r5,mean
0,3.0,2.0,2.0,1.5,2.0,2.1
1,2.0,1.0,1.0,1.5,1.0,1.3
2,1.0,3.0,3.0,3.0,3.0,2.6


As we see the second configuration is better than others including baseline

In [80]:
final_params[(200, 'tpe')]

{'bootstrap': False,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 129,
 'max_features': 0.9,
 'min_samples_leaf': 3,
 'min_samples_split': 2}