In [2]:
! pip3 install  hyperopt

Defaulting to user installation because normal site-packages is not writeable
Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m[31m8.6 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting future
  Using cached future-0.18.2-py3-none-any.whl
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 KB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: py4j, future, hyperopt
Successfully installed future-0.18.2 hyperopt-0.2.7 py4j-0.10.9.7


In [25]:
import pickle as pkl
import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
RANDOM_SEED: int = 42
np.random.seed(RANDOM_SEED)

In [16]:
# Loading train/test datasets and results df
ds_arr = []
paths = ['data/X_train.pkl', 'data/X_test.pkl', 'data/y_train.pkl', 'data/y_test.pkl']
results_df_path = 'model_cmp.csv'


for pth in paths:
    with open(pth, 'rb') as f:
        ds_arr.append(pkl.load(f))
        
        
X_train, X_test, y_train, y_test = ds_arr
results = pd.read_csv(results_df_path)
results = results.set_index('Unnamed: 0')


In [21]:
baseline_folds = results.loc['ExtraTreesClassifier']
baseline_folds

fold_1    0.845070
fold_2    0.863309
fold_3    0.857143
fold_4    0.853333
fold_5    0.853147
Name: ExtraTreesClassifier, dtype: float64

In [26]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [44]:
def objective_cv(params):
    f1_scores = []
    for train_index, test_index in skf.split(X_train, y_train):
        X_train_cv, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        model = ExtraTreesClassifier(random_state=42, n_jobs=-1, **params).fit(X_train_cv, y_train_cv)
        score = f1_score(y_val, model.predict(X_val))
        f1_scores.append(score)
    return {'loss': -np.mean(f1_scores), 'status': STATUS_OK}

In [45]:
def objective_fast(params):
    X_train_cv, X_val, y_train_cv, y_val =  train_test_split(X_train, y_train, test_size=.2, random_state=RANDOM_SEED)
    model = ExtraTreesClassifier(random_state=42, n_jobs=-1, **params).fit(X_train_cv, y_train_cv)
    score = f1_score(y_val, model.predict(X_val))
    return {'loss': score, 'status': STATUS_OK}

In [41]:
best_params = fmin(
  fn=objective,
  space=search_space,
  algo=algorithm,
  max_evals=10)
best_params

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [12:44<00:00, 76.43s/trial, best loss: -0.3004532200419838]


{'criterion': 2,
 'max_depth': 13,
 'max_features': 1,
 'min_samples_leaf': 4,
 'min_samples_split': 0.04176960300765731,
 'n_estimators': 908}

In [47]:
space_eval(search_space, best_params)

{'criterion': 'log_loss',
 'max_depth': 13,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'min_samples_split': 0.04176960300765731,
 'n_estimators': 908}

In [48]:
from hyperopt import rand

In [40]:
search_space={'n_estimators':hp.randint('n_estimators',200,1000),
              'max_depth': hp.randint('max_depth',10,200),           
              'min_samples_split':hp.uniform('min_samples_split',0,1),   
             'min_samples_leaf':hp.randint('min_samples_leaf',1,10),
              'criterion':hp.choice('criterion', ['gini','entropy', 'log_loss']),
                'max_features':hp.choice('max_features',['sqrt', 'log2']) }
algorithm=tpe.suggest

In [53]:
def run_experiment(search_space, budget:int = 10, use_cv: bool =True, method: str = 'random') -> float:
    if use_cv:
        objective = objective_cv
    else:
        objective = objective_fast
    
    if method == 'random':
        method = rand.suggest
    elif method == 'tpe':
        method = tpe.suggest
    
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=method,
        max_evals=budget)
    
    return space_eval(search_space, best_params), objective(space_eval(search_space, best_params))
    
    

In [55]:
run_experiment(search_space, 5, use_cv=False, method='tpe')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:35<00:00, 19.10s/trial, best loss: 0.05714285714285715]


({'criterion': 'log_loss',
  'max_depth': 121,
  'max_features': 'log2',
  'min_samples_leaf': 7,
  'min_samples_split': 0.3344253644309838,
  'n_estimators': 546},
 {'loss': 0.05714285714285715, 'status': 'ok'})