# Hyperparameter tuning with Hyperopt

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from gpalib import model

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [2]:
# Seed passed as `random_state` to every classifier in the search space below.
RANDOM_SEED = 42

In [3]:
# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/russia-16-19-v2.5.csv')
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a row index that was written to the CSV. Confirm whether it is dropped before
# training; if not, consider reading with `index_col=0`.
print(data.shape)
data.head()

(308273, 187)


Unnamed: 0,sup_running_cntr_num,sup_cntr_avg_price,sup_cntr_avg_penalty_share,sup_no_pnl_share,sup_sim_price_share,sup_good_cntr_share,sup_fed_cntr_share,sup_sub_cntr_share,sup_mun_cntr_share,org_cntr_num,...,socs_90,socs_91,socs_93,socs_94,socs_95,socs_96,socs_97,socs_98,socs_99,cntr_result
0,2.197225,11.950251,0.0,1,0.2,1.0,0.1,0.9,0.0,6.821107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,4.51086,10.899569,0.0,1,0.1,1.0,0.016667,0.5,0.483333,4.094345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,4.564348,11.342588,0.0,1,0.057,0.680851,0.0,0.269504,0.730496,5.117994,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.791759,14.949741,0.0,1,0.1,0.8,0.0,0.3,0.7,5.755742,...,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,1
4,1.609438,13.103967,0.0,1,0.231,0.923077,0.0,0.076923,0.923077,1.609438,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
import warnings
# Suppresses every warning category for the rest of the kernel session, which
# keeps the tuning output readable but also hides convergence and deprecation
# warnings raised by the estimators.
warnings.filterwarnings("ignore")

**Hyperopt tutorials:** 
- https://medium.com/district-data-labs/parameter-tuning-with-hyperopt-faa86acdfdce
- https://www.kaggle.com/yassinealouini/hyperopt-the-xgboost-model
- https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb

In [5]:
# History of every evaluated configuration: `params` holds the keyword
# arguments used to build each classifier and `score` the matching
# cross-validation result, in evaluation order.
classifiers = dict(params=[], score=[])

def hyperopt_train_test(params):
    """Build the classifier described by ``params`` and cross-validate it.

    Parameters
    ----------
    params : dict
        Must contain a ``'type'`` key equal to ``'LogReg'``, ``'RandForest'``
        or ``'XGBoost'``. Every other entry is forwarded as a keyword argument
        to the matching estimator constructor. The dict is not modified.

    Returns
    -------
    float
        Mean of ``score['test_neg_log_loss']`` as returned by
        ``model.cross_validate`` (higher is better).

    Raises
    ------
    KeyError
        If ``params`` has no ``'type'`` key.
    ValueError
        If ``params['type']`` is not one of the supported classifier types.

    Side effects
    ------------
    Appends the constructor arguments (without ``'type'``) and the raw score
    object to the module-level ``classifiers`` history.
    """
    # Work on a private copy so the caller's dict keeps its 'type' key.
    params = dict(params)
    clf_type = params.pop('type')

    if clf_type == 'LogReg':
        clf = LogisticRegression(**params)
    elif clf_type == 'RandForest':
        clf = RandomForestClassifier(**params)
    elif clf_type == 'XGBoost':
        clf = XGBClassifier(**params)
    else:
        # Fail early and explicitly instead of hitting an unbound `clf` below.
        raise ValueError('Unknown classifier type: {!r}'.format(clf_type))

    # NOTE(review): the meaning of the two empty-string arguments is defined by
    # gpalib's `model.Classifier` and is not visible here; confirm.
    clf_ = model.Classifier(clf, '', clf_type, '')

    # Evaluated on the module-level `data` frame with 2-fold cross-validation.
    score = model.cross_validate(clf_, data, ['neg_log_loss'], cv=2, silent=True, prefix='hp')

    classifiers['params'].append(params)
    classifiers['score'].append(score)

    return np.mean(score['test_neg_log_loss'])

# Conditional search space: hyperopt first picks one classifier family and then
# samples only that family's hyperparameters.
# Every hp.* label must be unique across the whole space (hyperopt raises
# DuplicateLabel otherwise), so parameters that share a name between families
# carry a family suffix in their label (e.g. 'max_depth-rf', 'n_estimators-rf').
space = hp.choice('classifier_type', [
    {
        'type': 'LogReg',
        'C': hp.uniform('C', 0, 10.0),
        'penalty': hp.choice('penalty', ['l1', 'l2']),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'n_jobs': -1,
        'random_state': RANDOM_SEED
    },
    {
        'type': 'RandForest',
        'max_depth': hp.choice('max_depth-rf', range(1, 20)),
        'max_features': hp.choice('max_features', np.arange(5, 50, 5).tolist() + ['log2', 'sqrt']),
        'n_estimators': hp.choice('n_estimators-rf', range(10, 100, 10)),
        'criterion': hp.choice('criterion', ["gini", "entropy"]),
        'n_jobs': -1,
        'random_state': RANDOM_SEED
    },
    {
        'type': 'XGBoost',
        'n_estimators': hp.choice('n_estimators', range(100, 1000 + 1, 100)),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', range(1, 14)),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'random_state': RANDOM_SEED,
        'n_jobs': -1,
        "verbosity": 0,
    }
])

# Running state for the objective `f`: number of evaluations so far and the
# best (highest) neg_log_loss seen; -10 is a floor below any score in the log.
count, best = 0, -10

def f(params):
    """Hyperopt objective: evaluate one sampled configuration.

    Parameters
    ----------
    params : dict
        A configuration sampled from the search space, including its
        ``'type'`` key.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}`` where ``loss`` is the negated
        mean neg_log_loss, i.e. the (positive) log loss that ``fmin`` should
        minimize.

    Side effects
    ------------
    Increments the module-level ``count``. Whenever the score improves on the
    module-level ``best``, updates it, prints the configuration and appends
    the same line to ``logs.txt``.
    """
    global best, count

    count += 1
    # Pass a copy so `params` still contains 'type' for the log message below.
    neg_log_los = hyperopt_train_test(params.copy())

    # neg_log_loss is higher-is-better, hence the `>` comparison.
    if neg_log_los > best:
        best = neg_log_los
        out_str = 'New best: {} using {}'.format(neg_log_los, params)

        print(out_str)
        with open('logs.txt', 'a', encoding='utf-8') as file:
            file.write(out_str+'\n')

    # fmin MINIMIZES 'loss'. Returning the raw neg_log_loss would steer the
    # search toward the worst models, so flip the sign to get a true loss.
    return {'loss': -neg_log_los, 'status': STATUS_OK}

# Run 150 TPE-guided evaluations of `f` over `space`, keeping the full history
# in `trials`.
trials = Trials()
# Note: this rebinds the module-level `best` (the running best score that `f`
# tracks) to fmin's result dict, so re-run the whole cell to reset it before
# another search. For hp.choice parameters the result holds the index of the
# selected option; `hyperopt.space_eval(space, best)` maps it back to values.
best = fmin(
    fn=f,
    space=space,
    algo=tpe.suggest,
    max_evals=150,
    trials=trials,
)
print(best)

New best: -0.29708647390761456 using {'criterion': 'gini', 'max_depth': 19, 'max_features': 'log2', 'n_estimators': 70, 'n_jobs': -1, 'random_state': 42, 'type': 'RandForest'}
New best: -0.22143399973543704 using {'criterion': 'entropy', 'max_depth': 17, 'max_features': 25, 'n_estimators': 40, 'n_jobs': -1, 'random_state': 42, 'type': 'RandForest'}
New best: -0.21667600755242553 using {'criterion': 'gini', 'max_depth': 15, 'max_features': 35, 'n_estimators': 40, 'n_jobs': -1, 'random_state': 42, 'type': 'RandForest'}
New best: -0.21598847988006498 using {'criterion': 'entropy', 'max_depth': 17, 'max_features': 30, 'n_estimators': 30, 'n_jobs': -1, 'random_state': 42, 'type': 'RandForest'}
New best: -0.2132708360567251 using {'criterion': 'entropy', 'max_depth': 18, 'max_features': 30, 'n_estimators': 30, 'n_jobs': -1, 'random_state': 42, 'type': 'RandForest'}
New best: -0.20390601136190428 using {'criterion': 'entropy', 'max_depth': 19, 'max_features': 35, 'n_estimators': 80, 'n_jobs':