# Parameters tuning with Hyperopt

In [1]:
import os
import json

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from gpalib import model

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [2]:
RANDOM_SEED = 42
LOCAL = True
PATH_TO_LOG_FILE = 'parameter_tuning_logs.txt'
PATH_TO_ALL_HISTORY_FILE = 'parameter_tuning_all_logs.json'

In [3]:
if LOCAL:
    # If working in local environment
    data = pd.read_csv('../data/russia-16-19-v2.5.csv')
    print(data.shape)
    data.head()
else: 
    # If working in Google Colab
    os.mkdir('gpalib')
    os.mkdir('model')

    os.rename("__init__.py", "gpalib/__init__.py")
    os.rename("analysis.py", "gpalib/analysis.py")
    os.rename("model.py", "gpalib/model.py")
    os.rename("preprocessing.py", "gpalib/preprocessing.py")

    data = pd.read_csv('russia-16-19-v2.5.csv')
    print(data.shape)
    data.head()

(308257, 187)


In [4]:
# Train / validation split in proportion 4:1
valid_data = data.sample(frac=0.2, random_state=RANDOM_SEED)
train_data = data.drop(valid_data.index, axis=0)

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
# For storing history of models trained
history_storage = {
    'model': [],
    'params': [],
    'score': []
}

In [17]:
def filter_params(params: dict):
    """Loging only important params"""
    
    def round_if_float(value):
        """Round float values"""
        
        if type(value) == float:
            return round(value, 3)
        else:
            return value
        
    params_to_delete = ('type', 'n_jobs', 'random_state', 'verbosity')

    return {
        k: round_if_float(v) for k, v in params.items() 
        if k not in params_to_delete
    }

In [18]:
space = hp.choice('classifier_type', [
    {
        'type': 'LogReg',
        'C': hp.uniform('C', 0, 10.0),
        'penalty': hp.choice('penalty', ['l1', 'l2']),
        'solver': hp.choice('solver', ['saga', 'liblinear']),
        'n_jobs': -1,
        'random_state': RANDOM_SEED
    },
    {
        'type': 'RandForest',
        'max_depth': hp.choice('max_depth-rf', range(1, 11)),
        'max_features': hp.choice('max_features', np.arange(15, 31, 5).tolist() + ['log2', 'sqrt']),
        'n_estimators': hp.choice('n_estimators-rf', range(100, 1001, 100)),
        'criterion': hp.choice('criterion', ["gini", "entropy"]),
        'n_jobs': -1,
        'random_state': RANDOM_SEED
    },
    {
        'type': 'XGBoost',
        'n_estimators': hp.choice('n_estimators', range(100, 1001, 100)),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', range(1, 11)),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'random_state': RANDOM_SEED,
        'n_jobs': -1,
        "verbosity": 0,
    }
])

In [19]:
count, best = 0, -100

def f(params):
    """Function for Hyperopt"""
    global best, count
    
    count += 1
    neg_log_los = model.hyperopt_train_test(
        train_data,
        params.copy(),
        history_storage,
        cv=2)
    
    if neg_log_los > best:
        best = neg_log_los
        out_str = 'New best: {:.5f} using {}'.format(
            neg_log_los, 
            filter_params(params)
        )
        
        print(out_str)
        with open(PATH_TO_LOG_FILE, 'a', encoding='utf-8') as file:
            file.write(out_str+'\n')
    
    return {'loss': neg_log_los, 'status': STATUS_OK}

In [20]:
trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=10, trials=trials)

New best: -0.31104 using {'C': 0.435, 'penalty': 'l1', 'solver': 'saga'}
New best: -0.19636 using {'eta': 0.375, 'gamma': 0.8, 'max_depth': 6, 'n_estimators': 700, 'subsample': 1.0}
New best: -0.19632 using {'eta': 0.275, 'gamma': 0.7, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 0.75}
100%|██████████| 10/10 [1:20:13<00:00, 258.69s/it, best loss: -0.4700524725200869] 


In [21]:
# Saving history of models tried
with open(PATH_TO_ALL_HISTORY_FILE, 'w', encoding='utf-8') as file:
    file.write(json.dumps(history_storage, indent=4))

**Hyperopt tutorials:** 
- https://medium.com/district-data-labs/parameter-tuning-with-hyperopt-faa86acdfdce
- https://www.kaggle.com/yassinealouini/hyperopt-the-xgboost-model
- https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb