In [20]:
# Imports
from multiprocessing import Pipe, Process

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_auc_score

from hyperopt import Trials, STATUS_OK, tpe, hp, fmin, space_eval

import pandas as pd

In [4]:
df = pd.read_csv('train.csv')

In [6]:
pred = ['Age','Parch','Pclass']
target = 'Survived'

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df[pred].fillna(0), df[target], test_size=0.0, random_state=42)

In [70]:
def run_training(df, pred, target, max_evals=10):
    
    X_train, y_train = df[pred].fillna(0), df[target]
    
    # Classifier
    rf = RandomForestClassifier(n_jobs=-1, random_state=42)

    space_rf = {
        'max_features':hp.pchoice('max_features', [
            (0.2, 'sqrt'),  # most common choice.
            (0.1, 'log2'),  # less common choice.
            (0.1, None),  # all features, less common choice.
            (0.6, hp.uniform('max_features' + '.frac', 0., 1.))
        ]),
        'n_estimators': hp.qloguniform('n_estimators', np.log(9.5), np.log(500.5), 1),
        'min_samples_leaf': hp.qloguniform('min_samples_leaf', np.log(1.5), np.log(50.5), 1),
        'max_depth': hp.choice('max_depth', range(3, 20, 2)),
        'min_samples_split': 2,
        'bootstrap': hp.choice('bootstrap', [True, False]),
        'criterion': hp.choice('criterion', ['gini', 'entropy'])
    }

    def objective(params):
        shuffle = KFold(n_splits=3, shuffle=True)
        score = cross_val_score(rf, X_train, y_train, cv=shuffle, scoring='roc_auc', n_jobs=-1)
        return 1 - score.mean()

    # The Trials object will store details of each iteration
    trials = Trials()

    # Run the hyperparameter search using the tpe algorithm
    best = fmin(objective,
                space_rf,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)

    if 'max_features.frac' in best.keys():
        best['max_features'] = best['max_features.frac']
        del best['max_features.frac']
    else:
        best['max_features'] = ['sqrt','log2',None][best['max_features']]
        
    best['criterion'] = ['gini', 'entropy'][best['criterion']]
    
    return best

def retrain_best_model(df, pred, target, best_kwargs, seed=42, n_jobs=-1):
    
    X_train, y_train = df[pred].fillna(0), df[target]
    
    best_kwargs['n_jobs'] = n_jobs
    best_kwargs['random_state'] = seed
    best_kwargs['n_estimators'] = int(best_kwargs['n_estimators'])
    best_kwargs['min_samples_leaf'] = int(best_kwargs['min_samples_leaf'])

    # Classifier
    rf = RandomForestClassifier(**best_kwargs)
    rf.fit(X_train, y_train)
    
    return rf

In [72]:
best_args = run_training(df, pred, target, 50)
print(best_args)
retrain_best_model(df, pred, target, best_args)

100%|██████████| 50/50 [00:28<00:00,  1.76trial/s, best loss: 0.2938798725017714] 
{'bootstrap': 1, 'criterion': 'gini', 'max_depth': 6, 'max_features': 0.7619769666982357, 'min_samples_leaf': 2.0, 'n_estimators': 76.0}


RandomForestClassifier(bootstrap=1, max_depth=6,
                       max_features=0.7619769666982357, min_samples_leaf=2,
                       n_estimators=76, n_jobs=-1, random_state=42)