In [1]:
import pandas as pd
import numpy as np
from usfull_tools import load_DS
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
pd.options.display.max_columns = None
%matplotlib inline

from set_vars import KAGGLE_PREFIX, debug_mode, KAGGLE_DIR, target_column, target_type, loss_function, custom_metric

train, test = load_DS(debug_mode, KAGGLE_DIR, KAGGLE_PREFIX, '_prepare.csv')
del test

cols = pd.read_csv(KAGGLE_DIR + KAGGLE_PREFIX + '_important_columns.csv')

X_train = train[cols['Feature']]
y_train = train[target_column]

X_train.head(10)

Unnamed: 0,Sex,Pclass,Age,Fare,Ticket,Cabin,SibSp,Embarked,Parch
0,male,3,22.0,7.25,A/5 21171,1.0,1,S,0
1,female,1,38.0,71.3125,PC 17599,0.0,1,C,0
2,female,3,26.0,7.925781,STON/O2. 3101282,1.0,0,S,0
3,female,1,35.0,53.09375,113803,0.0,1,S,0
4,male,3,35.0,8.046875,373450,1.0,0,S,0
5,male,3,80.0,8.460938,330877,1.0,0,Q,0
6,male,1,54.0,51.875,17463,0.0,0,S,0
7,male,3,2.0,21.078125,349909,1.0,3,S,1
8,female,3,27.0,11.132812,347742,1.0,0,S,2
9,female,2,14.0,30.078125,237736,1.0,1,C,0


In [2]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import time


i=0
cat_features = []
for column in X_train.columns:
    if X_train[column].dtype == 'object': cat_features.append(i)
    i +=1

def hyperopt_train_test(params):
    if target_type=='binary': clf = CatBoostClassifier(**params)
    elif target_type=='interval': clf = CatBoostRegressor(**params)
    return cross_val_score(clf, X_train, y_train, cv=5).mean()


if target_type=='binary':
    space4CatBoost = {
        'random_seed': 42,
        'iterations': hp.choice('iterations', range(100, 1000, 100)),
        'depth': hp.choice('depth', {1, 2, 4}),
        'l2_leaf_reg': hp.choice('l2_leaf_reg', {1, 2, 3, 4, 6}),
        'learning_rate': hp.uniform('learning_rate', 0.0001, 1),
        'loss_function': loss_function,
        'od_type': 'Iter',
        'verbose': False,
        'cat_features': cat_features
    }
    best_acc = 0

elif target_type=='interval':
    space4CatBoost = {
        'random_seed': 42,
        'iterations': hp.choice('iterations', range(100, 600, 100)),
        'depth': hp.choice('depth', {1, 2, 4}),
        'l2_leaf_reg': hp.choice('l2_leaf_reg', {1, 2, 4}),
        'learning_rate': hp.uniform('learning_rate', 0.0001, 0.8),
        'loss_function': loss_function,
        'od_type': 'Iter',
        'verbose': False,
        'cat_features': cat_features
    }
#     space4CatBoost = {
#         'random_seed': 42,
#         'iterations': hp.choice('iterations', range(100, 1000, 100)),
#         'depth': hp.choice('depth', {1, 2, 4, 6}),
#         'l2_leaf_reg': hp.choice('l2_leaf_reg', {1, 2, 3, 4, 6}),
#         'learning_rate': hp.uniform('learning_rate', 0.0001, 0.8),
#         'loss_function': loss_function,
#         'od_type': 'Iter',
#         'verbose': False,
#         'cat_features': cat_features
#     }
    best_acc = 9999999

best_params = {}

def f(params):
    start_time = time.time()
    global best_params, best_acc
    acc = hyperopt_train_test(params)
    print(acc)
        
    if best_acc < acc and target_type=='binary' or best_acc > acc and target_type=='interval':
        best_acc = acc
        best_params = params
        print('\n\nNew best acc:', best_acc, params)
        if target_type=='interval':
            return {'loss': acc, 'status': STATUS_OK}
        else:
            return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()

best = fmin(f, space4CatBoost, algo=tpe.suggest, max_evals=5, trials=trials)

print ('best_params:', best_params)


0.8058541655805918                                                                                                     
                                                                                                                       

New best acc:
0.8058541655805918                                                                                                     
{'cat_features': (0, 4, 7), 'depth': 4, 'iterations': 900, 'l2_leaf_reg': 2, 'learning_rate': 0.7864249791484214, 'loss_function': 'CrossEntropy', 'od_type': 'Iter', 'random_seed': 42, 'verbose': False}
0.7867150249291879                                                                                                     
 20%|██████████▍                                         | 1/5 [01:43<06:31, 97.94s/it, best loss: -0.8058541655805918]


TypeError: 'NoneType' object is not iterable

In [None]:
if target_type == 'binary': model = CatBoostClassifier(iterations = best_params['iterations'], depth = best_params['depth'], 
                               l2_leaf_reg = best_params['l2_leaf_reg'], learning_rate = best_params['learning_rate'], 
                               loss_function = loss_function, od_type = 'Iter', verbose = False)
    
elif target_type == 'interval': model = CatBoostRegressor(iterations = best_params['iterations'], depth = best_params['depth'], 
                               l2_leaf_reg = best_params['l2_leaf_reg'], learning_rate = best_params['learning_rate'], 
                               loss_function = loss_function, od_type = 'Iter', verbose = False)

model.fit(X_train, y_train, cat_features = cat_features, verbose = False, plot = True)

In [None]:
import pickle
# save the classifier
with open(KAGGLE_DIR + KAGGLE_PREFIX + 'classifier.pkl', 'wb') as fid:
    pickle.dump(model, fid)

In [None]:
print ('trials:')
for trial in trials.trials:
    print (trial)

# 