In [1]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostClassifier, cv
from hyperopt.pyll.base import scope
import hyperopt
import pickle
from sklearn.model_selection import KFold

  return f(*args, **kwds)


In [2]:
# Read the training and test sets from the shared inputs directory.
train, test = map(pd.read_csv, ('../inputs/train.csv', '../inputs/test.csv'))

In [3]:
# Feature columns selected from the training frame. The order matters:
# positional indices into this list are used to mark categorical features
# (every column whose name contains 'cat').
train_cols = [
    'ps_car_13',
    'ps_reg_03',
    'ps_ind_05_cat',
    'ps_ind_03',
    'ps_ind_15',
    'ps_reg_02',
    'ps_car_14',
    'ps_car_12',
    'ps_car_01_cat',
    'ps_car_07_cat',
    'ps_ind_17_bin',
    'ps_car_03_cat',
    'ps_reg_01',
    'ps_car_15',
    'ps_ind_01',
    'ps_ind_16_bin',
    'ps_ind_07_bin',
    'ps_car_06_cat',
    'ps_car_04_cat',
    'ps_ind_06_bin',
    'ps_car_09_cat',
    'ps_car_02_cat',
    'ps_ind_02_cat',
    'ps_car_11',
    'ps_car_05_cat',
    'ps_calc_09',
    'ps_calc_05',
    'ps_ind_08_bin',
    'ps_car_08_cat',
    'ps_ind_09_bin',
    'ps_ind_04_cat',
    'ps_ind_18_bin',
    'ps_ind_12_bin',
    'ps_ind_14',
]

In [4]:
def eval_gini(y_true, y_prob):
    """Compute the normalised Gini coefficient of a binary ranking.

    The labels are ordered by ascending ``y_prob``; the score is
    ``1 - 2 * D / (P * N)`` where ``P`` is the sum of the labels, ``N`` is
    ``len(y_true) - P`` and ``D`` counts (positive, negative) pairs in which
    the negative is ranked above the positive. A perfect ranking yields 1.0
    and a perfectly inverted ranking yields -1.0.

    Parameters
    ----------
    y_true : array-like
        Labels; the arithmetic treats them as 0/1 values.
    y_prob : array-like
        Scores used to rank the labels, same length as ``y_true``.

    Returns
    -------
    float
        The Gini coefficient, or NaN when it is undefined (empty input, or
        all labels belonging to a single class).
    """
    y_true = np.asarray(y_true)
    y_sorted = y_true[np.argsort(y_prob)]
    n = len(y_sorted)
    ntrue = y_sorted.sum()
    denom = ntrue * (n - ntrue)
    # With no positives or no negatives there are no pairs to compare; return
    # NaN explicitly instead of relying on a 0/0 division.
    if n == 0 or denom == 0:
        return float('nan')

    # Walk from the highest score downwards. For every element, count the
    # negatives ranked strictly above it (exclusive cumulative sum), then sum
    # those counts over the positives.
    descending = y_sorted[::-1]
    negatives = 1 - descending
    negatives_above = np.cumsum(negatives) - negatives
    discordant = (descending * negatives_above).sum()
    return 1 - 2 * discordant / denom

In [5]:
# Feature frame (selected columns only) and the label series.
X, y = train[train_cols], train['target']

In [6]:
# Positions (within X's columns) of the categorical features, i.e. every
# column whose name contains 'cat'. These indices are passed to CatBoost.
cat_feature_inds = [
    index for index, column in enumerate(X.columns) if 'cat' in column
]

# X was selected from `train` before this conversion runs, so converting only
# `train`/`test` would leave X with the original float values. Work on an
# explicit copy of X and convert it as well.
X = X.copy()
for index in cat_feature_inds:
    column = X.columns[index]
    if train[column].dtype == np.float64:
        # Float-coded categories are turned into strings -- presumably because
        # CatBoost does not accept float-valued categorical features.
        train[column] = train[column].astype(str)
        test[column] = test[column].astype(str)
        X[column] = train[column]

In [7]:
# Plain NumPy views of the features and labels for positional fold indexing.
# `.values` replaces `as_matrix()`, which was deprecated and later removed
# from pandas; `.values` is available in both old and new versions.
x_matrix = X.values
y_matrix = y.values

In [8]:
# 5-fold shuffled split. The fixed random_state makes the fold assignment
# reproducible across kernel restarts.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)

In [9]:
def hyperopt_obj(params):
    """Hyperopt objective: cross-validated Gini of a CatBoost classifier.

    Trains one CatBoostClassifier per fold of a 3-fold split of the
    module-level ``x_matrix`` / ``y_matrix`` (categorical columns given by
    ``cat_feature_inds``), scores each fold's held-out predictions with
    ``eval_gini`` and averages the results.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with keys ``'l2_leaf_reg'``,
        ``'learning_rate'`` and ``'depth'``.

    Returns
    -------
    float
        ``0.5 - mean_gini``; hyperopt minimises this, so a higher Gini gives
        a lower loss.
    """
    print('L2 Reg: ' + str(params['l2_leaf_reg']))
    print('Learning Rate: ' + str(params['learning_rate']))
    print('Depth: ' + str(params['depth']))
    n_folds = 3
    # Fixed seed so every trial is scored on the same folds; otherwise the
    # split-to-split noise is mixed into the comparison between parameter sets.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    total = 0
    trees = 0
    for train_index, test_index in kf.split(x_matrix, y_matrix):
        X_train, X_test = x_matrix[train_index], x_matrix[test_index]
        y_train, y_test = y_matrix[train_index], y_matrix[test_index]
        # The held-out fold doubles as the evaluation set that drives
        # use_best_model and the overfitting detector.
        eval_set = X_test, y_test
        model = CatBoostClassifier(
            iterations=1500,
            loss_function='Logloss',
            eval_metric='AUC',
            use_best_model=True,
            od_type='Iter',
            random_seed=0,
            # Pass the sampled value unchanged: truncating it to an int made
            # the trained model differ from the value logged above and
            # recorded by hyperopt for this trial.
            l2_leaf_reg=params['l2_leaf_reg'],
            learning_rate=params['learning_rate'],
            depth=params['depth']
        )
        model.fit(X_train, y_train, eval_set=eval_set, cat_features=cat_feature_inds)
        pred = model.predict_proba(X_test)[:,1]
        result = eval_gini(y_test, pred)
        total += result
        trees += model.tree_count_
    avg = total/n_folds
    print('Round completed: ' + str(avg))
    print('Avg Num trees: ' + str(trees/n_folds) + '\n\n')
    # Keep the 0.5 offset: trials loaded from a previous run were recorded on
    # this same loss scale.
    return 0.5 - avg

In [10]:
# Resume from a previous search if a trials file exists. Only a missing file
# starts a fresh search: any other failure (corrupt pickle, permissions, an
# interrupt) is allowed to surface, because silently starting over would let
# the save step later overwrite the existing results.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# trials files produced by this notebook.
try:
    with open('hyperopt_trials.p', 'rb') as trials_file:
        trials = pickle.load(trials_file)
    print('Successfully loaded previous trials')
except FileNotFoundError:
    trials = hyperopt.Trials()
    print('Creating new trials object')

# Search space: log-uniform L2 regularisation in [e^0, e^3], log-uniform
# learning rate in [1e-3, 5e-1], and integer tree depth from 1 to 16.
params_space = {
    'l2_leaf_reg': hyperopt.hp.loguniform('l2_leaf_reg', 0, 3),
    'learning_rate': hyperopt.hp.loguniform('learning_rate', np.log(1e-3), np.log(5e-1)),
    'depth': scope.int(hyperopt.hp.quniform('depth', 1, 16, 1)),
}

# NOTE(review): max_evals appears to count the trials already stored in
# `trials`, so a resumed run only adds evaluations up to 60 in total -- confirm
# against the hyperopt version in use.
best = hyperopt.fmin(
    hyperopt_obj,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=60,
    trials=trials
)

Successfully loaded previous trials
L2 Reg: 5.794745728270712
Learning Rate: 0.31556492477321174
Depth: 10


KeyboardInterrupt: 

In [None]:
# Persist the trials so a later run can resume the search. The context manager
# guarantees the file is flushed and closed once the dump completes.
with open("hyperopt_trials.p", "wb") as trials_file:
    pickle.dump(trials, trials_file)
print(best)