In [1]:
import os
import pickle
import copy

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import torch
from torch.nn import CrossEntropyLoss

from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler
from hyperopt import fmin, hp, tpe, Trials, space_eval

In [2]:
class CFG:
    DATA_PATH = '../data'
    PATH_TO_SAVE = '../models/'
    
    exp_name = 'CatboostBaggingROSAlpha_best_params'
    
    hyperopt_space = {'learning_rate' : hp.uniform('learning_rate', 1e-3, 5e-2),
                    'n_estimators' : hp.randint('n_estimators', 200, 1300),
                    'l2_leaf_reg' : hp.choice('l2_leaf_reg', [0.1, 2, 3, 5]),
                    'depth' : hp.randint('depth', 3, 7),
                    'random_strength' : hp.choice('random_strength', [0.5, 1.0, 3.0, 10.0])}
    algo = tpe.suggest
    n_step_search = 30
    
    additional_params={'task_type' : 'GPU', 'devices':'1'}
    n_bag = 100
    random_state = 42
    letter = 'Gamma'

In [3]:
def balanced_ce(y_true, y_pred):
    weights = []
    unique = np.sort(list(set(y_true)))
    for i, t in enumerate(unique):
        n_samples_i = np.sum(y_true == t)
        weights.append(1 / (n_samples_i))
    
    
    ce = CrossEntropyLoss()
    y_pred = torch.Tensor(y_pred.astype(np.float32))
    y_true = torch.Tensor(y_true.astype(np.int64)).long()
    
    return ce(y_pred, y_true).item()

In [4]:
class CatboostBaggingROS:
    def __init__(self, catboost_params, additional_params={'task_type' : 'GPU', 'devices' : '0'}, n_bag=100, random_state=42):
        self.catboost_params = catboost_params
        self.n_bag = n_bag
        
        self.clf = CatBoostClassifier(**catboost_params, random_state=random_state, verbose=0, **additional_params)
        self.clfs = []
        self.oob_idxs = []
        self.losses = []
        
        self.ros = RandomOverSampler(random_state=random_state)
        self.oob_preds = None
        self.oob_n = None
        
    def fit(self, X_train, y_greek):
        n_classes = len(set(y_greek))
        
        self.oob_preds = np.zeros((y_greek.shape[0], n_classes), dtype=np.float32)
        self.oob_n = np.zeros(y_greek.shape[0], dtype=np.float32) + 1e-20
        
        for i in range(self.n_bag):
            bagged_idxs = np.random.randint(0, y_greek.shape[0], y_greek.shape[0])
            oob_idxs = set(list(range(y_greek.shape[0]))) - set(bagged_idxs)
            oob_idxs = list(oob_idxs)
            
            self.oob_idxs.append(oob_idxs)
            
            X_bagged = X_train.iloc[bagged_idxs]
            y_greek_bagged = y_greek.iloc[bagged_idxs]
            
            X_bagged, y_greek_bagged = self.ros.fit_resample(X_bagged, y_greek_bagged)
            
            X_oob = X_train.iloc[oob_idxs]
            
            clf = copy.deepcopy(self.clf)
            clf.fit(X_bagged, y_greek_bagged)
            self.clfs.append(clf)
            
            self.oob_preds[oob_idxs] += clf.predict_proba(X_oob)
            self.oob_n[oob_idxs] += 1
            
            cur_loss = balanced_ce(y_greek, self.oob_preds / self.oob_n.reshape(-1, 1))    
            self.losses.append(cur_loss)

In [5]:
np.random.seed(CFG.random_state)
    
train = pd.read_csv(os.path.join(CFG.DATA_PATH, 'train.csv'), index_col='Id')
greeks = pd.read_csv(os.path.join(CFG.DATA_PATH, 'greeks.csv'), index_col='Id')

train = train.join(greeks[[CFG.letter]])

le = LabelEncoder()
train[CFG.letter] = le.fit_transform(train[CFG.letter])
 
first_category = train.EJ.unique()[0]
train.EJ = train.EJ.eq(first_category).astype('int')

X_train, y_greeks = train.drop(['Class', CFG.letter], axis=1), train[CFG.letter]


In [6]:
def objective(params):   
    clf = CatboostBaggingROS(params, n_bag=CFG.n_bag, random_state=CFG.random_state, additional_params=CFG.additional_params)
    clf.fit(X_train, y_greeks)
    losses = clf.losses
    
    return np.min(losses)

In [7]:
best_params = fmin(
                fn=objective,
                space=CFG.hyperopt_space,
                algo=CFG.algo,
                max_evals=CFG.n_step_search)    

hyperparams = space_eval(CFG.hyperopt_space, best_params)
print(hyperparams)

 13%|█▎        | 4/30 [17:19<1:51:05, 256.38s/trial, best loss: 1.4628958702087402]

job exception: operands could not be broadcast together with shapes (212,8) (212,7) (212,8) 



 13%|█▎        | 4/30 [18:46<2:02:03, 281.69s/trial, best loss: 1.4628958702087402]


ValueError: operands could not be broadcast together with shapes (212,8) (212,7) (212,8) 