In [7]:
import numpy as np 
import pandas as pd 
import gc
import utils as u
import xgboost as xgb
from sklearn.metrics import f1_score, confusion_matrix, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from plotnine import (element_blank, scale_color_manual, scale_x_continuous, ggplot, aes, geom_line ,geom_bar, geom_point, theme, element_text, labs, ggtitle, scale_y_continuous, coord_flip, ggsave)
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [None]:
# Define the data directory and load the final train and test tables
# (the first CSV column is used as the DataFrame index).
final_data_path = 'final data/'
tr = pd.read_csv(final_data_path + 'train_data_final.csv', index_col = 0)
te = pd.read_csv(final_data_path + 'test_data_final.csv', index_col = 0)
# Test features (every column except 'Id') wrapped as an XGBoost DMatrix.
# NOTE(review): te_dm is not referenced by any other cell visible in this notebook.
te_dm = xgb.DMatrix(te.drop(['Id'], axis = 1))
gc.collect()

In [29]:
# Split the training table into model inputs and target:
# x keeps every column except 'Id' and 'Response'; y is the 'Response' column.
x = tr.drop(['Id', 'Response'], axis = 1)
y = tr['Response']
print(x.shape)
print(y.shape)

(1183747, 21)
(1183747,)


In [None]:
def hp_train_model(x: pd.DataFrame, y: pd.DataFrame, n_tree:int, early_stopping_rounds:int, params = None, te = te):
    """Fit one XGBoost booster on a single hold-out split and return its MCC.

    The data is split 80/20 (shuffled, fixed seed 70). The booster is trained
    on the 80% part while both parts are monitored with the custom metric
    ``u.evalerror``. The hold-out predictions are then scored with
    ``u.best_thr_mcc``, whose second return value is used as the MCC.

    Parameters
    ----------
    x : feature table.
    y : target aligned with ``x``.
    n_tree : maximum number of boosting rounds.
    early_stopping_rounds : patience passed to ``xgb.train``.
    params : parameter dict forwarded to ``xgb.train``.
    te : unused in this function.

    Returns
    -------
    The hold-out MCC (as the mean of a one-element list).

    Side effects
    ------------
    Appends ``(params, mcc)`` to the module-level ``history`` list, which must
    exist before this function is called, and prints the rounded MCC.
    """
    fit_x, hold_x, fit_y, hold_y = train_test_split(
        x, y, test_size = 0.2, random_state = 70, shuffle = True)
    fit_dm = xgb.DMatrix(fit_x, label = fit_y)
    hold_dm = xgb.DMatrix(hold_x, label = hold_y)

    # Per-round metric log filled in by xgb.train; not read afterwards.
    eval_log = {}
    booster = xgb.train(params = params,
                        dtrain = fit_dm,
                        num_boost_round = n_tree,
                        evals = [(fit_dm, 'train'), (hold_dm, 'eval')],
                        verbose_eval = 1,
                        early_stopping_rounds = early_stopping_rounds,
                        evals_result = eval_log,
                        feval = u.evalerror)

    hold_pred = booster.predict(hold_dm)
    # First return value (named best_thresh elsewhere in this notebook) is not needed here.
    _, mcc_value = u.best_thr_mcc(hold_pred, hold_dm)

    # Keep a trace of every evaluated configuration for later inspection.
    history.append((params, mcc_value))

    print('mcc value: {}'.format(round(mcc_value, 3)))
    gc.collect()

    return np.mean([mcc_value])

def hp_xgb_model(params):
    """Hyperopt objective: train one hold-out model and return the negated MCC.

    Parameters
    ----------
    params : dict
        One sample from the hyperopt search space. Expected keys:
        'n_tree', 'eta', 'max_depth', 'subsample', 'colsample_bytree',
        'min_child_weight', 'gamma'.

    Returns
    -------
    dict
        ``{'loss': -mcc, 'status': STATUS_OK}``. The MCC is negated because
        hyperopt minimizes the loss.

    Notes
    -----
    Reads the module-level training data ``x`` and ``y``.
    """
    n_tree = int(params['n_tree'])
    # params['eta'] is the *inverse* learning rate, so 3 / (1 / eta) scales the
    # early-stopping patience with it (smaller learning rate -> more patience).
    # NOTE(review): with the search space in this notebook (eta in [10, 100],
    # n_tree in [30, 100]) the patience is 30-300 rounds, so early stopping can
    # rarely trigger -- confirm this is intended.
    early_stopping_rounds = int(round(3/ (1/params['eta']), 0))

    xgb_params = {'colsample_bytree': params['colsample_bytree'], 
                  'eta': 1/params['eta'],
                  'max_depth': int(params['max_depth']),
                  'subsample': params['subsample'],
                  'min_child_weight': int(params['min_child_weight']), 
                  'gamma': params['gamma']/ 10,
                #   'alpha': params['alpha'],
                  'objective': 'binary:logistic',
                  'random_state': 123,
                  'disable_default_eval_metric': 1,
                  # NOTE(review): the two entries below look like bookkeeping
                  # (they end up in each `history` record) rather than booster
                  # parameters -- confirm XGBoost ignores them.
                  'ntree': n_tree, 
                  'early_stopping_rounds': early_stopping_rounds
                  }

    # Fix: call the single-split trainer hp_train_model, which returns a scalar
    # MCC. The previous call targeted train_model, which is defined in a later
    # cell, requires an `n_fold` argument and returns a 5-tuple, so `-score`
    # could never work.
    score = hp_train_model(x = x, y = y, n_tree = n_tree, 
                           early_stopping_rounds = early_stopping_rounds, 
                           params = xgb_params)
    return {'loss':-score, 'status': STATUS_OK }


In [None]:
# Hyperopt search space. The raw samples are post-processed in hp_xgb_model:
#   - 'eta' is the inverse learning rate (the booster receives 1 / eta)
#   - 'gamma' is divided by 10
#   - 'n_tree', 'max_depth' and 'min_child_weight' are cast to int
space = {
         'n_tree'   : hp.quniform('n_tree', 30, 100, 10),
         'min_child_weight': hp.quniform('min_child_weight', 1, 8, 1),
         'max_depth': hp.quniform('max_depth', 3, 12, 2),
         'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),
         'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
         'eta'      : hp.uniform('eta', 10, 100),
         'gamma'    : hp.randint('gamma', 2),
        #  'alpha'    : hp.loguniform('alpha', np.log(1e-5), np.log(1e2))
        }

# (params, mcc) pairs; hp_train_model appends one entry per evaluated trial.
history = []
trials = Trials()
# fmin minimizes the 'loss' returned by hp_xgb_model, i.e. the negated MCC.
best = fmin(fn = hp_xgb_model, space = space, algo = tpe.suggest, max_evals = 70, trials = trials)
print('='*20)
# print(sorted(history, key = lambda tpl: tpl[1])[0][0])
# print(sorted(history, key = lambda tpl: tpl[1])[0][1])
print(best)

In [None]:
def train_model(x: pd.DataFrame, y: pd.DataFrame, n_fold: int, n_tree:int, early_stopping_rounds:int, params = None, te = te):
    """Train XGBoost with stratified K-fold CV and average the test predictions.

    For every fold a booster is trained on the training part and monitored on
    the validation part with the custom metric ``u.evalerror``. The validation
    predictions are thresholded at the value returned by ``u.best_thr_mcc``
    and scored (MCC, G-mean, F1, ROC AUC). The test-set predictions of all
    folds are averaged.

    Parameters
    ----------
    x : feature table.
    y : binary target aligned with ``x`` (used for stratification).
    n_fold : number of CV folds.
    n_tree : maximum number of boosting rounds per fold.
    early_stopping_rounds : patience passed to ``xgb.train``.
    params : parameter dict forwarded to ``xgb.train``.
    te : test table; must contain an 'Id' column, all other columns are fed
        to the model.

    Returns
    -------
    tuple
        ``(model, scores, loss_data, predictions, mean_mcc)`` where
        ``model`` is the booster of the *last* fold, ``scores`` is a dict of
        per-fold metric lists, ``loss_data`` holds the per-round train/eval
        metric of every fold stacked row-wise (columns 'train', 'test',
        'tree'), ``predictions`` is the fold-averaged test prediction and
        ``mean_mcc`` is the mean of the per-fold MCC values.
    """
    kf = StratifiedKFold(n_splits = n_fold, shuffle = True, random_state = 123)

    scores = {'fold':[], 'mcc':[], 'g_means':[], 'auc_scores':[], 'f1_scores':[], 'threshold':[]}

    fold_losses = []
    predictions = np.zeros(len(te))
    # The test matrix does not depend on the fold, so build it once.
    dtest = xgb.DMatrix(te.drop(['Id'], axis = 1))

    for fold_i, (tr_idx, va_idx) in enumerate(kf.split(x, y)):

        tr_x, va_x = x.iloc[tr_idx], x.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]

        dtrain = xgb.DMatrix(tr_x, label = tr_y)
        dvalid = xgb.DMatrix(va_x, label = va_y)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

        # Fresh log per fold so curves from different folds cannot mix.
        evals_result = {}
        model = xgb.train(params = params, 
                          dtrain = dtrain, 
                          num_boost_round = n_tree, 
                          evals = watchlist, 
                          verbose_eval=1, 
                          early_stopping_rounds=early_stopping_rounds, 
                          evals_result=evals_result,
                          feval = u.evalerror
                         )

        va_pred = model.predict(dvalid)
        predictions += model.predict(dtest)

        best_thresh, mcc_value= u.best_thr_mcc(va_pred, dvalid)
        y_pred = (va_pred > best_thresh).astype(int)

        tn, fp, fn, tp = confusion_matrix(va_y, y_pred).ravel()
        spec = tn / (tn + fp)
        sens = tp / (tp + fn)
        g_means = np.sqrt(spec * sens)
        fmeasure = f1_score(va_y, y_pred)
        # Fix: ROC AUC is a ranking metric and must be computed on the
        # predicted probabilities, not on the already-thresholded 0/1 labels.
        auc_score = roc_auc_score(va_y, va_pred)

        # Each evals_result entry maps metric name -> per-round values; only
        # one (custom) metric is recorded, so take the first value list.
        train_curve = list(evals_result['train'].values())[0]
        eval_curve = list(evals_result['eval'].values())[0]
        fold_losses.append(pd.DataFrame({'train': train_curve,
                                         'test': eval_curve,
                                         'tree': np.arange(1, len(train_curve) + 1)}))

        print('mcc value: {}'.format(round(mcc_value, 3)))
        print('g_means value: {}'.format(round(g_means, 3)))
        print('auc value: {}'.format(round(auc_score, 3)))
        print('f1_score value: {}'.format(round(fmeasure, 3)))
        print('threshold: {}'.format(round(best_thresh, 3)))

        scores['fold'].append(fold_i + 1)
        scores['mcc'].append(mcc_value)
        scores['g_means'].append(g_means)
        scores['f1_scores'].append(fmeasure)
        scores['auc_scores'].append(auc_score)
        scores['threshold'].append(best_thresh)

        gc.collect()

    # Turn the summed fold predictions into their average.
    predictions /= n_fold
    # Single concat instead of growing a DataFrame inside the loop.
    loss_data = pd.concat(fold_losses, axis = 0).reset_index(drop=True)

    return model, scores, loss_data, predictions, np.mean(scores['mcc'])

def xgb_model(params):
    """Run the 5-fold CV training for one hyper-parameter set.

    ``params`` uses the same encoding as the hyperopt search space: 'eta' is
    the inverse learning rate, 'gamma' is ten times the value given to the
    booster, and 'n_tree', 'max_depth' and 'min_child_weight' are cast to int.

    Returns whatever ``train_model`` returns:
    ``(model, scores, loss_data, predictions, mcc)``.
    Reads the module-level training data ``x`` and ``y``.
    """
    n_fold = 5
    n_tree = int(params['n_tree'])
    learning_rate = 1/params['eta']
    # Patience grows as the learning rate shrinks.
    early_stopping_rounds = int(round(5/ learning_rate, 0))

    xgb_params = dict(
        colsample_bytree = params['colsample_bytree'],
        eta = learning_rate,
        max_depth = int(params['max_depth']),
        subsample = params['subsample'],
        min_child_weight = int(params['min_child_weight']),
        gamma = params['gamma']/ 10,
        ntree = n_tree,
        early_stopping_rounds = early_stopping_rounds,
        objective = 'binary:logistic',
        random_state = 123,
        disable_default_eval_metric = 1,
    )

    return train_model(x = x, y = y,
                       n_fold = n_fold,
                       n_tree = n_tree,
                       early_stopping_rounds = early_stopping_rounds,
                       params = xgb_params)

In [None]:
# Retrain with the parameters found by hyperopt using the 5-fold CV in
# xgb_model, then display the per-fold metrics as a table.
model, scores, loss_data, predictions, mcc = xgb_model(best)
pd.DataFrame(scores)

In [None]:
# model = xgb.Booster()
# model.load_model(path + "xgboost_model.json")
# Gain-based importance reported by the booster, as a feature -> score mapping.
fscore = model.get_score(importance_type = 'gain')

keys = list(fscore.keys())
values = list(fscore.values())
feat_imp = pd.DataFrame({'feature': keys, 'scores': values}, index=keys).sort_values(by = "scores", ascending=True)
# Freeze the sorted order as the category order so the bars are drawn sorted.
feat_imp['feature'] = pd.Categorical(
    feat_imp.feature, categories=pd.unique(feat_imp.feature))
# Share of the total gain in percent, computed directly on the column.
# The previous `.transform(lambda x: x / x.sum() ...)` only produced this
# result through pandas' element-wise-then-fallback dispatch of `transform`.
feat_imp['score'] = feat_imp['scores'] / float(feat_imp['scores'].sum()) * 100

# Plot the 10 most important features (the frame is sorted ascending).
# NOTE(review): the title says "categorical data" while the commented-out
# ggsave file name says "numeric data" -- confirm which one is right.
g = (
    ggplot(feat_imp.iloc[-10:, :])
    + geom_bar(aes(x = 'feature', y = 'score'), stat = "identity", color='#7cc8e9', fill='#7cc8e9', width = 0.5)
    + theme(axis_text_x = element_text(angle = 0, size = 13),
            axis_text_y=element_text(size=13),
            plot_title=element_text(size=18))
    + scale_y_continuous(breaks = range(0, 100, 5))
    + coord_flip()
    + labs(x = 'features',y = 'importance percentages (%)')
    + ggtitle('Features importance of categorical data')
)
# ggsave(file="Features importance of numeric data.svg", plot = g, width = 10, height = 8, dpi = 500, format = 'svg')
g

In [None]:
# Flip the sign of the recorded metric so the curves read as MCC
# (presumably u.evalerror reports the negated MCC -- confirm in utils).
# NOTE: this cell overwrites `loss_data`, so it can only be run once per
# training run; re-running it fails because the melted frame no longer has
# 'train' / 'test' columns.
loss_data[['train', 'test']] = loss_data[['train', 'test']] * -1
# Average each boosting round over the CV folds. Columns are selected with a
# list: tuple-style `groupby(...)['train', 'test']` raises in pandas 2.x.
loss_data = loss_data.groupby('tree')[['train', 'test']].mean().reset_index()
# Long format: one row per (tree, variable) for plotnine.
loss_data = pd.melt(loss_data, id_vars = ['tree'], value_vars = ['train', 'test'])
loss_data['variable'] = loss_data['variable'].astype('category')

g = (ggplot(loss_data, aes(x = 'tree', y = 'value', group = 'variable', color = 'variable', linetype = 'variable')) 
     + geom_line(size = 1.5)
    #  + geom_smooth(method = 'lm') 
     + theme(axis_text_x = element_text(size = 13),
             axis_text_y = element_text(size = 13),
             axis_title = element_text(size = 14),
             legend_text = element_text(size = 14),
             legend_title = element_blank())
     + scale_x_continuous(breaks = range(1, 76, 5))
     + scale_y_continuous(breaks = np.linspace(0, 1.0, 21, endpoint = True))
     + scale_color_manual(values=["#619ED6", "#6BA547"])
     + labs(x = 'number of rounds', y = 'MCC')
     + ggtitle('XGBoost MCC vs number of rounds')
    )
g

In [None]:
# Output directory for all artifacts of this model. Nothing in this notebook
# creates it, so it is assumed to exist already.
path = 'model/2.xgboost/'
# Best hyperopt sample, stored in the search-space encoding
# (e.g. 'eta' is the inverse learning rate).
pd.DataFrame({'parameters': best}).to_csv(path + 'best_parameters.csv', index = True)
# best = pd.read_csv(path + 'best_parameters.csv', index_col = 0).to_dict

# Per-fold metrics, the loss-curve data and the booster of the last CV fold.
# NOTE: if the loss-curve cell has been run, loss_data is the melted,
# fold-averaged frame rather than the raw per-fold log.
pd.DataFrame(scores).to_csv(path + 'xgboost_scores.csv', index = False)
loss_data.to_csv(path + 'xgboost_loss_data.csv', index = False)
model.save_model(path + "xgboost_model.json")

# private: 0.46694  (presumably the private leaderboard score of this submission -- confirm)
# Binarize the fold-averaged test predictions with the mean of the per-fold thresholds.
submission = pd.DataFrame({'Id': te['Id'], 'Response': predictions > np.mean(scores['threshold'])})
submission['Response'] = submission['Response'].astype('int')
submission.to_csv(path + 'submission_xgboost.csv', index = 0)