In [3]:
# Number of boosting rounds; passed to XGBClassifier as n_estimators.
MAX_ROUNDS = 2000
# When True each fold is fit with an eval set and early stopping;
# when False a plain fit with MAX_ROUNDS trees is used.
OPTIMIZE_ROUNDS = False
# Passed to XGBClassifier as learning_rate.
LEARNING_RATE = 0.07
# Early-stopping patience; only read when OPTIMIZE_ROUNDS is True.
EARLY_STOPPING_ROUNDS = 50 

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
import data_util 

@jit
def eval_gini(y_true, y_prob):
    """Return the Gini score of the scores ``y_prob`` against 0/1 labels ``y_true``.

    Labels are ordered by ascending score and then walked from the highest
    score downwards. For every positive label the number of negatives already
    seen (i.e. ranked above it) is accumulated; the result is
    ``1 - 2 * accumulated / (n_pos * n_neg)``.

    The denominator is zero when only one class is present in ``y_true``.
    """
    labels = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(labels)
    positives = 0
    negatives_above = 0
    misranked = 0
    idx = total - 1
    while idx >= 0:
        label = labels[idx]
        positives += label
        misranked += label * negatives_above
        negatives_above += 1 - label
        idx -= 1
    return 1 - 2 * misranked / (positives * (total - positives))

def add_noise(series, noise_level):
    """Multiply every element of ``series`` by an independent random factor.

    The factor is ``1 + noise_level * z`` with ``z`` drawn from
    ``np.random.randn`` (the global NumPy random state), one draw per element.
    With ``noise_level == 0`` the values are returned unchanged, although the
    random draws are still consumed.
    """
    gaussian_draws = np.random.randn(len(series))
    scale = 1 + noise_level * gaussian_draws
    return series * scale


# Load the train and test frames through the project-local data_util module.
train_df = data_util.load_train_data()
test_df = data_util.load_test_data()

# Columns fed to the model. The trailing comment on each entry records two
# scores for that column.
# NOTE(review): presumably the column's importance and the importance of a
# shuffled "shadow" copy from an earlier feature-selection run — confirm.
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
]

# add combinations
# Column pairs that are joined into one label-encoded feature further down.
# The same list is assigned again right before the loop that consumes it.
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Target-encode one categorical feature for the train, validation and test sets.

    Per-category statistics are computed from trn_series/target only and are
    then looked up for all three series, so no validation or test target
    information enters the encoding. Categories that do not occur in
    trn_series (and missing values) are encoded with the global target mean.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    val_series : validation categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : scale of the multiplicative Gaussian noise applied to each
        encoded series through add_noise (0 leaves the values unchanged)

    Returns a tuple (train, validation, test) of encoded pd.Series. Each keeps
    the index of its input series and is named trn_series.name + '_mean'.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and sample count
    stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Sigmoid weight in (0, 1): close to 1 for categories with many samples.
    # Kept in its own name so the `smoothing` parameter is not overwritten.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    # Global target mean, used as the prior
    prior = target.mean()
    # The bigger the count the less the prior is taken into account
    encoding = prior * (1 - weight) + stats["mean"] * weight
    encoded_name = trn_series.name + '_mean'

    def _encode(series):
        # Series.map looks each category up in `encoding` and keeps the
        # original index, so no merge / index restoration is needed.
        return series.map(encoding).rename(encoded_name).fillna(prior)

    return (add_noise(_encode(trn_series), noise_level),
            add_noise(_encode(val_series), noise_level),
            add_noise(_encode(tst_series), noise_level))


# Pull the identifiers and the label out before engineering features
y = train_df['target']
id_train = train_df['id'].values
id_test = test_df['id'].values

start = time.time()

combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]
for n_c, (f1, f2) in enumerate(combs):
    name1 = "_plus_".join((f1, f2))
    elapsed_minutes = (time.time() - start) / 60
    print('current feature %60s %4d in %5.1f' % (name1, n_c + 1, elapsed_minutes), end='')
    print('\r' * 75, end='')

    # Join the two columns as text: "<f1 value>_<f2 value>"
    train_df[name1] = train_df[f1].apply(str) + "_" + train_df[f2].apply(str)
    test_df[name1] = test_df[f1].apply(str) + "_" + test_df[f2].apply(str)

    # Label-encode over the union of train and test values so that both
    # frames share one integer mapping
    train_labels = list(train_df[name1].values)
    test_labels = list(test_df[name1].values)
    lbl = LabelEncoder().fit(train_labels + test_labels)
    train_df[name1] = lbl.transform(train_labels)
    test_df[name1] = lbl.transform(test_labels)

    train_features.append(name1)

        
    
#########################
# train_df['na_sum'] = (train_df == -1).sum(axis=1)
# train_df['na_ca_t_sum'] = (train_df[f_cats] == -1).sum(axis=1)
# test_df['na_sum'] = (test_df == -1).sum(axis=1)
# test_df['na_ca_t_sum'] = (test_df[f_cats] == -1).sum(axis=1)

# train_features += ['na_ca_t_sum', 'na_sum']

# Sum, difference, product and ratio of ps_car_13 and ps_reg_03, built the
# same way on the train and the test frame
for frame in (train_df, test_df):
    car_13 = frame['ps_car_13']
    reg_03 = frame['ps_reg_03']
    frame['ps_car_13_+_ps_reg_03'] = car_13 + reg_03
    frame['ps_car_13_-_ps_reg_03'] = car_13 - reg_03
    frame['ps_car_13_x_ps_reg_03'] = car_13 * reg_03
    frame['ps_car_13_/_ps_reg_03'] = car_13 / reg_03

train_features.extend([
    'ps_car_13_x_ps_reg_03',
    'ps_car_13_/_ps_reg_03',
    'ps_car_13_+_ps_reg_03',
    'ps_car_13_-_ps_reg_03',
])
#########################
# Restrict both frames to the selected feature columns
X = train_df[train_features]
test_df = test_df[train_features]

# Columns whose name contains "_cat"; these are target-encoded inside each
# fold. The "_plus_" combination features built from a *_cat column match too.
f_cats = [f for f in X.columns if "_cat" in f]

# Out-of-fold predictions. Created as float (0.0 * y, not 0 * y) because
# probabilities are written into it fold by fold; an integer Series would
# have to be silently upcast on assignment.
y_valid_pred = 0.0 * y
y_test_pred = 0

# Set up folds
K = 5
kf = KFold(n_splits=K, random_state=1, shuffle=True)
np.random.seed(0)

# Set up classifier
# NOTE(review): `eta` is XGBoost's alias for `learning_rate`, and both are
# passed here with different values (0.03 vs LEARNING_RATE). Which one takes
# effect depends on the installed xgboost version — keep only the intended one.
model = XGBClassifier(
    n_estimators=MAX_ROUNDS,
    max_depth=6,
    objective="binary:logistic",
    learning_rate=LEARNING_RATE,
    subsample=.8,
    min_child_weight=0.2,
    colsample_bytree=.8,
    eta=0.03,
    gamma=9,
    reg_alpha=1.5,
    reg_lambda=1.5,
)

                        #scale_pos_weight=1.6,

# Run CV
# One column of test-set probabilities per fold (sized by K, not a literal 5)
y_test_pred_all = np.zeros((len(test_df), K))

for i, (train_index, test_index) in enumerate(kf.split(train_df)):

    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)

    # Target-encode each categorical column from this fold's training rows
    # only, then apply the same encoding to the validation and test rows
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
            trn_series=X_train[f],
            val_series=X_valid[f],
            tst_series=X_test[f],
            target=y_train,
            min_samples_leaf=200,
            smoothing=10,
            noise_level=0
        )

    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        eval_set = [(X_valid, y_valid)]
        fit_model = model.fit( X_train, y_train,
                               eval_set=eval_set,
                               eval_metric='auc',
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        # best_score is the validation AUC (eval_metric='auc');
        # report it as Gini = 2 * AUC - 1 to match the label
        print( "  Best gini = ", 2 * model.best_score - 1 )
    else:
        fit_model = model.fit( X_train, y_train )

    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:, 1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred

    # Accumulate test set predictions
    y_test_pred_all[:, i] = fit_model.predict_proba(X_test)[:, 1]

    # Free the per-fold frames before the next fold allocates its own copies
    del X_test, X_train, X_valid, y_train

y_test_pred = y_test_pred_all.mean(axis=1)   # Average test set predictions

print( "\nGini for full training set:" )
# The bare expression used before showed nothing because it was not the last
# statement of the cell; print the out-of-fold score explicitly.
print( eval_gini(y, y_valid_pred) )

# Save the out-of-fold predictions
val = pd.DataFrame()
val['id'] = id_train
val['target'] = y_valid_pred.values
val.to_csv('xgb_valid.csv', float_format='%.6f', index=False)

# Create submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_test_pred
sub.to_csv('submission1.csv', float_format='%.6f', index=False)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0
Fold  0
  Gini =  0.2703606613911892

Fold  1
  Gini =  0.270001302590679

Fold  2
  Gini =  0.2614763598190103

Fold  3
  Gini =  0.2863712227339569

Fold  4
  Gini =  0.27514977994995016

Gini for full training set:


In [9]:
# Blend the per-fold test predictions by averaging their normalised ranks
# instead of their raw probabilities.
fold_preds = pd.DataFrame(y_test_pred_all)

test_df = pd.DataFrame()
test_df['id'] = id_test
test_df['target'] = (fold_preds.rank() / fold_preds.shape[0]).mean(axis=1).values
# Write the rank-averaged frame. The previous code wrote `sub`, i.e. the
# mean-averaged submission already saved as submission1.csv.
test_df.to_csv('submission2.csv', float_format='%.6f', index=False)

##LB 0.282

In [7]:
test_df.shape

(892816, 2)

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

Fold  0
  Gini =  0.2854809390812456

Fold  1
  Gini =  0.2811618880082646

Fold  2
  Gini =  0.2752654261724574

Fold  3
  Gini =  0.29924437297640194

Fold  4
  Gini =  0.285688230990234

Gini for full training set

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
import data_util 
from bayes_opt import BayesianOptimization


@jit
def eval_gini(y_true, y_prob):
    """Return the Gini score of the scores ``y_prob`` against 0/1 labels ``y_true``.

    Labels are ordered by ascending score and then walked from the highest
    score downwards. For every positive label the number of negatives already
    seen (i.e. ranked above it) is accumulated; the result is
    ``1 - 2 * accumulated / (n_pos * n_neg)``.

    The denominator is zero when only one class is present in ``y_true``.
    """
    labels = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(labels)
    positives = 0
    negatives_above = 0
    misranked = 0
    idx = total - 1
    while idx >= 0:
        label = labels[idx]
        positives += label
        misranked += label * negatives_above
        negatives_above += 1 - label
        idx -= 1
    return 1 - 2 * misranked / (positives * (total - positives))

# Load the train and test frames through the project-local data_util module.
train_df = data_util.load_train_data()
test_df = data_util.load_test_data()

# Columns fed to the model. The trailing comment on each entry records two
# scores for that column.
# NOTE(review): presumably the column's importance and the importance of a
# shuffled "shadow" copy from an earlier feature-selection run — confirm.
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations
# Column pairs that are joined into one label-encoded feature further down.
# The same list is assigned again right before the loop that consumes it.
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

# Pull the identifiers and the label out before engineering features
y = train_df['target']
id_train = train_df['id'].values
id_test = test_df['id'].values

start = time.time()

combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]
for n_c, (f1, f2) in enumerate(combs):
    name1 = "_plus_".join((f1, f2))
    elapsed_minutes = (time.time() - start) / 60
    print('current feature %60s %4d in %5.1f' % (name1, n_c + 1, elapsed_minutes), end='')
    print('\r' * 75, end='')

    # Join the two columns as text: "<f1 value>_<f2 value>"
    train_df[name1] = train_df[f1].apply(str) + "_" + train_df[f2].apply(str)
    test_df[name1] = test_df[f1].apply(str) + "_" + test_df[f2].apply(str)

    # Label-encode over the union of train and test values so that both
    # frames share one integer mapping
    train_labels = list(train_df[name1].values)
    test_labels = list(test_df[name1].values)
    lbl = LabelEncoder().fit(train_labels + test_labels)
    train_df[name1] = lbl.transform(train_labels)
    test_df[name1] = lbl.transform(test_labels)

    train_features.append(name1)
    
# Restrict both frames to the selected feature columns.
X = train_df[train_features]
test_df = test_df[train_features]

# Placeholders carried over from the CV cell; nothing below in this cell reads them.
y_valid_pred = 0*y
y_test_pred = 0

# Full training set as a DMatrix; xgb_evaluate hands it to xgb.cv.
xgbtrain = xgb.DMatrix(X, label= y)

# Set up classifier

# Maximum boosting rounds for each xgb.cv call.
num_rounds = 3000
# Seed used both in the booster params and as the xgb.cv seed.
random_state = 2000
# Optimiser budget passed to xgbBO.maximize (n_iter / init_points).
num_iter = 25
init_points = 5
# Base booster parameters; xgb_evaluate adds the tuned values to this same dict.
# NOTE(review): 'verbose_eval' is normally an argument of xgb.train / xgb.cv
# rather than a booster parameter — confirm it has any effect here.
params = {
    'silent': 1,
    'objective':'binary:logistic',
    'eval_metric': 'auc',
    'verbose_eval': True,
    'seed': random_state
}

def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 subsample,
                 gamma,
                 reg_lambda,
                 reg_alpha):
    """Objective for the Bayesian optimiser: cross-validated AUC of one parameter point.

    The proposed values are sanitised (min_child_weight truncated to int,
    the sampling ratios clamped to [0, 1], the remaining ones floored at 0)
    and written into the module-level ``params`` dict together with fixed
    ``eta`` and ``max_depth``. ``xgb.cv`` is then run on ``xgbtrain`` with
    5 folds and early stopping after 50 rounds.

    Returns the ``test-auc-mean`` value of the last row of the CV result.
    """
    params.update({
        'min_child_weight': int(min_child_weight),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'subsample': max(min(subsample, 1), 0),
        # gamma: minimum loss reduction required to make a further partition
        # on a leaf node of the tree. The larger, the more conservative the
        # algorithm will be.
        'gamma': max(gamma, 0),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0),
        'eta': 0.03,
        'max_depth': 6,
    })

    cv_result = xgb.cv(
        params,
        xgbtrain,
        num_boost_round=num_rounds,
        nfold=5,
        seed=random_state,
        early_stopping_rounds=50,
    )

    train_auc = cv_result['train-auc-mean'].iloc[-1]
    valid_auc = cv_result['test-auc-mean'].iloc[-1]
    summary = (len(cv_result), train_auc, valid_auc, train_auc - valid_auc,
               train_auc * 2 - 1, valid_auc * 2 - 1)
    print(' Stopped after %d iterations with train-auc = %f val-auc = %f ( diff = %f ) train-gini = %f val-gini = %f' % summary)

    return valid_auc


# Search bounds handed to the optimiser, one pair per tuned parameter
search_bounds = {
    'min_child_weight': (0, 20),
    'colsample_bytree': (0.1, 1),
    'subsample': (0.5, 1),
    'gamma': (0, 10),
    'reg_lambda': (0, 10),
    'reg_alpha': (0, 10),
}
xgbBO = BayesianOptimization(xgb_evaluate, search_bounds)

# Hand-picked points evaluated before the search starts; the i-th entries of
# the lists together form the i-th point
seed_points = {
    'gamma':                [0.5, 8, 0.2, 9, 0.5, 8, 0.2, 9],
    'min_child_weight':     [0.2, 0.2, 0.2, 0.2, 12, 12, 12, 12],
    'subsample':            [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
    'colsample_bytree':     [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
    'reg_lambda':           [0.1, 0.5, 1, 1.5, 0.1, 0.5, 1, 1.5],
    'reg_alpha':            [0.1, 0.5, 1, 1.5, 0.1, 0.5, 1, 1.5],
}
xgbBO.explore(seed_points)

xgbBO.maximize(init_points=init_points, n_iter=num_iter)


current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   min_child_weight |   reg_alpha |   reg_lambda |   subsample | 
 Stopped after 325 iterations with train-auc = 0.705605 val-auc = 0.641126 ( diff = 0.064479 ) train-gini = 0.411210 val-gini = 0.282252
    1 | 05m57s | [35m   0.64113[0m | [32m            0.6000[0m | [32m   0.5000[0m | [32m            0.2000[0m | [32m     0.1000[0m | [32m      0.1000[0m | [32m     0.6000[0m | 
 Stopped after 560 iterations with train-auc = 0.694964 val-auc = 0.642879 ( diff = 0.052085 ) train-gini = 0.389928 val-gini = 0.285758
    2 | 10m23s | [35m   0.64288[0m | [32m            0.8000[0m | [32m   8.0000[0m | [32m            0.2000[0m | [32m     0.5000[0m | [32m      0.5

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   16 | 16m47s |    0.64316 |             0.8032 |    9.6860 |             0.9558 |      0.0260 |       9.8890 |      0.6319 | 
 Stopped after 396 iterations with train-auc = 0.689744 val-auc = 0.642290 ( diff = 0.047454 ) train-gini = 0.379489 val-gini = 0.284580


  " state: %s" % convergence_dict)


   17 | 08m07s |    0.64229 |             0.9184 |    0.1238 |             0.2118 |      8.8873 |       9.7835 |      0.9941 | 
 Stopped after 3000 iterations with train-auc = 0.641711 val-auc = 0.636260 ( diff = 0.005451 ) train-gini = 0.283422 val-gini = 0.272520


  " state: %s" % convergence_dict)


   18 | 27m27s |    0.63626 |             0.1020 |    9.9536 |            19.9692 |      9.3889 |       9.0639 |      0.5265 | 
 Stopped after 409 iterations with train-auc = 0.687393 val-auc = 0.642048 ( diff = 0.045345 ) train-gini = 0.374786 val-gini = 0.284095


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   19 | 09m05s |    0.64205 |             0.8341 |    0.8274 |             0.2583 |      9.7055 |       0.0398 |      0.5119 | 
 Stopped after 387 iterations with train-auc = 0.696906 val-auc = 0.642523 ( diff = 0.054384 ) train-gini = 0.393813 val-gini = 0.285046


  " state: %s" % convergence_dict)


   20 | 07m31s |    0.64252 |             0.8373 |    2.1814 |            19.9216 |      0.7211 |       0.1966 |      0.9909 | 
 Stopped after 325 iterations with train-auc = 0.692765 val-auc = 0.641853 ( diff = 0.050912 ) train-gini = 0.385530 val-gini = 0.283706


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   21 | 07m06s |    0.64185 |             0.9996 |    2.2988 |             4.2164 |      1.2859 |       4.4718 |      0.9978 | 
 Stopped after 2053 iterations with train-auc = 0.667644 val-auc = 0.642621 ( diff = 0.025022 ) train-gini = 0.335288 val-gini = 0.285243
   22 | 29m19s |    0.64262 |             0.4503 |    9.9117 |            19.4087 |      2.6403 |       0.3963 |      0.5040 | 
 Stopped after 439 iterations with train-auc = 0.690181 val-auc = 0.642328 ( diff = 0.047854 ) train-gini = 0.380362 val-gini = 0.284655


  " state: %s" % convergence_dict)


   23 | 07m22s |    0.64233 |             0.4938 |    0.0836 |            18.8760 |      0.0207 |       9.7771 |      0.5218 | 
 Stopped after 453 iterations with train-auc = 0.686038 val-auc = 0.642118 ( diff = 0.043920 ) train-gini = 0.372075 val-gini = 0.284236
   24 | 09m03s |    0.64212 |             0.7326 |    0.7545 |            19.4815 |      9.7535 |       0.0422 |      0.5150 | 
 Stopped after 420 iterations with train-auc = 0.701860 val-auc = 0.642568 ( diff = 0.059291 ) train-gini = 0.403719 val-gini = 0.285137


  " state: %s" % convergence_dict)


   25 | 08m54s |    0.64257 |             0.7529 |    0.4131 |             0.0548 |      0.5871 |       9.9772 |      0.5610 | 


In [10]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
import data_util 
from bayes_opt import BayesianOptimization


@jit
def eval_gini(y_true, y_prob):
    """Return the Gini score of the scores ``y_prob`` against 0/1 labels ``y_true``.

    Labels are ordered by ascending score and then walked from the highest
    score downwards. For every positive label the number of negatives already
    seen (i.e. ranked above it) is accumulated; the result is
    ``1 - 2 * accumulated / (n_pos * n_neg)``.

    The denominator is zero when only one class is present in ``y_true``.
    """
    labels = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(labels)
    positives = 0
    negatives_above = 0
    misranked = 0
    idx = total - 1
    while idx >= 0:
        label = labels[idx]
        positives += label
        misranked += label * negatives_above
        negatives_above += 1 - label
        idx -= 1
    return 1 - 2 * misranked / (positives * (total - positives))

# Load the train and test frames through the project-local data_util module.
train_df = data_util.load_train_data()
test_df = data_util.load_test_data()

# Columns fed to the model. The trailing comment on each entry records two
# scores for that column.
# NOTE(review): presumably the column's importance and the importance of a
# shuffled "shadow" copy from an earlier feature-selection run — confirm.
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations
# Column pairs that are joined into one label-encoded feature further down.
# The same list is assigned again right before the loop that consumes it.
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

# Pull the identifiers and the label out before engineering features
y = train_df['target']
id_train = train_df['id'].values
id_test = test_df['id'].values

start = time.time()

combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]
for n_c, (f1, f2) in enumerate(combs):
    name1 = "_plus_".join((f1, f2))
    elapsed_minutes = (time.time() - start) / 60
    print('current feature %60s %4d in %5.1f' % (name1, n_c + 1, elapsed_minutes), end='')
    print('\r' * 75, end='')

    # Join the two columns as text: "<f1 value>_<f2 value>"
    train_df[name1] = train_df[f1].apply(str) + "_" + train_df[f2].apply(str)
    test_df[name1] = test_df[f1].apply(str) + "_" + test_df[f2].apply(str)

    # Label-encode over the union of train and test values so that both
    # frames share one integer mapping
    train_labels = list(train_df[name1].values)
    test_labels = list(test_df[name1].values)
    lbl = LabelEncoder().fit(train_labels + test_labels)
    train_df[name1] = lbl.transform(train_labels)
    test_df[name1] = lbl.transform(test_labels)

    train_features.append(name1)
    
# Restrict both frames to the selected feature columns.
X = train_df[train_features]
test_df = test_df[train_features]

# Placeholders carried over from the CV cell; xgb_evaluate below does not write to them.
y_valid_pred = 0*y
y_test_pred = 0

# Columns whose name contains "_cat"; xgb_evaluate target-encodes these in each fold.
f_cats = [f for f in X.columns if "_cat" in f]

# Set up folds
# NOTE(review): KFold is not among this cell's imports; presumably it is still
# in scope from an earlier cell — confirm. xgb_evaluate builds its own local
# `kf`, so this module-level splitter is not the one it iterates over.
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(0)

# Set up classifier

                        #scale_pos_weight=1.6,

# Run CV
# Set up classifier

# num_rounds is not referenced by this cell's xgb_evaluate, which uses MAX_ROUNDS instead.
num_rounds = 3000
# Seed stored in the params dict below.
random_state = 2000
# Optimiser budget passed to xgbBO.maximize (n_iter / init_points).
num_iter = 25
init_points = 5
# Base parameters; xgb_evaluate adds the tuned values to this same dict and
# unpacks it into XGBClassifier.
# NOTE(review): 'verbose_eval' is normally an argument of xgb.train / xgb.cv
# rather than a model parameter — confirm it has any effect here.
params = {
    'silent': 1,
    'objective':'binary:logistic',
    'eval_metric': 'auc',
    'verbose_eval': True,
    'seed': random_state
}

def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 subsample,
                 gamma,
                 reg_lambda,
                 reg_alpha):
    """Objective for the Bayesian optimiser: mean out-of-fold Gini of one parameter point.

    The proposed values are sanitised (min_child_weight truncated to int,
    the sampling ratios clamped to [0, 1], the remaining ones floored at 0)
    and written into the module-level ``params`` dict together with fixed
    ``eta`` and ``max_depth``. An XGBClassifier built from those parameters is
    fit on each of K shuffled folds, with the ``f_cats`` columns target-encoded
    from the fold's training rows, and scored on the held-out rows with
    ``eval_gini``.

    Returns the mean of the per-fold Gini scores.
    """
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['subsample'] = max(min(subsample, 1), 0)
    # gamma: minimum loss reduction required to make a further partition on a
    # leaf node of the tree. The larger, the more conservative the algorithm.
    params['gamma'] = max(gamma, 0)
    params['reg_lambda'] = max(reg_lambda, 0)
    params['reg_alpha'] = max(reg_alpha, 0)
    params['eta'] = 0.03
    params['max_depth'] = 6

    model = XGBClassifier(
        n_estimators=MAX_ROUNDS,
        **params
    )
    # Fixed random_state: every parameter point is scored on the same splits
    kf = KFold(n_splits=K, random_state=1, shuffle=True)

    fold_ginis = []
    for i, (train_index, test_index) in enumerate(kf.split(train_df)):

        # Create data for this fold
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
        X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
        # NOTE(review): the test frame is copied and encoded only because
        # target_encode requires a tst_series; no test predictions are made
        # here, so this work could be dropped if target_encode allowed it.
        X_test = test_df.copy()
        print( "\nFold ", i)

        for f in f_cats:
            X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                trn_series=X_train[f],
                val_series=X_valid[f],
                tst_series=X_test[f],
                target=y_train,
                min_samples_leaf=200,
                smoothing=10,
                noise_level=0
            )

        # Run model for this fold
        if OPTIMIZE_ROUNDS:
            eval_set = [(X_valid, y_valid)]
            fit_model = model.fit( X_train, y_train,
                                   eval_set=eval_set,
                                   eval_metric='auc',
                                   early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                                   verbose=False
                                 )
            print( "  Best N trees = ", model.best_ntree_limit )
            # best_score is the validation AUC (eval_metric='auc');
            # report it as Gini = 2 * AUC - 1 to match the label
            print( "  Best gini = ", 2 * model.best_score - 1 )
        else:
            fit_model = model.fit( X_train, y_train )

        # Score the held-out rows once and reuse the value for both the
        # log line and the average
        pred = fit_model.predict_proba(X_valid)[:, 1]
        fold_gini = eval_gini(y_valid, pred)
        print( "  Gini = ", fold_gini )
        fold_ginis.append(fold_gini)

    # Average over the folds actually run instead of a hard-coded 5
    avg_gini = sum(fold_ginis) / len(fold_ginis)
    print('The avg gini value is {}'.format(avg_gini))

    return avg_gini

# Search bounds handed to the optimiser, one pair per tuned parameter
search_bounds = {
    'min_child_weight': (0, 20),
    'colsample_bytree': (0.1, 1),
    'subsample': (0.5, 1),
    'gamma': (0, 10),
    'reg_lambda': (0, 10),
    'reg_alpha': (0, 10),
}
xgbBO = BayesianOptimization(xgb_evaluate, search_bounds)

# Hand-picked points evaluated before the search starts; the i-th entries of
# the lists together form the i-th point
seed_points = {
    'gamma':                [0.5, 8, 0.2, 9, 0.5, 8, 0.2, 9],
    'min_child_weight':     [0.2, 0.2, 0.2, 0.2, 12, 12, 12, 12],
    'subsample':            [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
    'colsample_bytree':     [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
    'reg_lambda':           [0.1, 0.5, 1, 1.5, 0.1, 0.5, 1, 1.5],
    'reg_alpha':            [0.1, 0.5, 1, 1.5, 0.1, 0.5, 1, 1.5],
}
xgbBO.explore(seed_points)

xgbBO.maximize(init_points=init_points, n_iter=num_iter, acq='ei', xi=0.05)


current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   min_child_weight |   reg_alpha |   reg_lambda |   subsample | 

Fold  0
  Gini =  0.153944495242336

Fold  1
  Gini =  0.1542559131554726

Fold  2
  Gini =  0.14905018530024072

Fold  3
  Gini =  0.161701455099305

Fold  4
  Gini =  0.16959644611209712
The avg gini value is 0.1577096989818903
    1 | 182m39s | [35m   0.15771[0m | [32m            0.6000[0m | [32m   0.5000[0m | [32m            0.2000[0m | [32m     0.1000[0m | [32m      0.1000[0m | [32m     0.6000[0m | 

Fold  0
  Gini =  0.2494474541692855

Fold  1


KeyboardInterrupt: 