# XGBoost Modeling 커널 필사
 + Link: [XGBoost CV](https://www.kaggle.com/aharless/xgboost-cv-lb-284/notebook)

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc

import warnings
warnings.filterwarnings('ignore')

## Evaluation metric: Gini index

In [4]:
@jit
def eval_gini(y_true, y_prob):
    """Return the normalized Gini score of binary labels ranked by ``y_prob``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array-like of predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        1.0 when every positive is scored above every negative, -1.0 when
        the ranking is fully inverted. The denominator is
        ``n_pos * n_neg``, so the result is undefined (division by zero)
        when only one class is present.
    """
    # Labels reordered by ascending predicted score.
    labels_by_score = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(labels_by_score)
    positives = 0        # positives visited so far
    negatives_above = 0  # negatives visited so far, i.e. with a higher score
    misordered = 0       # count of (negative above, positive below) pairs
    # Walk from the highest score down to the lowest.
    for label in labels_by_score[::-1]:
        positives += label
        misordered += label * negatives_above
        negatives_above += 1 - label
    return 1 - 2 * misordered / (positives * (total - positives))

## Target Encoding
Target encode categorical variables with **high cardinality**

 + [Target Encoding 원리에 대한 참고자료](https://towardsdatascience.com/all-about-target-encoding-d356c4e9e82)
 + 아래 코드는 Target Encoding 과정을 모두 풀어서 코딩했지만, 대안으로 `TargetEncoder` 함수 사용하면 더 쉽고 빠르게 Target Encoding 가능하다.

In [2]:
def gini_xgb(preds, dtrain):
    """Custom XGBoost evaluation function that reports the negated Gini score.

    Parameters
    ----------
    preds : predicted scores for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()`` (the true labels).

    Returns
    -------
    list of (str, float)
        A single ``('gini', value)`` pair where ``value`` is minus the
        result of ``eval_gini`` (presumably negated so that a lower value
        means a better model for early stopping -- confirm).
    """
    negated_gini = -eval_gini(dtrain.get_label(), preds)
    return [('gini', negated_gini)]

def add_noise(series, noise_level):
    """Scale every value of ``series`` by ``1 + noise_level * z``, ``z ~ N(0, 1)``.

    Noise is drawn from NumPy's global random state. With ``noise_level = 0``
    (the value this notebook passes) the values come back unchanged.
    """
    return series * (1 + noise_level * np.random.randn(len(series)))


def _encode_with_category_means(series, encoding, prior, name):
    """Map each category in ``series`` to its encoded value; unknown ones get ``prior``."""
    return series.map(encoding).fillna(prior).rename(name)


def target_encode(trn_series=None,
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """Smoothed target (mean) encoding of one categorical feature.

    The per-category target mean is computed on ``trn_series``/``target`` and
    blended with the global target mean using a sigmoid weight of the
    category's row count. The resulting mapping is applied to the training,
    validation and test series.

    Parameters
    ----------
    trn_series : pd.Series
        Training values of the categorical feature.
    val_series : pd.Series
        Validation values of the same feature.
    tst_series : pd.Series
        Test values of the same feature (must share ``trn_series.name``).
    target : pd.Series
        Target aligned with ``trn_series`` (same length).
    min_samples_leaf : int
        Row count at which category mean and prior are weighted equally.
    smoothing : float
        Controls how quickly the weight moves from prior to category mean.
    noise_level : float
        Multiplicative Gaussian noise level passed to ``add_noise``.

    Returns
    -------
    tuple of pd.Series
        ``(train, validation, test)`` encodings, each named
        ``<feature>_mean`` and indexed like the corresponding input.

    Raises
    ------
    AssertionError
        If ``trn_series`` and ``target`` differ in length, or the train and
        test series have different names.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    # Per-category mean and row count of the target (rows are aligned on index).
    stats = (pd.concat([trn_series, target], axis=1)
               .groupby(trn_series.name)[target.name]
               .agg(['mean', 'count']))

    # Sigmoid weight: near 0 for rare categories, near 1 for frequent ones.
    weight = 1 / (1 + np.exp(-(stats['count'] - min_samples_leaf) / smoothing))

    # Global probability of the target, used for rare/unseen categories.
    prior = target.mean()
    encoding = prior * (1 - weight) + stats['mean'] * weight

    encoded_name = trn_series.name + '_mean'
    encoded = [
        _encode_with_category_means(part, encoding, prior, encoded_name)
        for part in (trn_series, val_series, tst_series)
    ]
    # Noise is applied in train, validation, test order.
    return tuple(add_noise(part, noise_level) for part in encoded)

## Load data and select features

In [5]:
# Read the train and test tables; cells equal to "-1" are parsed as NaN (na_values).
train_df = pd.read_csv('../input/porto-seguro-safe-driver-prediction/train.csv', na_values="-1") # append .iloc[0:200,:] to work on a small sample while debugging
test_df = pd.read_csv('../input/porto-seguro-safe-driver-prediction/test.csv', na_values="-1")

In [6]:
# Columns fed to the model, in the order they are listed here.
# Each trailing comment carries two scores recorded for the feature: a value
# and a "shadow" value (presumably feature importance vs. shadow-feature
# importance from an earlier selection run -- confirm against the reference kernel).
train_features = [
    "ps_car_13",      # 1571.65 / shadow  609.23
    "ps_reg_03",      # 1408.42 / shadow  511.15
    "ps_ind_05_cat",  # 1387.87 / shadow   84.72
    "ps_ind_03",      # 1219.47 / shadow  230.55
    "ps_ind_15",      #  922.18 / shadow  242.00
    "ps_reg_02",      #  920.65 / shadow  267.50
    "ps_car_14",      #  798.48 / shadow  549.58
    "ps_car_12",      #  731.93 / shadow  293.62
    "ps_car_01_cat",  #  698.07 / shadow  178.72
    "ps_car_07_cat",  #  694.53 / shadow   36.35
    "ps_ind_17_bin",  #  620.77 / shadow   23.15
    "ps_car_03_cat",  #  611.73 / shadow   50.67
    "ps_reg_01",      #  598.60 / shadow  178.57
    "ps_car_15",      #  593.35 / shadow  226.43
    "ps_ind_01",      #  547.32 / shadow  154.58
    "ps_ind_16_bin",  #  475.37 / shadow   34.17
    "ps_ind_07_bin",  #  435.28 / shadow   28.92
    "ps_car_06_cat",  #  398.02 / shadow  212.43
    "ps_car_04_cat",  #  376.87 / shadow   76.98
    "ps_ind_06_bin",  #  370.97 / shadow   36.13
    "ps_car_09_cat",  #  214.12 / shadow   81.38
    "ps_car_02_cat",  #  203.03 / shadow   26.67
    "ps_ind_02_cat",  #  189.47 / shadow   65.68
    "ps_car_11",      #  173.28 / shadow   76.45
    "ps_car_05_cat",  #  172.75 / shadow   62.92
    "ps_calc_09",     #  169.13 / shadow  129.72
    "ps_calc_05",     #  148.83 / shadow  120.68
    "ps_ind_08_bin",  #  140.73 / shadow   27.63
    "ps_car_08_cat",  #  120.87 / shadow   28.82
    "ps_ind_09_bin",  #  113.92 / shadow   27.05
    "ps_ind_04_cat",  #  107.27 / shadow   37.43
    "ps_ind_18_bin",  #   77.42 / shadow   25.97
    "ps_ind_12_bin",  #   39.67 / shadow   15.52
    "ps_ind_14",      #   37.37 / shadow   16.65
]

# Pairs of strongly correlated features that are later merged into one
# combined categorical column each.
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]

## Combine strongly related features and Label Encoding
 1. 설명변수 중 correlation이 강한 변수들의 combination을 생성한다.
 2. combination을 생성한 후 Label Encoding을 한다.

In [8]:
# Keep the row ids and the training target before the frames are narrowed.
# NOTE: test_df is reduced to the feature columns at the end of this cell, so
# the 'id' lookup below only works on freshly loaded data.
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

start = time.time()  # time at which feature construction began (not read in this cell)
for f1, f2 in combs:
    name1 = f1 + '_plus_' + f2
    # Join the string forms of both columns into one combined category key.
    train_df[name1] = train_df[f1].map(str) + '_' + train_df[f2].map(str)
    test_df[name1] = test_df[f1].map(str) + '_' + test_df[f2].map(str)

    # Fit on train + test together so every category in either frame gets a code.
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    # Guard against duplicate feature names (and thus duplicate columns in X)
    # if this loop is executed more than once.
    if name1 not in train_features:
        train_features.append(name1)

X = train_df[train_features]
test_df = test_df[train_features]

# Every column whose name contains '_cat' is target-encoded later; this
# includes the combined columns built from a '*_cat' feature.
f_cats = [f for f in X.columns if '_cat' in f]

In [9]:
# Out-of-fold predictions, one per training row. Created as float64 so the
# predicted probabilities can be written in place; `0*y` would produce an
# integer Series and force an int -> float upcast on assignment.
y_valid_pred = pd.Series(0.0, index=y.index, name=y.name)
# Running sum of the per-fold test-set probabilities (divided by K after the CV loop).
y_test_pred = 0

# Cross Validation + Modeling
 + 참고 커널에서는 KFold CV 방법을 사용했으나, EDA에서 target class의 비율이 매우 불균형했기 때문에 한 fold에 클래스 0에 해당하는 데이터만 들어가는 것을 방지하기 위해 **Stratified KFold**로 바꿔주었다. 그 결과, gini index 값이 감소하였다.

In [10]:
K = 5  # number of cross-validation folds

# Shuffled, class-stratified splitter; the fixed random_state keeps folds repeatable.
kf = StratifiedKFold(n_splits=K, shuffle=True, random_state=1)

# Seed NumPy's global random state (add_noise draws from it).
np.random.seed(0)

In [11]:
# Training configuration.
MAX_ROUNDS = 500             # number of boosting rounds (n_estimators)
OPTIMIZE_ROUNDS = False      # True: fit each fold with early stopping on its validation set
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50   # only read when OPTIMIZE_ROUNDS is True

# Define the XGBoost model; this one instance is passed to fit() in every fold.
model = XGBClassifier(
    n_estimators=MAX_ROUNDS,
    max_depth=4,
    objective="binary:logistic",
    learning_rate=LEARNING_RATE,
    subsample=.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
)

### Process
 1. CV로 데이터를 총 5개의 폴드로 나눈다.
 2. 각 CV마다 target encoding 후 모델을 학습한다. 이 때 early stopping을 적용할지 말지로 모델 학습 방법이 다르다.
 3. Validation set을 예측한 후, 클래스에 대한 예측 확률값을 구한다.
 4. 5개의 fold에서 각각 Test set을 예측한 후, 예측 확률의 평균을 구한다.

In [17]:
# Reset the accumulators so that re-running this cell does not add to results
# left over from a previous run. The out-of-fold Series is float64 so the
# predicted probabilities can be stored without a dtype upcast.
y_valid_pred = pd.Series(0.0, index=y.index, name=y.name)
y_test_pred = 0

for i, (train_idx, valid_idx) in enumerate(kf.split(train_df, y)):
    X_train, X_valid = X.iloc[train_idx, :].copy(), X.iloc[valid_idx, :].copy()
    # kf.split yields positional indices, so select by position (.iloc), not by label.
    y_train, y_valid = y.iloc[train_idx].copy(), y.iloc[valid_idx].copy()
    X_test = test_df.copy()

    print('\nFold ', i)

    # Target-encode each categorical column using this fold's training rows only;
    # the encoded values are added as extra '<feature>_avg' columns.
    for f in f_cats:
        X_train[f + '_avg'], X_valid[f + '_avg'], X_test[f + '_avg'] = target_encode(
            trn_series=X_train[f],
            val_series=X_valid[f],
            tst_series=X_test[f],
            target=y_train,
            min_samples_leaf=200,
            smoothing=10,
            noise_level=0,
        )

    if OPTIMIZE_ROUNDS:
        # NOTE(review): passing eval_metric / early_stopping_rounds to fit() and
        # reading best_ntree_limit depend on the installed xgboost version --
        # confirm before enabling this branch.
        eval_set = [(X_valid, y_valid)]
        fit_model = model.fit(X_train, y_train, eval_set=eval_set,
                              eval_metric=gini_xgb, early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                              verbose=False)
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit(X_train, y_train)

    # Predicted probability of class 1 for the held-out rows.
    pred = fit_model.predict_proba(X_valid)[:, 1]
    # Hand plain NumPy arrays to the jitted metric rather than pandas objects.
    print(' Gini = ', eval_gini(y_valid.values, pred))
    y_valid_pred.iloc[valid_idx] = pred

    # Accumulate this fold's test-set probabilities.
    y_test_pred += fit_model.predict_proba(X_test)[:, 1]

    del X_test, X_train, X_valid, y_train

y_test_pred /= K  # average prediction probability over the K folds

print('\nGini for full training set:')
eval_gini(y.values, y_valid_pred.values)