In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import plot_importance
from matplotlib import pyplot as plt

# Compute Gini metric functions

In [2]:
# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
def eval_gini(y_true, y_prob):
    """Normalized Gini coefficient of predicted scores against binary labels.

    Vectorized NumPy version of the original per-element loop. For every
    positive sample it counts the negatives that are ranked above it (i.e.
    that received a higher score) and normalizes by the number of
    positive/negative pairs, giving ``1`` for a perfect ranking and ``-1``
    for a perfectly inverted one.

    Parameters
    ----------
    y_true : array-like
        Labels (0/1). Converted to float64 so the accumulation does not lose
        precision when the labels arrive as float32.
    y_prob : array-like
        Predicted scores; only their ordering matters.

    Returns
    -------
    float
        The Gini coefficient, or ``nan`` when it is undefined (empty input,
        or labels that contain a single class only).
    """
    labels = np.asarray(y_true, dtype=np.float64)
    scores = np.asarray(y_prob)
    n = labels.shape[0]

    # Labels ordered from the highest score down to the lowest.
    ranked = labels[np.argsort(scores)][::-1]

    ntrue = ranked.sum()
    pair_count = ntrue * (n - ntrue)
    if pair_count == 0:
        # No positive/negative pair exists, so the ratio is undefined.
        return float('nan')

    negatives = 1.0 - ranked
    # Exclusive running total: negatives strictly ahead of each position.
    negatives_ahead = np.cumsum(negatives) - negatives
    misordered = np.dot(ranked, negatives_ahead)
    return float(1.0 - 2.0 * misordered / pair_count)
    
    
# Functions from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    """Custom evaluation callback: negated Gini of ``preds`` vs. the labels of ``dtrain``.

    The labels are read with ``dtrain.get_label()`` and the score is negated
    before being returned as a one-element list of ``(name, value)`` pairs.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return [('gini', -score)]

# Target encoding functions

In [3]:
def add_noise(series, noise_level):
    """Scale every element of ``series`` by ``1 + noise_level * N(0, 1)``.

    One standard-normal draw is taken per element from NumPy's global random
    state, so the result is reproducible only after ``np.random.seed``.
    """
    gaussian = np.random.randn(len(series))
    multiplier = 1 + noise_level * gaussian
    return series * multiplier


def _encode_with_averages(series, averages, prior, encoded_name):
    """Replace each category of ``series`` by its smoothed target average.

    ``averages`` is a two-column frame: the category column (named like
    ``series``) and an ``'average'`` column. Categories missing from
    ``averages`` fall back to ``prior``.
    """
    encoded = pd.merge(
        series.to_frame(series.name),
        averages,
        on=series.name,
        how='left')['average'].rename(encoded_name).fillna(prior)
    # pd.merge does not keep the index so restore it
    encoded.index = series.index
    return encoded


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed target (mean) encoding of a categorical feature.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    The per-category statistics are computed from ``trn_series``/``target``
    only and then looked up for the validation and test series.

    trn_series : training categorical feature as a pd.Series
    val_series : validation categorical feature as a pd.Series, or None to skip it
    tst_series : test categorical feature as a pd.Series, or None to skip it
    target : target data as a pd.Series (same length as trn_series)
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : relative std-dev of the multiplicative Gaussian noise that
        ``add_noise`` applies to every returned series (0 leaves values unchanged)

    Returns a tuple ``(trn_encoded, val_encoded, tst_encoded)``; each element
    is a pd.Series named ``<feature>_mean`` carrying the index of its input,
    or None when the corresponding input series was None.
    """
    assert len(trn_series) == len(target)
    # All series must describe the same feature, otherwise the lookup is meaningless.
    assert val_series is None or val_series.name == trn_series.name
    assert tst_series is None or tst_series.name == trn_series.name

    # Per-category target mean and sample count, from the training data only.
    stats = (pd.concat([trn_series, target], axis=1)
             .groupby(by=trn_series.name)[target.name]
             .agg(["mean", "count"]))
    # Sigmoid weight in (0, 1): approaches 1 as the category count grows.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    # Global target mean, used as the prior and as the value for unseen categories.
    prior = target.mean()
    # The bigger the count the less the prior is taken into account.
    averages = (prior * (1 - weight) + stats["mean"] * weight).rename('average').reset_index()

    encoded_name = trn_series.name + '_mean'
    encoded = [
        None if series is None
        else _encode_with_averages(series, averages, prior, encoded_name)
        for series in (trn_series, val_series, tst_series)
    ]
    # Noise is drawn in trn, val, tst order after all lookups are done.
    return tuple(
        None if series is None else add_noise(series, noise_level)
        for series in encoded
    )

# Read data

In [4]:
# Load the training set; the literal "-1" is parsed as a missing value (NaN).
# Append .iloc[0:200, :] to the call for a quick smoke run on a small sample.
train_df = pd.read_csv(
    'data/train.csv',
    na_values="-1",
)
# The test set is not loaded in this notebook version:
#test_df = pd.read_csv('data/test.csv', na_values="-1")

In [5]:
# Halve the memory footprint: 64-bit floats/ints are stored as 32-bit.
# Columns of any other dtype are left untouched.
# (If test_df is loaded again, it needs the same treatment.)
for col_name in train_df.columns:
    col_dtype = train_df[col_name].dtype
    if col_dtype == np.float64:
        narrow_dtype = np.float32
    elif col_dtype == np.int64:
        narrow_dtype = np.int32
    else:
        continue
    train_df[col_name] = train_df[col_name].astype(narrow_dtype)
        
        

# Create the new features

In [6]:

# from olivier
# Base feature set. The trailing numbers are kept from the source kernel as
# "<score> / shadow <score>" -- presumably a feature-importance score next to
# the score of a shuffled ("shadow") copy; verify against olivier's kernel.
train_features = [
    "ps_car_13",      # 1571.65 / shadow  609.23
    "ps_reg_03",      # 1408.42 / shadow  511.15
    "ps_ind_05_cat",  # 1387.87 / shadow   84.72
    "ps_ind_03",      # 1219.47 / shadow  230.55
    "ps_ind_15",      #  922.18 / shadow  242.00
    "ps_reg_02",      #  920.65 / shadow  267.50
    "ps_car_14",      #  798.48 / shadow  549.58
    "ps_car_12",      #  731.93 / shadow  293.62
    "ps_car_01_cat",  #  698.07 / shadow  178.72
    "ps_car_07_cat",  #  694.53 / shadow   36.35
    "ps_ind_17_bin",  #  620.77 / shadow   23.15
    "ps_car_03_cat",  #  611.73 / shadow   50.67
    "ps_reg_01",      #  598.60 / shadow  178.57
    "ps_car_15",      #  593.35 / shadow  226.43
    "ps_ind_01",      #  547.32 / shadow  154.58
    "ps_ind_16_bin",  #  475.37 / shadow   34.17
    "ps_ind_07_bin",  #  435.28 / shadow   28.92
    "ps_car_06_cat",  #  398.02 / shadow  212.43
    "ps_car_04_cat",  #  376.87 / shadow   76.98
    "ps_ind_06_bin",  #  370.97 / shadow   36.13
    "ps_car_09_cat",  #  214.12 / shadow   81.38
    "ps_car_02_cat",  #  203.03 / shadow   26.67
    "ps_ind_02_cat",  #  189.47 / shadow   65.68
    "ps_car_11",      #  173.28 / shadow   76.45
    "ps_car_05_cat",  #  172.75 / shadow   62.92
    "ps_calc_09",     #  169.13 / shadow  129.72
    "ps_calc_05",     #  148.83 / shadow  120.68
    "ps_ind_08_bin",  #  140.73 / shadow   27.63
    "ps_car_08_cat",  #  120.87 / shadow   28.82
    "ps_ind_09_bin",  #  113.92 / shadow   27.05
    "ps_ind_04_cat",  #  107.27 / shadow   37.43
    "ps_ind_18_bin",  #   77.42 / shadow   25.97
    "ps_ind_12_bin",  #   39.67 / shadow   15.52
    "ps_ind_14",      #   37.37 / shadow   16.65
    # "nulls_count"   # not analyzed
]



# Columns combined pairwise in the feature-engineering step, grouped by family.
numerical_features = (
    ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15']
    + ['ps_reg_%02d' % k for k in (1, 2, 3)]        # ps_reg_01 .. ps_reg_03
    + ['ps_car_%02d' % k for k in range(11, 16)]    # ps_car_11 .. ps_car_15
    + ['ps_calc_%02d' % k for k in range(1, 15)]    # ps_calc_01 .. ps_calc_14
)

In [7]:
# Pairwise sums of the numerical features.
# Addition is symmetric, so each unordered pair is built once (f1 before f2);
# the mirrored column sum_f2_f1 would be an exact duplicate of sum_f1_f2.
# NOTE: order-dependent combinations (difference, ratio) would need both
# orderings if they are ever added here.
pair_sums = {}
for pos, f1 in enumerate(numerical_features):
    print(f1)
    for f2 in numerical_features[pos + 1:]:
        pair_sums['sum_' + f1 + '_' + f2] = (train_df[f1] + train_df[f2]).astype(np.float32)

# Attach all new columns in a single concat instead of one insert per column.
# Columns left over from a previous run of this cell are replaced, so the cell
# can be re-executed safely.
train_df = pd.concat(
    [train_df.drop(columns=list(pair_sums), errors='ignore'),
     pd.DataFrame(pair_sums, index=train_df.index)],
    axis=1)

# Register the new features once, even when the cell is re-run.
known_features = set(train_features)
train_features.extend(name for name in pair_sums if name not in known_features)
del pair_sums, known_features

ps_ind_01
ps_ind_03
ps_ind_14
ps_ind_15
ps_reg_01
ps_reg_02
ps_reg_03
ps_car_11
ps_car_12
ps_car_13
ps_car_14
ps_car_15
ps_calc_01
ps_calc_02
ps_calc_03
ps_calc_04
ps_calc_05
ps_calc_06
ps_calc_07
ps_calc_08
ps_calc_09
ps_calc_10
ps_calc_11
ps_calc_12
ps_calc_13
ps_calc_14


# Preprocess data

In [8]:
# Split the frame into model inputs, target and row identifiers.
X = train_df[train_features]
y = train_df['target']
id_train = train_df['id'].values
# The test set is not processed in this notebook version:
#id_test = test_df['id'].values
#test_df = test_df[train_features]

# Names of the selected columns that are categorical (contain "_cat").
f_cats = [col_name for col_name in X.columns if "_cat" in col_name]

# Fit XGBoost

In [None]:
# Hyper-parameters of the first XGBoost model, passed verbatim to XGBClassifier.
xgb1_params = {
    'objective': "binary:logistic",
    'n_estimators': 400,  # MAX_ROUNDS
    'learning_rate': 0.07,
    'max_depth': 4,
    'subsample': 0.80,
    'colsample_bytree': 0.80,
    'min_child_weight': 6,
    'gamma': 10,
    'reg_alpha': 8,
    'reg_lambda': 1.5,
    'scale_pos_weight': 1.6,
    # 'max_delta_step': 0,
    'seed': 0,
}
xgb1 = XGBClassifier(**xgb1_params)



In [None]:
# K-fold cross-validation of xgb1: fits one model per fold, dumps it to disk,
# reports train/validation Gini, stores out-of-fold predictions and saves a
# feature-importance plot per fold.

# Out-of-fold predictions. Allocated as floats: `0*y` would be an integer
# Series, and the predicted probabilities are written into it fold by fold.
y_valid_pred = pd.Series(np.zeros(len(y)), index=y.index, name=y.name)
#y_test_pred = 0

model_name = 'xgb1'   # prefix of the dumped model files
l_gini_val = []       # validation Gini of every fold

# Set up folds
K = 5
kf = KFold(n_splits = K, random_state = 0, shuffle = True)
# Also try with stratified
np.random.seed(0)

# Run CV

for i, (train_index, test_index) in enumerate(kf.split(train_df)):

    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    #X_test = test_df.copy()
    print( "\nFold ", i)

    # fit() retrains the shared estimator from scratch for this fold
    fit_model = xgb1.fit(X_train, y_train)
    # Save a text dump of the trained trees (get_booster() is the accessor
    # for the underlying Booster of the sklearn wrapper).
    booster = fit_model.get_booster()
    booster.dump_model(model_name + '_fold' + str(i) + '.dump',  with_stats=True)

    # Train error
    pred = fit_model.predict_proba(X_train)[:,1]
    print("TrainGini = ", str(eval_gini(y_train, pred)))
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    fold_gini = eval_gini(y_valid, pred)
    l_gini_val.append(fold_gini)
    print("Val Gini = ", str(fold_gini))
    y_valid_pred.iloc[test_index] = pred

    # Accumulate test set predictions
    #y_test_pred += fit_model.predict_proba(X_test)[:,1]

    # Plot importance plot (top features only, the full set is too large to read)
    fig, ax = plt.subplots(1,1,figsize=(10, 12))
    plot_importance(booster=booster, ax=ax, max_num_features=40)
    fig.savefig('xgb_' + str(i))
    # Release the figure; otherwise one open figure per fold accumulates
    plt.close(fig)

    del X_train, X_valid, y_train
    gc.collect()

    print('---------------------------------------END FOLD------------------------------------------')

#y_test_pred /= K  # Average test set predictions

print('END KFOLD')
print("Mean validation Gini over folds:", np.mean(l_gini_val))
print( "\nGini for full training set:" )
print(eval_gini(y, y_valid_pred))


Fold  0


# LightGBM

# Random Forest