In [26]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import hmean

import gini
from util_data import DataSet

In [28]:
def target_encode(trn_series,
                  tst_series,
                  target,
                  min_samples_leaf=1,
                  smoothing=1):
    """Replace a categorical feature by a smoothed per-category target mean.

    For every category seen in ``trn_series`` the encoded value is a blend of
    the category's own target mean and the global target mean (the prior).
    The blend weight is a sigmoid of the category count, so frequent
    categories lean on their own mean and rare ones lean on the prior.

    Parameters
    ----------
    trn_series : pd.Series
        Training feature; its ``name`` is used as the merge key.
    tst_series : pd.Series
        Test feature; must have the same ``name`` as ``trn_series``.
    target : pd.Series
        Target values, same length as ``trn_series``. It is concatenated
        column-wise with ``trn_series``, i.e. paired by index.
    min_samples_leaf : int
        Category count at which the category mean and the prior get equal
        weight (sigmoid argument is zero there).
    smoothing : int
        Divisor of the sigmoid argument; larger values flatten the transition
        between prior and category mean.

    Returns
    -------
    (pd.Series, pd.Series)
        Encoded train and test series, both named ``<feature>_mean`` and
        carrying the index of their respective input. Categories with no
        match in the training statistics are filled with the prior.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    feature = trn_series.name
    label = target.name

    # Per-category target mean and number of observations.
    stats = (
        pd.concat([trn_series, target], axis=1)
        .groupby(by=feature)[label]
        .agg(["mean", "count"])
    )

    # Sigmoid weight in (0, 1): grows with the category count.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()

    # The bigger the count, the less the prior is taken into account.
    stats[label] = prior * (1 - weight) + stats["mean"] * weight
    stats.drop(["mean", "count"], axis=1, inplace=True)

    # One lookup table (feature value -> 'average') shared by both merges.
    lookup = stats.reset_index().rename(columns={'index': label, label: 'average'})

    def _encode(series):
        # Left merge keeps the row order of `series`; the merge resets the
        # index, so the caller's index is put back afterwards.
        merged = pd.merge(
            series.to_frame(series.name),
            lookup,
            on=series.name,
            how='left')
        encoded = merged['average'].rename(feature + '_mean').fillna(prior)
        encoded.index = series.index
        return encoded

    return _encode(trn_series), _encode(tst_series)

In [29]:
# Columns dropped from train/test in the fold-setup cell before modelling.
correlated_features = [
    "ps_reg_03",
    "ps_car_13",
]
# Also dropped in the fold-setup cell.
# NOTE(review): presumably columns with many missing values — confirm.
lacunar_features = [
    "ps_car_03_cat",
    "ps_car_05_cat",
]
# Columns that take part in the pairwise "<a>_plus_<b>" combinations.
combination_features = [
    'ps_reg_01',
    'ps_car_02_cat',
    'ps_car_04_cat',
]

In [30]:
# Load both frames through the project-local DataSet helper
# (training set first, then testing set).
data = DataSet()
train, test = data.get_training_set(), data.get_testing_set()

In [32]:
# Pairs of columns whose values are joined into one combined feature.
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]

# `train_features` was appended to in this cell without ever being created,
# so a fresh "Restart & Run All" stopped here with a NameError.
# NOTE(review): the original definition is not visible in the notebook. This
# reconstruction keeps every predictor that the fold-setup cell does not drop
# ("calc" columns, correlated_features, lacunar_features) — confirm it matches
# the intended feature list.
excluded_columns = {"id", "target", *correlated_features, *lacunar_features}
train_features = [
    col for col in train.columns
    if col not in excluded_columns and "calc" not in str(col)
]

for f1, f2 in combs:
    name1 = f1 + "_plus_" + f2
    train[name1] = train[f1].map(str) + "_" + train[f2].map(str)
    test[name1] = test[f1].map(str) + "_" + test[f2].map(str)
    # Label-encode on the union of train and test values so both frames
    # share one integer code per combination.
    lbl = LabelEncoder()
    lbl.fit(list(train[name1].values) + list(test[name1].values))
    train[name1] = lbl.transform(list(train[name1].values))
    test[name1] = lbl.transform(list(test[name1].values))
    # Guarded so that re-running the cell does not list the column twice.
    if name1 not in train_features:
        train_features.append(name1)

# Every column whose name contains 'cat' (this includes the combinations
# built above, since their names embed a '*_cat' column name).
categorical_features = (train.filter(like='cat')).columns

## Model training and test-set prediction

In [34]:
MAX_ROUNDS = 400       # number of boosting rounds (n_estimators)
LEARNING_RATE = 0.07
SEED = 42              # fixed seed so repeated runs give the same model

# Gradient-boosted tree classifier reused (re-fitted) for every fold.
# subsample / colsample_bytree draw random rows and columns per tree, so the
# model is stochastic; without random_state the results were not reproducible.
rfc = XGBClassifier(
    n_estimators=MAX_ROUNDS,
    max_depth=4,
    objective="binary:logistic",
    learning_rate=LEARNING_RATE,
    subsample=.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
    random_state=SEED,
)


In [35]:
N_FOLDS = 25
FOLD_SEED = 42  # fixed so the shuffled folds are identical on every run

fd = KFold(N_FOLDS, shuffle=True, random_state=FOLD_SEED)
ginis = []

# Remove the 'calc' columns plus the correlated / lacunar columns from BOTH
# frames. The original cell read the column list from an undefined `X` and
# stored the reduced test frame in `train`, which lost the training data
# (and its 'target' column) and left `test` unreduced.
dropped_columns = (
    list(train.filter(like='calc').columns)
    + list(correlated_features)
    + list(lacunar_features)
)
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

categorical_features = (train.filter(like='cat')).columns

# Row POSITIONS (not index labels) of each class: they are consumed through
# `.iloc` below and in the training loop, so they must stay valid even when
# the frame's index is not 0..n-1.
ones = np.flatnonzero(train["target"].values == 1)
zeros = np.flatnonzero(train["target"].values == 0)

X_train = train.drop(["id", "target"], axis=1)
X_test = test.drop(["id"], axis=1)
Y_train = train.target

# Folds are drawn over the negative rows only, so the indices in each pair
# are positions within `zeros`. Materialised as a list because a generator
# would be exhausted after one pass and a re-run of the training cell would
# silently iterate over nothing.
split = list(fd.split(X_train.iloc[zeros]))

In [36]:
# One prediction column per fold. Sized from the splitter itself: the array
# used to be hard-coded to 5 columns while `fd` yields 25 folds, which raised
# an IndexError on the sixth fold.
Y_preds = np.zeros((X_test.shape[0], fd.get_n_splits()))

for i, (tr_i, held_out_i) in enumerate(split):
    # `split` was built on X_train.iloc[zeros], so tr_i holds positions inside
    # `zeros`; translate them to row positions of X_train before appending
    # all positive rows. (held_out_i, the left-out negatives, is not used.)
    fold_rows = np.concatenate([zeros[tr_i], ones])
    # Positional lookup, consistent with the X_train.iloc selection below.
    fold_target = Y_train.iloc[fold_rows]

    local_train = X_train.iloc[fold_rows, :][train_features].copy()
    local_test = X_test[train_features].copy()

    # Only categorical columns that are actually part of the model's feature
    # set can be encoded.
    encoded_columns = [f for f in categorical_features if f in local_train.columns]
    for f in encoded_columns:
        # target_encode returns exactly two series (train, test); the former
        # three-way unpacking into an undefined `local_val` could not run.
        local_train[f + "_avg"], local_test[f + "_avg"] = target_encode(
            trn_series=local_train[f],
            tst_series=local_test[f],
            target=fold_target,
            min_samples_leaf=200,
            smoothing=10,
        )

    # DataFrame.drop returns a new frame; the result used to be discarded, so
    # the raw categorical columns were never removed.
    local_train = local_train.drop(columns=encoded_columns)
    local_test = local_test.drop(columns=encoded_columns)

    fitrfc = rfc.fit(local_train, fold_target)

    # Probability of the positive class for every test row.
    Y_preds[:, i] = fitrfc.predict_proba(local_test)[:, 1]

    del local_train, local_test


Gini: 0.145, Max. Gini: 0.482, Normalized Gini: 0.301
Gini: 0.151, Max. Gini: 0.482, Normalized Gini: 0.313
Gini: 0.148, Max. Gini: 0.481, Normalized Gini: 0.307
Gini: 0.147, Max. Gini: 0.482, Normalized Gini: 0.304
Gini: 0.148, Max. Gini: 0.482, Normalized Gini: 0.307


In [37]:
# Average the per-fold probabilities of every test row and write the
# submission file. Replaces a row-by-row Python loop with a manual counter
# and drops `total`, which counted the always-empty `ginis` list and was
# never read.
submission = pd.DataFrame({
    "id": test.id.values,
    "target": Y_preds.mean(axis=1),
})
submission.to_csv("prednorm.csv", index=False)