In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Any results you write to the current directory are saved as output.
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import os
import xgboost as xgb
import gc
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [None]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer


#
# Prepare the data
#

train = pd.read_csv('train.csv')

# Pull the labels out as an array before the label column is removed.
y = train['target'].values

# Keep only feature columns: the row identifier and the label are dropped.
train = train.drop(['id', 'target'], axis=1)

# Feature matrix as a plain array; its column order follows train.columns.
x = train.values

#
# Create training and validation sets
#
# First cut: 40% of the rows stay as the training split, 60% are held out.
x, x_holdout, y, y_holdout = train_test_split(
    x, y, test_size=0.6, random_state=13, stratify=y
)
# Second cut: the held-out rows are halved into validation and test splits.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
)


#
# Create the LightGBM data containers
#
# Positions (within the feature matrix) of the columns whose name contains 'cat'.
categorical_features = [pos for pos, name in enumerate(train.columns) if 'cat' in name]
train_data = lightgbm.Dataset(x, label=y, categorical_feature=categorical_features)
# Evaluation containers explicitly reference the training Dataset so that they
# are built with the same feature binning and categorical handling.
test_data = lightgbm.Dataset(x_test, label=y_test, reference=train_data)
valid_data = lightgbm.Dataset(x_valid, label=y_valid, reference=train_data)


#
# Train the model
#

# Booster configuration for a binary classifier evaluated with AUC.
parameters = {
    'application': 'binary',   # alias of 'objective'; both are kept as in the original setup
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',    # ask LightGBM to compensate for the class imbalance
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,   # each tree sees half of the columns
    'bagging_fraction': 0.5,   # ... and half of the rows,
    'bagging_freq': 20,        # re-sampled every 20 iterations
    'learning_rate': 0.05,
    'verbose': 0
}

# Despite its name, `model_xgb` is a LightGBM booster; the name is kept because
# later cells refer to it.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` were replaced by callbacks
# in newer LightGBM releases -- confirm against the installed version.
model_xgb = lightgbm.train(
    params=parameters,
    train_set=train_data,
    num_boost_round=5000,
    valid_sets=valid_data,
    early_stopping_rounds=100,
    verbose_eval=100,
)
#
# Create a submission

In [None]:
def transform(x, eps=1e-15):
    """Map probabilities to log-odds (the logit function).

    Parameters
    ----------
    x : float or array-like
        Probabilities, expected to lie in [0, 1].
    eps : float, optional
        Inputs are clipped to ``[eps, 1 - eps]`` before the logit is taken so
        that scores of exactly 0 or 1 give large finite values instead of
        ``-inf`` / ``+inf`` (or a division by zero).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``log(p / (1 - p))`` of the clipped input, with the input's shape.
    """
    # float64 is forced so that 1 - eps does not round to 1.0 for float32 input.
    clipped = np.clip(np.asarray(x, dtype=np.float64), eps, 1 - eps)
    return np.log(clipped / (1 - clipped))

In [None]:
def give_ece_data(preds,bins,y_valid):
    """Group predictions into equal-frequency bins for calibration analysis.

    The samples are ordered by predicted score and cut into ``bins``
    consecutive groups whose sizes differ by at most one, so every sample
    belongs to exactly one bin.

    Parameters
    ----------
    preds : array-like
        Predicted scores, one per sample (flattened if not 1-D).
    bins : int
        Number of bins; must be at least 1.
    y_valid : array-like
        True labels aligned with ``preds``.

    Returns
    -------
    predicted_bins : list
        Mean predicted score of each bin (``nan`` for an empty bin).
    actual_counters : list
        Mean label of each bin (``nan`` for an empty bin).
    counters : list of int
        Number of samples in each bin.

    Raises
    ------
    ValueError
        If ``bins`` is smaller than 1 or the inputs differ in length.
    """
    preds = np.asarray(preds).ravel()
    y_valid = np.asarray(y_valid).ravel()
    if bins < 1:
        raise ValueError("bins must be a positive integer")
    if len(preds) != len(y_valid):
        raise ValueError("preds and y_valid must have the same length")

    order = np.argsort(preds)
    n_samples = len(order)
    # Integer bin edges: the last edge is exactly n_samples, so no trailing
    # samples are dropped when n_samples is not a multiple of bins.
    edges = [(i * n_samples) // bins for i in range(bins + 1)]

    predicted_bins = []
    actual_counters = []
    counters = []
    for start, stop in zip(edges[:-1], edges[1:]):
        members = order[start:stop]
        counters.append(len(members))
        if len(members) == 0:
            # Only possible when there are fewer samples than bins.
            predicted_bins.append(np.nan)
            actual_counters.append(np.nan)
        else:
            predicted_bins.append(np.mean(preds[members]))
            actual_counters.append(np.mean(y_valid[members]))
    return predicted_bins,actual_counters,counters

In [None]:
# Score the validation split with the booster and bin the scores.
bins = 1000
preds = model_xgb.predict(x_valid)
predicted_bins, actual_counters, counters = give_ece_data(preds, bins, y_valid)

# Reliability plot: mean predicted score vs. mean label for every bin.
plt.scatter(predicted_bins, actual_counters)

# Expected calibration error: bin-size-weighted absolute gap, averaged over samples.
ece = 0
for bin_pred, bin_actual, bin_size in zip(predicted_bins, actual_counters, counters):
    ece += bin_size * np.abs(bin_pred - bin_actual)
ece = ece / len(preds)
print(ece)

In [None]:
# Raw booster scores on the test split, binned for the calibration check.
# `preds` is read again by later cells, so it must keep holding these scores.
bins = 1000
preds = model_xgb.predict(x_test)

predicted_bins, actual_counters, counters = give_ece_data(preds, bins, y_test)

In [None]:
# Reliability plot for the current bins: mean predicted score vs. mean label.
plt.scatter(predicted_bins, actual_counters)

# Expected calibration error: bin-size-weighted absolute gap, averaged over samples.
ece = 0
for bin_pred, bin_actual, bin_size in zip(predicted_bins, actual_counters, counters):
    ece += bin_size * np.abs(bin_pred - bin_actual)
ece = ece / len(preds)
print(ece)

In [None]:
# Shallow tree whose leaves partition the feature space; each leaf later gets
# its own calibrator. random_state is fixed so the partition is reproducible
# (scikit-learn permutes features at each split, so ties are otherwise broken randomly).
tree_model = DecisionTreeClassifier(min_samples_leaf=1700, max_depth=5, random_state=42)
tree_model.fit(x,y)

In [None]:
# Low-level tree structure; its apply() needs float32 input.
TREE = tree_model.tree_
# Leaf id reached by every validation row.
indexes = TREE.apply(x_valid.astype(np.float32))
# Raw booster scores on the validation split, as an (n_samples, 1) column.
predicts_from_xgboost = model_xgb.predict(x_valid).reshape((-1, 1))

In [None]:
# Fit one logistic calibrator per tree leaf on the validation split.
log_reg_dict = {}
nodes = np.unique(indexes)
for node in tqdm_notebook(nodes):
    in_node = indexes == node
    # A fresh estimator per leaf: reusing one instance would make every dict
    # entry point at the same object, fitted on the last leaf only.
    model = LogisticRegression()
    model.fit(transform(predicts_from_xgboost[in_node]), y_valid[in_node])
    log_reg_dict[node] = model

In [None]:
# Leaf id reached by every test row (the low-level tree needs float32 input).
indexes_test = TREE.apply(x_test.astype(np.float32))
# Raw booster scores on the test split, as an (n_samples, 1) column.
predicts_from_xgboost_test = model_xgb.predict(x_test).reshape((-1, 1))
# Output buffer for the calibrated scores; same shape and dtype as the raw scores.
predicts_calibrated = np.zeros(
    predicts_from_xgboost_test.shape, dtype=predicts_from_xgboost_test.dtype
)

In [None]:
# Route every test row through the calibrator of the leaf it falls into.
for node, calibrator in tqdm_notebook(log_reg_dict.items()):
    in_node = indexes_test == node
    node_logits = transform(predicts_from_xgboost_test[in_node])
    # Column 1 of predict_proba is the positive-class probability.
    predicts_calibrated[in_node] = calibrator.predict_proba(node_logits)[:, 1].reshape((-1, 1))

In [None]:
bins = 10
predicted_bins, actual_counters, counters = give_ece_data(predicts_calibrated.reshape((-1)), bins, y_test)

# Reliability plot of the calibrated scores: mean score vs. mean label per bin.
plt.scatter(predicted_bins, actual_counters)

# Expected calibration error: bin-size-weighted absolute gap, averaged over samples.
ece = 0
for bin_pred, bin_actual, bin_size in zip(predicted_bins, actual_counters, counters):
    ece += bin_size * np.abs(bin_pred - bin_actual)
# Normalise by the array that was binned rather than by `preds`, which is a
# leftover from an earlier cell and only has the same length by coincidence.
ece /= len(predicts_calibrated)
print(ece)

In [None]:
bins = 100
predicted_bins, actual_counters, counters = give_ece_data(predicts_calibrated.reshape((-1)), bins, y_test)

# Reliability plot of the calibrated scores: mean score vs. mean label per bin.
plt.scatter(predicted_bins, actual_counters)

# Expected calibration error: bin-size-weighted absolute gap, averaged over samples.
ece = 0
for bin_pred, bin_actual, bin_size in zip(predicted_bins, actual_counters, counters):
    ece += bin_size * np.abs(bin_pred - bin_actual)
# Normalise by the array that was binned rather than by `preds`, which is a
# leftover from an earlier cell and only has the same length by coincidence.
ece /= len(predicts_calibrated)
print(ece)

In [None]:
bins = 1000

predicted_bins, actual_counters, counters = give_ece_data(predicts_calibrated.reshape((-1)), bins, y_test)

# Reliability plot of the calibrated scores: mean score vs. mean label per bin.
plt.scatter(predicted_bins, actual_counters)

# Expected calibration error: bin-size-weighted absolute gap, averaged over samples.
ece = 0
for bin_pred, bin_actual, bin_size in zip(predicted_bins, actual_counters, counters):
    ece += bin_size * np.abs(bin_pred - bin_actual)
# Normalise by the array that was binned rather than by `preds`, which is a
# leftover from an earlier cell and only has the same length by coincidence.
ece /= len(predicts_calibrated)
print(ece)

In [None]:
bins = 20

predicted_bins, actual_counters, counters = give_ece_data(predicts_calibrated.reshape((-1)), bins, y_test)

# Reliability plot of the calibrated scores: mean score vs. mean label per bin.
plt.scatter(predicted_bins, actual_counters)

# Expected calibration error: bin-size-weighted absolute gap, averaged over samples.
ece = 0
for bin_pred, bin_actual, bin_size in zip(predicted_bins, actual_counters, counters):
    ece += bin_size * np.abs(bin_pred - bin_actual)
# Normalise by the array that was binned rather than by `preds`, which is a
# leftover from an earlier cell and only has the same length by coincidence.
ece /= len(predicts_calibrated)
print(ece)

In [None]:
# Ranking quality after and before calibration (scores flattened to 1-D).
print(roc_auc_score(y_test, predicts_calibrated.ravel()))
print(roc_auc_score(y_test, predicts_from_xgboost_test.ravel()))
# Sanity check that `preds` still holds the raw test-set scores. Arrays are
# compared element-wise: equal sums would neither prove the arrays are equal
# nor be a robust test under floating-point summation.
assert np.array_equal(predicts_from_xgboost_test.ravel(), preds), \
    "preds no longer holds the raw test-set predictions"

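# Overfit example: the per-leaf calibrators below are fitted on the training split (x, y) instead of the validation split.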

In [None]:
# Shallow tree whose leaves partition the feature space for per-leaf calibration.
# random_state is fixed so the partition is reproducible (scikit-learn permutes
# features at each split, so ties are otherwise broken randomly).
tree_model = DecisionTreeClassifier(min_samples_leaf=1000, max_depth=5, random_state=42)
tree_model.fit(x,y)

In [None]:
# Low-level tree structure; its apply() needs float32 input.
TREE = tree_model.tree_
# Leaf id reached by every training row.
indexes = TREE.apply(x.astype(np.float32))
# Raw booster scores on the training split, as an (n_samples, 1) column.
predicts_from_xgboost = model_xgb.predict(x).reshape((-1, 1))

In [None]:
# One logistic calibrator per tree leaf, fitted on the logit of the booster
# scores of the training rows that fall into that leaf.
log_reg_dict = {}
nodes = np.unique(indexes)
for node in tqdm_notebook(nodes):
    in_node = indexes == node
    model = LogisticRegression()
    model.fit(transform(predicts_from_xgboost[in_node]), y[in_node])
    log_reg_dict[node] = model

In [None]:
# Leaf id reached by every test row (the low-level tree needs float32 input).
indexes_test = TREE.apply(x_test.astype(np.float32))
# Raw booster scores on the test split, as an (n_samples, 1) column.
predicts_from_xgboost_test = model_xgb.predict(x_test).reshape((-1, 1))
# Output buffer for the calibrated scores; same shape and dtype as the raw scores.
predicts_calibrated = np.zeros(
    predicts_from_xgboost_test.shape, dtype=predicts_from_xgboost_test.dtype
)

In [None]:
# Route every test row through the calibrator of the leaf it falls into.
for node, calibrator in tqdm_notebook(log_reg_dict.items()):
    in_node = indexes_test == node
    node_logits = transform(predicts_from_xgboost_test[in_node])
    # Column 1 of predict_proba is the positive-class probability.
    predicts_calibrated[in_node] = calibrator.predict_proba(node_logits)[:, 1].reshape((-1, 1))
    

In [None]:
bins = 1000

predicted_bins, actual_counters, counters = give_ece_data(predicts_calibrated.reshape((-1)), bins, y_test)

# Reliability plot of the calibrated scores: mean score vs. mean label per bin.
plt.scatter(predicted_bins, actual_counters)

# Expected calibration error: bin-size-weighted absolute gap, averaged over samples.
ece = 0
for bin_pred, bin_actual, bin_size in zip(predicted_bins, actual_counters, counters):
    ece += bin_size * np.abs(bin_pred - bin_actual)
# Normalise by the array that was binned rather than by `preds`, which is a
# leftover from an earlier cell and only has the same length by coincidence.
ece /= len(predicts_calibrated)
print(ece)

In [None]:
# Ranking quality after and before calibration (scores flattened to 1-D).
print(roc_auc_score(y_test, predicts_calibrated.ravel()))
print(roc_auc_score(y_test, predicts_from_xgboost_test.ravel()))
# Sanity check that `preds` still holds the raw test-set scores. Arrays are
# compared element-wise: equal sums would neither prove the arrays are equal
# nor be a robust test under floating-point summation.
assert np.array_equal(predicts_from_xgboost_test.ravel(), preds), \
    "preds no longer holds the raw test-set predictions"