In [2]:
%load_ext autoreload
%autoreload 2
from lib.model import train_lgbm_fold_classif, plot_importances, train_lgbm_fold_reg
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from lib.constants import DATA_FOLDER, TMP_FOLDER, SUBMISSION_FOLDER
from sklearn.preprocessing import StandardScaler
from lib.dataload import load_data
import numpy as np
from tqdm import tqdm
from lib.utils import make_submission_from_hdf
df_train, df_target, df_test = load_data(read=True, reduce_mem=False)
df_train['is_train'] = 1
df_test['is_train'] = 0
train_test = pd.concat([df_train, df_test], sort=False)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


... Reading ...
-- Done


# Model

In [3]:
print(train_test.shape)

scaler = StandardScaler()

remove_cols = ['target', 'ID_code', 'is_train', 'key_0', 'strat', 'oof_preds_ref', 'oof_preds_ref_error']
selected_cols = list(set(train_test.columns) - set(remove_cols))

train_test[selected_cols] = scaler.fit_transform(train_test[selected_cols])/3
"""
binned_cols = []
for col in tqdm(selected_cols):
    newcol = 'binned_' + col
    binned_cols.append(newcol)
    train_test[newcol] = pd.cut(train_test[col], 50, labels=[x for x in range(0, 50)])
    tmp = pd.cut(train_test[col], 50, labels=[x for x in range(0, 50)]).value_counts().sort_values().reset_index().reset_index()
    tmp = tmp[['level_0', 'index']].rename(columns={'index':'binned_' + col, 'level_0':'rerank_' + col})
    train_test = train_test.merge(tmp, on=['binned_' + col])

"""
# columnwise features

train_test['sum_col'] = train_test[selected_cols].sum(axis=1)
train_test['min_col'] = train_test[selected_cols].min(axis=1)
train_test['max_col'] = train_test[selected_cols].max(axis=1)
train_test['std_col'] = train_test[selected_cols].std(axis=1)
train_test['var_col'] = train_test[selected_cols].var(axis=1)
train_test['mean_col'] = train_test[selected_cols].mean(axis=1)
train_test['median_col'] = train_test[selected_cols].median(axis=1)
train_test['spread_col'] = abs(train_test['max_col'] - train_test['min_col'])
print(train_test.shape)

"""
# modulo stuff
modulo_cols = []
list_modulos = [0.05, 0.1, 0.2]
for col in tqdm(selected_cols):
    for modulo in list_modulos:
        new_col = col + '_modulo_' + str(modulo)
        modulo_cols.append(new_col)
        train_test[new_col] = train_test[col] % modulo
"""
"""
binned_cols = []
for col in selected_cols:
    newcol = 'binned_' + col
    binned_cols.append(newcol)
    train_test[newcol] = pd.qcut(train_test[col], 10, labels=[x for x in range(0, 10)])
"""
top_features = [
    'var_6', 'var_34', 'var_53', 'var_22',
    'var_174', 'var_99', 'var_12', 'var_81',
    'var_68', 'var_12', 'var_108', 'var_126',
    'var_33', 'var_139']


# try a groupby
def perform_agg_dict(data, agg_dict_ref, groupcol):
    agg_dict = {}
    for col in agg_dict_ref.keys():
        agg_dict[col] = {}
        for aggfunc in agg_dict_ref[col]:
            if isinstance(aggfunc, str):
                func_name = aggfunc
            else:
                func_name = aggfunc.__name__
            agg_dict[col][col + '_' + "-".join(groupcol) + '_' + func_name] = aggfunc

    tmp = data.groupby(groupcol).agg(agg_dict)
    tmp.columns = tmp.columns.droplevel()
    tmp = tmp.reset_index()
    return tmp
agg_dict_ref_col = {}
for col in top_features:
    agg_dict_ref_col[col] = ['min', 'max', 'median', 'mean', 'var', 'std']
for groupcol in tqdm(top_features):
    tmp = perform_agg_dict(train_test, agg_dict_ref_col, groupcol)
    tmp = tmp.fillna(0)
    train_test = train_test.merge(tmp, on=groupcol)
print(train_test.shape)
"""

from sklearn.preprocessing import PolynomialFeatures
# Create PolynomialFeatures object with interaction_only set to True
interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)


#top_features = ['var_6', 'var_34', 'var_53', 'var_22', 'var_174']
# Transform feature matrix

poly_interactions = pd.DataFrame(interaction.fit_transform(train_test[top_features]),
                                 index=train_test.index,
                                 columns=interaction.get_feature_names()
                                )
poly_interactions['ID_code'] = train_test['ID_code']

train_test = (
    train_test.merge(poly_interactions[['ID_code'] + interaction.get_feature_names()[len(top_features):]],
                     on=['ID_code'])
)
print(train_test.shape)
for feat in top_features:
    for other_feat in top_features:
        if feat != other_feat:
            intername = feat + '_-_' + other_feat
            train_test[intername] = train_test[feat] - train_test[other_feat]
            intername = feat + '_+_' + other_feat
            train_test[intername] = train_test[feat] + train_test[other_feat]

print(train_test.shape)
"""
#train_test = train_test.fillna(0)

print(train_test.shape)

(400000, 203)


  0%|          | 0/14 [00:00<?, ?it/s]

(400000, 211)


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
100%|██████████| 14/14 [03:09<00:00, 23.24s/it]

(400000, 1303)
(400000, 1303)





In [None]:
for col in binned_cols:
    train_test[col] = train_test[col].astype('int')

In [4]:
print("- Resplit train/test")
train = train_test[train_test['is_train'] == 1]
test = train_test[train_test['is_train'] == 0]
train.to_hdf('./data_tmp/df_train_fe.hdf', 'df')
test.to_hdf('./data_tmp/df_test_fe.hdf', 'df')
pd.DataFrame(train['target']).to_hdf('./data_tmp/df_target_fe2.hdf', 'df')
#oof_preds_ref = pd.read_hdf('./data_tmp/oof_lgbm_classif_CV_0.89904_TR_0.91199.hdf', key='df')
#train = train.merge(oof_preds_ref, on=train.index).rename(columns={0: 'oof_preds_ref'})
#train['oof_preds_ref_error'] = abs(train['oof_preds_ref'] - train['target'])
#train['oof_preds_ref'] = np.round(train['oof_preds_ref'])
#train['strat'] = ((train['target'] == 1) | (train['oof_preds_ref_error'] > 0.6)).astype('int')
#train[train['strat'] == 1].shape

- Resplit train/test


In [5]:
# Code
#keep_that = np.array(train[train['oof_preds_ref_error'] > 0.99].index)
test['oof_preds_ref'] = 0
remove_cols = ['target', 'ID_code', 'is_train', 'key_0', 'strat', 'oof_preds_ref', 'oof_preds_ref_error']
features = list(set(train.columns) - set(remove_cols))

        
importances, df_oof_preds, df_preds, filename, models = train_lgbm_fold_classif(
    train, test, features, train['target'],
    train['target'],
    #keep_index=keep_that,
    repeat_cv=1, n_splits=4,
    n_max_estimators=10000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


== INIT ==
== START MODEL TRAIN
== REPEAT CV 0
==== CV 0
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.756083	valid_1's auc: 0.75034
[200]	training's auc: 0.782589	valid_1's auc: 0.774733
[300]	training's auc: 0.800296	valid_1's auc: 0.791501
[400]	training's auc: 0.813047	valid_1's auc: 0.803303
[500]	training's auc: 0.822911	valid_1's auc: 0.81273
[600]	training's auc: 0.831508	valid_1's auc: 0.821033
[700]	training's auc: 0.83865	valid_1's auc: 0.828084
[800]	training's auc: 0.844349	valid_1's auc: 0.833544
[900]	training's auc: 0.849654	valid_1's auc: 0.838478
[1000]	training's auc: 0.854006	valid_1's auc: 0.842567
[1100]	training's auc: 0.858078	valid_1's auc: 0.846562
[1200]	training's auc: 0.86152	valid_1's auc: 0.849939
[1300]	training's auc: 0.864636	valid_1's auc: 0.852979
[1400]	training's auc: 0.867604	valid_1's auc: 0.855691
[1500]	training's auc: 0.870613	valid_1's auc: 0.858326
[1600]	training's auc: 0.872866	valid_1's auc: 0.8604

[200]	training's auc: 0.780745	valid_1's auc: 0.780658
[300]	training's auc: 0.798987	valid_1's auc: 0.797775
[400]	training's auc: 0.811616	valid_1's auc: 0.809874
[500]	training's auc: 0.82197	valid_1's auc: 0.819829
[600]	training's auc: 0.83023	valid_1's auc: 0.827689
[700]	training's auc: 0.837101	valid_1's auc: 0.834535
[800]	training's auc: 0.843077	valid_1's auc: 0.840452
[900]	training's auc: 0.84838	valid_1's auc: 0.845477
[1000]	training's auc: 0.852783	valid_1's auc: 0.849613
[1100]	training's auc: 0.856562	valid_1's auc: 0.853475
[1200]	training's auc: 0.860054	valid_1's auc: 0.856311
[1300]	training's auc: 0.863131	valid_1's auc: 0.859169
[1400]	training's auc: 0.86622	valid_1's auc: 0.861749
[1500]	training's auc: 0.868698	valid_1's auc: 0.864086
[1600]	training's auc: 0.871094	valid_1's auc: 0.866128
[1700]	training's auc: 0.873464	valid_1's auc: 0.868282
[1800]	training's auc: 0.875541	valid_1's auc: 0.870139
[1900]	training's auc: 0.877478	valid_1's auc: 0.871773
[200

KeyboardInterrupt: 

In [None]:
plot_importances(importances, num_features=20)

In [None]:
top_features = list(importances.groupby('feature')['gain'].mean().sort_values(ascending=False).head(10).index)
print(top_features)

In [None]:
top_features = ['var_53', 'var_6', 'var_174', 'var_139', 'var_33', 'var_76', 'var_21', 'var_12', 'var_22', 'var_34']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_test['var_68'])
print(vectorizer.get_feature_names())

# explore error

In [None]:
df_oof_preds.index = train.index
train['oof_preds'] = df_oof_preds
train['oof_preds'].isna().sum()

In [None]:
train['abs_error'] = abs(train['target'] - train['oof_preds'])
train['error'] = train['target'] - train['oof_preds']
train['abs_error'].hist(bins=100)

In [None]:
sns.distplot(train[train['target'] == 1]['abs_error'])
sns.distplot(train[train['target'] == 0]['abs_error'])

In [None]:
train[train['target'] == 1].sort_values(by='abs_error', ascending=False).head(5)

In [None]:
train[train['target'] == 0].sort_values(by='abs_error', ascending=False).head(5)

In [None]:
import lime.lime_tabular
explainer = lime.lime_tabular.LimeTabularExplainer(
    train[features].values,
    feature_names=list(train.columns),
    class_names=['0', '1'],
    training_labels=train['target']
)

In [None]:
def model(models, data):
    #feats = data.shape[0]
    #data = np.reshape(data, (1, feats))
    pred=np.zeros(data.shape[0])
    n=len(models)
    for i_model in models:
        pred+=i_model.predict(data)/n
    return pred

def prob(data):
    pred = model(models, data)
    probas = np.array(list(zip(1-pred, pred)))
    return probas

In [None]:
i = 187082 # False prediction
print('target:', train['target'].iloc[i])
exp = explainer.explain_instance(train[features].iloc[i].values, prob)
exp.show_in_notebook()

In [None]:
i = 58770 # False prediction
print('target:', train['target'].iloc[i])
exp = explainer.explain_instance(train[features].iloc[i].values, prob)
exp.show_in_notebook()

In [None]:
i = 90698 # Good prediction
print('target:', train['target'].iloc[i])
exp = explainer.explain_instance(train[features].iloc[i].values, prob)
exp.show_in_notebook()

In [None]:
i = 79322 # Good prediction
print('target:', train['target'].iloc[i])
exp = explainer.explain_instance(train[features].iloc[i].values, prob)
exp.show_in_notebook()

In [None]:
train[train['target']==0].sort_values(by='oof_preds_ref_error', ascending=False).head()

In [None]:
import shap
shap.initjs()

In [None]:
explainer = shap.TreeExplainer(models[1])
shap_values = explainer.shap_values(train[features].values)

In [None]:
i = 187082 # False prediction
shap.force_plot(explainer.expected_value, shap_values[i,:], train[features].iloc[i,:])

In [None]:
shap.summary_plot(shap_values, train[features].values)

In [None]:
shap.dependence_plot('var_6', shap_values, train[features])

In [None]:
shap.dependence_plot('var_133', shap_values, train[features])

# Explore errors

In [None]:
df_oof_preds.isna().sum()

In [None]:
df_oof_preds.index = train.index
train['oof_preds'] = df_oof_preds
train['oof_preds'].isna().sum()

In [None]:
from sklearn.metrics import roc_auc_score
train['oof_preds'] = df_oof_preds
train[['target', 'oof_preds']].head()

In [None]:
train['abs_error'] = abs(train['target'] - train['oof_preds'])
train['error'] = train['target'] - train['oof_preds']
train['abs_error'].hist(bins=100)

In [None]:
sns.distplot(train[train['target'] == 1]['abs_error'])
sns.distplot(train[train['target'] == 0]['abs_error'])

In [None]:
train[train['target'] == 1].sort_values(by='abs_error', ascending=False).head(5)

In [None]:
train[train['ID_code'] == idcode][features].values.ravel().shape

In [None]:
[train['ID_code'] == idcode]

In [None]:
idcode = 'train_187082' # False prediction
print('target:', train[train['ID_code'] == idcode]['target'].values)
exp = explainer.explain_instance(train[train['ID_code'] == idcode][features].values.ravel(), prob)
exp.show_in_notebook()

In [None]:
train[train['abs_error'] > 0.7]['target'].describe()

In [None]:
tr = 0.7
print("base: 0.8981078597740635")
roc_auc_score(train[train['error'] < tr]['target'].values, (train[train['error'] < tr]['oof_preds'].values))

In [None]:
train[train['error'] > tr]['target'].shape

In [None]:
train['high_error'] = (train['error'] > tr).astype('int')
train['high_error'].describe()

In [None]:
# Code
remove_cols = ['target', 'ID_code', 'high_error', 'error', 'oof_preds']
features = list(set(train.columns) - set(remove_cols))

importances, df_oof_preds, df_preds, filename = train_lgbm_fold_classif(
                              train,
                              train,
    train['high_error'],
                              features, train['high_error'],
                              repeat_cv=1, n_splits=4,
                              n_max_estimators=10000
                              )

In [None]:
plot_importances(importances, num_features=10)

In [None]:
%load_ext autoreload
%autoreload 2
from lib.model import train_lgbm_fold_reg

remove_cols = ['target', 'ID_code', 'high_error', 'error', 'oof_preds', 'abs_error']
features = list(set(train.columns) - set(remove_cols))

importances, df_oof_preds, df_preds = train_lgbm_fold_reg(
                              train,
                              test,
                              features, train['error'],
                              repeat_cv=1, n_splits=4,
                              n_max_estimators=10000
                              )

In [None]:
df_preds.head()

In [None]:
plot_importances(importances, num_features=20)

In [None]:
# Code
tr=0.7
print(train[train['error'] > tr].shape)

subset = pd.DataFrame(train[train['error'] <= tr])

remove_cols = ['target', 'ID_code', 'high_error', 'error', 'oof_preds']
features = list(set(train.columns) - set(remove_cols))

importances, df_oof_preds, df_preds, filename = train_lgbm_fold_classif(
                              subset,
                              test,
                              features, subset['target'],
                              repeat_cv=1, n_splits=4,
                              n_max_estimators=10000
                              )

In [None]:
REPEAT CV: 0 CV SCORE: 0.9751631824628406 TR SCORE 0.9993788178275096

In [None]:
plot_importances(importances, num_features=10)

In [None]:
corrs = train.corr(method ='pearson') 

In [None]:
corrs[['target', 'oof_preds', 'error']].sort_values(by=['error'], ascending=False).head(20)

In [None]:
def tune(x):
    if x <= 0.5:
        x = 0
    if x > 0.5:
        x = 1
    return x

preds_try = pd.DataFrame(df_oof_preds)
preds_try['oof_error_pred'] = df_preds
preds_try['preds_1'] = np.clip(preds_try[0], 0, 1)
#preds_try['preds_1'] = np.round(preds_try[0])
#preds_try['preds_1'] = preds_try[0].apply(lambda x: tune(x))
print("base: 0.8981078597740635")
roc_auc_score(train['target'].values, (preds_try['preds_1'].values))

In [None]:
preds_try = pd.DataFrame(df_oof_preds)
preds_try.head()

# Submissions

In [None]:
test['ID_code'].reset_index().head()

In [None]:
df_pred_file = df_preds.merge(test['ID_code'].reset_index(), on=df_preds.index)[['ID_code', 0]].rename(columns={0:'target'})
df_pred_file.to_csv('./data_tmp/test_pred.csv', index=False)

In [None]:
test['ID_code'].head()

In [None]:
df_train.shape

In [None]:
train.shape

In [None]:
filename = 'preds_lgbm_classif_CV_0.88763_TR_0.98783'
filename = 'preds_lgbm_classif_CV_0.89800_TR_0.91159'


In [None]:
filename

In [None]:
make_submission_from_hdf('preds_lgbm_classif_CV_0.89810_TR_0.91029', test['ID_code'])

In [None]:
preds_lgbm_classif_CV_0.89810_TR_0.91029.hdf