In [1]:
%load_ext autoreload
%autoreload 2
from lib.model import train_lgbm_fold_classif, plot_importances
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from lib.constants import DATA_FOLDER, TMP_FOLDER, SUBMISSION_FOLDER
from lib.dataload import load_data
import numpy as np
from lib.utils import make_submission_from_hdf
# Load the train / target / test frames, then tag each frame with its origin
# so the stacked frame built below can be split back into train and test.
df_train, df_target, df_test = load_data(read=True, reduce_mem=False)
for origin_frame, origin_flag in ((df_train, 1), (df_test, 0)):
    origin_frame['is_train'] = origin_flag
train_test = pd.concat([df_train, df_test], sort=False)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


... Reading ...
-- Done


In [3]:
# Count the distinct values ("modalities") of every column of the stacked frame.
df_nmods = (
    train_test.nunique()
    .rename_axis('param')
    .reset_index(name='nmods')
)

# Bookkeeping columns are not candidates for the count features. They are
# filtered on the `param` values: the previous code subtracted these names
# from df_nmods.columns (which only holds 'param' and 'nmods', so nothing was
# removed) and relied on dropping the first two rows of the sorted table.
excluded_params = ['is_train', 'target']
n_look_cols = 28  # same number of columns the former head(30)[2:] produced

look_cols = (
    df_nmods.loc[~df_nmods['param'].isin(excluded_params)]
    .sort_values('nmods')
    .head(n_look_cols)['param']
    .tolist()
)
df_nmods.sort_values('nmods').head(10)

Unnamed: 0,param,nmods
202,is_train,2
1,target,2
70,var_68,461
93,var_91,8197
110,var_108,8651
105,var_103,9634
14,var_12,9737
150,var_148,10894
163,var_161,11359
73,var_71,13968


In [None]:
# For each low-cardinality column, add a 'count_<column>' feature holding the
# number of (non-null) ID_code entries that share the row's value.
count_cols = ['count_' + col for col in look_cols]
for col, colname in zip(look_cols, count_cols):
    value_counts_frame = (
        train_test.groupby([col])['ID_code']
        .count()
        .rename(colname)
        .reset_index()
    )
    # Default (inner) merge on the value column attaches the count to every row.
    train_test = train_test.merge(value_counts_frame, on=col)

In [None]:
# For every count feature, derive a 'group_<count column>' feature by looking
# each distinct count value up in a histogram table built over that column.
for col in count_cols:
    # Histogram of the count column with one equal-width bin per distinct value,
    # ordered by bin.
    df_a = pd.cut(train_test[col], train_test[col].nunique()).value_counts().sort_index()
    # NOTE(review): the lines below read df_a['index'] and df_a[col], so they assume
    # reset_index() names the former index 'index' and the counts after `col`;
    # newer pandas releases name these columns differently — confirm the pandas version.
    df_a = df_a.reset_index().rename(columns={'index': 'bin'}).reset_index()
    df_a['diff'] = df_a[col].diff().fillna(0)
    # Bin population divided by (row position + 1), rounded to a whole number.
    df_a['delta_0'] = np.around((df_a[col] - 0 )/ (df_a['index'] + 1), decimals=0)
    
    # Value range divided by the number of distinct values; used below to turn a
    # count value into a row position of df_a.
    step = (train_test[col].max()-train_test[col].min())/train_test[col].nunique()
    tmp2 = pd.DataFrame(train_test[col].drop_duplicates()).sort_values(by=col)
    group_col = 'group_' + col
    # Positional column 4 of df_a is presumably 'delta_0' (after 'index', 'bin',
    # <col>, 'diff') — TODO confirm. The row position is clamped to the last row.
    # NOTE(review): x/step does not subtract the column minimum although `step` is
    # derived from (max - min) — confirm this offset is intended.
    tmp2[group_col] = tmp2[col].apply(lambda x: int(df_a.iloc[np.minimum(int(x/step),
                                                  train_test[col].nunique()-1), 4]))
    # Attach the group value to every row sharing the same count value.
    train_test = train_test.merge(tmp2, on=col)

In [None]:
# Raw feature columns: everything in df_train except the identifier, the label
# and the train/test flag. The list is built by filtering df_train.columns so
# it keeps the frame's own column order; the former set difference produced an
# arbitrary order that could change from one kernel session to the next.
remove_cols = ['target', 'ID_code', 'is_train']
selected_cols = [col_name for col_name in df_train.columns if col_name not in remove_cols]
print(selected_cols)

In [None]:
# Values of the first training row; the 2:-1 slice skips the two leading
# columns and the trailing one.
first_row_values = df_train.iloc[0, 2:-1]
plt.plot(first_row_values)

In [None]:
# Values of training row 10 against their column position (0..199).
feature_positions = np.arange(200)
plt.plot(feature_positions, df_train.iloc[10, 2:-1])

In [None]:
# Values of training row 100, sorted in ascending order, against their rank.
sorted_row_values = df_train.iloc[100, 2:-1].sort_values()
plt.plot(np.arange(200), sorted_row_values)

In [None]:
from scipy.interpolate import CubicSpline, UnivariateSpline, make_interp_spline

# `scipy.interpolate.spline` was deprecated and has since been removed from
# SciPy, so importing it fails on current releases (and prevents CubicSpline,
# imported on the same line, from being defined). `make_interp_spline` with
# k=3 builds the interpolating cubic spline that call was used for.
positions = np.arange(0, 200).astype('float')
sorted_values = df_train.iloc[10, 2:-1].sort_values().values.astype('float')

# Interpolate the sorted row and evaluate the spline back on the same grid,
# then overlay it on the sorted values themselves.
xk = make_interp_spline(positions, sorted_values, k=3)(positions)
plt.plot(positions, xk)
plt.plot(positions, sorted_values)

In [None]:
# Cubic spline through the sorted values of training row 10, evaluated back
# on the integer positions 0..199.
spline_grid = np.arange(0, 200).astype('float')
row_sorted = df_train.iloc[10, 2:-1].sort_values().values.astype('float')
cs = CubicSpline(spline_grid, row_sorted)
plt.plot(np.arange(0, 200), cs(np.arange(0, 200)))

In [None]:
# Per-row spline features: each row's raw feature values are sorted, a
# degree-5 UnivariateSpline is fitted to the sorted curve, and a few summaries
# of that spline are collected (one list entry per row of train_test).
data_dict = {
    'integral': [],
    'residual': [],
    'left_derivative': [],
    'center_derivative': [],
    'right_derivative': [],
}

lent = len(train_test)
sub_train_test = train_test[selected_cols]

# Sort every row once in NumPy instead of calling iloc + sort_values per row.
# astype() returns a copy, so the in-place sort does not touch the frame.
sorted_rows = sub_train_test.values.astype('float')
sorted_rows.sort(axis=1)

# The abscissa grid and the probe positions follow the number of feature
# columns (200 columns -> grid 0..199, probes at 10, 100 and 189) and are
# built once outside the loop.
n_points = sorted_rows.shape[1]
grid = np.arange(n_points).astype('float')
edge_margin = 10
left_pos = edge_margin
center_pos = n_points // 2
right_pos = n_points - 1 - edge_margin

for index, sorted_row in enumerate(sorted_rows):
    us = UnivariateSpline(grid, sorted_row, k=5)
    data_dict['integral'].append(us.integral(grid[0], grid[-1]))
    data_dict['residual'].append(us.get_residual())
    # derivatives(x)[1] is the first derivative of the spline at x.
    data_dict['left_derivative'].append(us.derivatives(left_pos)[1])
    data_dict['center_derivative'].append(us.derivatives(center_pos)[1])
    data_dict['right_derivative'].append(us.derivatives(right_pos)[1])
    # Progress in percent, printed every 5000 rows.
    if index % 5000 == 0:
        print(index/lent*100)

In [None]:
# Attach the per-row spline features to the stacked frame by position: the
# lists in data_dict hold one entry per row of train_test, in row order.
# The former merge(on=train_test.index) added a spurious 'key_0' column (the
# row index, which then leaked into the model features) and silently depended
# on the index values being unique.
spline_features = pd.DataFrame(data_dict)
assert len(spline_features) == len(train_test), \
    "data_dict must hold exactly one entry per row of train_test"
tmp = pd.concat([train_test.reset_index(drop=True), spline_features], axis=1)
tmp.shape

In [None]:
# Preview the first five rows of the merged frame.
tmp.head(5)

In [None]:
# Preview the spline summaries on their own, as a frame with one column per key.
spline_preview = pd.DataFrame(data_dict)
spline_preview.head()

In [None]:
# Row-wise summary statistics taken across the raw feature columns (axis=1);
# each one is stored in a '<statistic>_col' column.
raw_feature_block = train_test[selected_cols]
for stat_name in ('min', 'max', 'std', 'var', 'mean', 'median'):
    train_test[stat_name + '_col'] = getattr(raw_feature_block, stat_name)(axis=1)

In [None]:
print("- Resplit train/test")
# Split the stacked frame back using the origin flag set at load time.
is_train_flag = train_test['is_train']
train = train_test.loc[is_train_flag == 1]
test = train_test.loc[is_train_flag == 0]

# PCA and clustering exploration

In [None]:
from sklearn.decomposition import PCA

# Project the raw feature columns onto their first 50 principal components.
# random_state is pinned so repeated runs give the same projection whenever
# scikit-learn selects a randomized SVD solver for this problem size.
pca = PCA(n_components=50, random_state=42)
remove_cols = ['target', 'ID_code', 'is_train']
# Keep df_train's own column order; a set difference yields an arbitrary
# order that can differ between kernel sessions.
selected_cols = [col_name for col_name in df_train.columns if col_name not in remove_cols]
pca_result = pca.fit_transform(train_test[selected_cols])

print(pca.explained_variance_ratio_)
print(pca.singular_values_)

print("Total explained variance:", np.sum(pca.explained_variance_ratio_))

# Scatter of the first two components.
plt.scatter(x=pca_result[:, 0], y=pca_result[:, 1], marker='.')

In [None]:
# Store the first two principal components, rounded to whole numbers, as
# 'pca_0_0' and 'pca_0_1'.
for component in (0, 1):
    train_test['pca_0_%d' % component] = np.around(pca_result[:, component], decimals=0)

In [None]:
# Distribution of the rounded first principal component.
rounded_first_component = train_test['pca_0_0']
rounded_first_component.hist(bins=100)

In [None]:
# Hex-bin joint density of the first two principal components. x and y are
# passed by keyword: recent seaborn releases no longer accept them positionally.
sns.jointplot(x=pca_result[:, 0], y=pca_result[:, 1], kind="hex", color="#4CB391")

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
import matplotlib.pyplot as plt

# Cluster the PCA projection with DBSCAN.
X = pca_result
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
labels = db.labels_

# Boolean mask flagging the samples DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Label -1 is treated as noise and is not counted as a cluster.
unique_labels = set(labels)
n_clusters_ = len(unique_labels) - int(-1 in labels)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# One Spectral colour per label; the noise label is drawn in black instead.
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core members first, then the remaining members carrying the same label.
    for member_mask in (class_member_mask & core_samples_mask,
                        class_member_mask & ~core_samples_mask):
        xy = X[member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                 markeredgecolor=tuple(col), markersize=1)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

# Model

In [None]:
print("- Resplit train/test")
# NOTE(review): `tmp` was built before the row statistics and the PCA columns
# were added to train_test, so if the cells ran top to bottom this
# reassignment discards those columns — confirm that is intended.
train_test = tmp
is_train_flag = train_test['is_train']
train = train_test.loc[is_train_flag == 1]
test = train_test.loc[is_train_flag == 0]

In [None]:
# Persist both splits to HDF files under the key 'df'.
for split_frame, split_path in ((train, './data_tmp/new_train.hdf'),
                                (test, './data_tmp/new_test.hdf')):
    split_frame.to_hdf(split_path, key='df')

In [None]:
# Train on the engineered columns only. Excluded: the label, the identifier,
# the raw feature columns, and the train/test flag ('is_train' is constant
# inside `train`, so it carries no signal and was previously passed to the
# model by accident). Filtering train.columns keeps a stable feature order;
# the former set difference produced an arbitrary one.
remove_cols = ['target', 'ID_code', 'is_train']
excluded_cols = set(remove_cols) | set(selected_cols)
features = [col_name for col_name in train.columns if col_name not in excluded_cols]

importances, df_oof_preds, df_preds, filename = train_lgbm_fold_classif(
    train, test, features, train['target'],
    repeat_cv=1, n_splits=4,
    n_max_estimators=10000,
)

In [None]:
# Plot the importances returned by the training call, requesting 200 features.
n_plotted_features = 200
plot_importances(importances, num_features=n_plotted_features)

# Submissions

In [None]:
# Preview the test identifiers next to their former index values.
test_ids_with_index = test['ID_code'].reset_index()
test_ids_with_index.head()

In [None]:
# Pair each prediction (column 0 of df_preds) with the test identifier at the
# same position. The former merge(on=df_preds.index) achieved this only
# implicitly, through an array-valued join key, and would have duplicated
# rows if that index contained repeated values.
test_ids = test['ID_code'].reset_index(drop=True)
assert len(df_preds) == len(test_ids), \
    "df_preds must hold exactly one prediction per test row"
df_pred_file = pd.DataFrame({
    'ID_code': test_ids.values,
    'target': df_preds[0].values,
})
df_pred_file.to_csv('./data_tmp/test_pred.csv', index=False)

In [None]:
# Prediction file selected by hand. An earlier candidate
# ('preds_lgbm_classif_CV_0.88763_TR_0.98783') used to be assigned first and
# was immediately overwritten; only the effective value is kept.
filename = 'preds_lgbm_classif_CV_0.89800_TR_0.91159'


In [None]:
# Display the prediction file name currently held in `filename`.
filename

In [None]:
# Build the submission from the stored predictions of the chosen run,
# passing the test identifiers alongside.
submission_source = 'preds_lgbm_classif_CV_0.89810_TR_0.91029'
make_submission_from_hdf(submission_source, test['ID_code'])

In [None]:
# The bare file name was not valid Python and made this cell raise a
# SyntaxError; it is kept as a string so the cell simply displays it.
'preds_lgbm_classif_CV_0.89810_TR_0.91029.hdf'