In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

from sklearn.preprocessing import PolynomialFeatures

from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge

from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from the
# libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
np.random.seed(4590)


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# Load the prepared training features, the training target, the test features
# and the card ids used for the submission file.
train_X = pd.read_csv('./intermediate/train.csv')
# Read the target file without consuming a header row (presumably it holds raw
# values only - verify against the file). `header=None` is the supported way to
# request this; negative values such as `header=-1` are rejected by current pandas.
train_y = pd.read_csv('./intermediate/target.csv', header=None)
test_X = pd.read_csv('./intermediate/test.csv')
# Only the `card_id` column of the sample submission is needed.
ids = pd.read_csv('./intermediate/sample_submission.csv')['card_id']

In [34]:
# Flat boolean mask over the training rows: True where the target is below -30.
outliers_index = (train_y < -30).values.ravel()

# Start with every row unflagged, then switch the masked rows to 1.
train_X['outliers'] = 0
train_X.loc[outliers_index, 'outliers'] = 1

In [37]:
# Model inputs: every column of train_X except the 'outliers' helper flag.
df_train_columns = [name for name in train_X.columns if name != 'outliers']

In [39]:
# LightGBM hyper-parameters for the regression model.
# `min_child_samples` (previously 20) has been removed: it is an alias of
# `min_data_in_leaf`, so the dict carried two conflicting values for a single
# setting. NOTE(review): LightGBM is expected to honour the primary name
# (`min_data_in_leaf`) when both are given, so results should be unchanged -
# confirm against the installed version.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_round = 10000

# Five shuffled folds, stratified on the outlier flag so each fold receives a
# similar share of outlier rows.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
oof = np.zeros(len(train_X))            # out-of-fold predictions for the training rows
predictions = np.zeros(len(test_X))     # test predictions averaged over the folds
fold_importances = []                   # one importance frame per fold, concatenated once below

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_X, train_X['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train_X.iloc[trn_idx][df_train_columns], label=train_y.iloc[trn_idx])
    val_data = lgb.Dataset(train_X.iloc[val_idx][df_train_columns], label=train_y.iloc[val_idx])

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` are no longer
    # accepted by `lgb.train` in LightGBM >= 4.0; when upgrading, replace them
    # with the equivalent callbacks.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds=100)

    # Score the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(train_X.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({"Feature": df_train_columns,
                                          "importance": clf.feature_importance(),
                                          "fold": fold_ + 1}))

    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(test_X[df_train_columns], num_iteration=clf.best_iteration) / folds.n_splits

# Build the combined importance table in a single concat instead of growing a
# DataFrame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

# Out-of-fold RMSE, shown as the cell output. Arguments are in (y_true, y_pred) order.
np.sqrt(mean_squared_error(train_y, oof))

fold 0
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.66825	valid_1's rmse: 3.72318
[200]	training's rmse: 3.59316	valid_1's rmse: 3.69056
[300]	training's rmse: 3.54789	valid_1's rmse: 3.67764
[400]	training's rmse: 3.51206	valid_1's rmse: 3.6708
[500]	training's rmse: 3.48366	valid_1's rmse: 3.66614
[600]	training's rmse: 3.45888	valid_1's rmse: 3.66292
[700]	training's rmse: 3.43548	valid_1's rmse: 3.66061
[800]	training's rmse: 3.41509	valid_1's rmse: 3.65962
[900]	training's rmse: 3.39755	valid_1's rmse: 3.65849
[1000]	training's rmse: 3.37988	valid_1's rmse: 3.65757
[1100]	training's rmse: 3.36357	valid_1's rmse: 3.65719
[1200]	training's rmse: 3.34757	valid_1's rmse: 3.65616
[1300]	training's rmse: 3.33177	valid_1's rmse: 3.656
Early stopping, best iteration is:
[1286]	training's rmse: 3.33412	valid_1's rmse: 3.6557
fold 1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.66902	valid_1's rmse: 3.71659

3.6549059010562677

In [40]:
# Rank features by their mean importance across folds; keep at most the top 1000.
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

# Per-fold importance rows for the selected features only.
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

plt.figure(figsize=(14,25))
# Pass the mean-importance ranking as the explicit bar order. Sorting the
# per-fold rows instead would order features by their largest single-fold
# value, which does not match the "avg over folds" title.
sns.barplot(x="importance",
            y="Feature",
            data=best_features,
            order=list(cols))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [41]:
# Pair each card id with its fold-averaged prediction and write the submission
# CSV without the index column.
sub_df = pd.DataFrame({"card_id": ids.values, "target": predictions})
sub_df.to_csv("./output/lightgbm.csv", index=False)