In [1]:
### Import required libraries

import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

In [12]:
# Read train and test files
train_df = pd.read_csv('all/train.csv')
test_df = pd.read_csv('all/test.csv')

In [13]:
train_df.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [14]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [15]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


In [16]:
print("Total Train Features with NaN Values = " + str(train_df.columns[train_df.isnull().sum() != 0].size))
if (train_df.columns[train_df.isnull().sum() != 0].size):
    print("Features with NaN => {}".format(list(train_df.columns[train_df.isnull().sum() != 0])))
    train_df[train_df.columns[train_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)

Total Train Features with NaN Values = 0


In [17]:
# check and remove constant columns
colsToRemove = []
for col in train_df.columns:
    if col != 'ID' and col != 'target':
        if train_df[col].std() == 0: 
            colsToRemove.append(col)
        
# remove constant columns in the training set
train_df.drop(colsToRemove, axis=1, inplace=True)

# remove constant columns in the test set
test_df.drop(colsToRemove, axis=1, inplace=True) 

print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)

Removed `256` Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a7

In [37]:
train_df.shape, test_df.shape

((4459, 4737), (49342, 4736))

In [40]:
%%time
dupl = train_df.columns.duplicated()
train = train_df.loc[:, ~dupl]
test = test_df.loc[:, ~(dupl[1:])]

CPU times: user 642 ms, sys: 818 ms, total: 1.46 s
Wall time: 1.61 s


In [19]:
%%time
def drop_sparse(train, test):
    flist = [x for x in train.columns if not x in ['ID','target']]
    for f in flist:
        if len(np.unique(train[f]))<2:
            train.drop(f, axis=1, inplace=True)
            test.drop(f, axis=1, inplace=True)
    return train, test

train_df, test_df = drop_sparse(train_df, test_df)

CPU times: user 598 ms, sys: 3.48 ms, total: 601 ms
Wall time: 602 ms


In [41]:
gc.collect()
print("Train set size: {}".format(train_df.shape))
print("Test set size: {}".format(test_df.shape))

Train set size: (4459, 4737)
Test set size: (49342, 4736)


In [42]:
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = np.log1p(train_df["target"].values)

X_test = test_df.drop(["ID"], axis=1)

dev_X, val_X, dev_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [71]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, 
                      #nfold=5, stratified = False, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=150, 
                      evals_result=evals_result
                  )
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result
    #return model

In [68]:
#pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
pred_test, model = run_lgb(dev_X, dev_y, val_X, val_y, X_test)

print("LightGBM Training Completed...")

[150]	cv_agg's rmse: 1.56216 + 0.0173564
[300]	cv_agg's rmse: 1.48469 + 0.0192992
[450]	cv_agg's rmse: 1.45514 + 0.0194907
[600]	cv_agg's rmse: 1.44596 + 0.0211616
[750]	cv_agg's rmse: 1.44392 + 0.0224452
LightGBM Training Completed...


In [64]:
x = model.predict(X_test, num_iteration=model.best_iteration)

In [72]:
model = run_lgb(dev_X, dev_y, val_X, val_y, X_test)

[150]	cv_agg's rmse: 1.56216 + 0.0173564
[300]	cv_agg's rmse: 1.48469 + 0.0192992
[450]	cv_agg's rmse: 1.45514 + 0.0194907
[600]	cv_agg's rmse: 1.44596 + 0.0211616
[750]	cv_agg's rmse: 1.44392 + 0.0224452


In [74]:
params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }

In [76]:
lgtrain = lgb.Dataset(dev_X, label=dev_y)
lgval = lgb.Dataset(val_X, label=val_y)
evals_result = {}
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'regression',
          'nthread': 3, # Updated from nthread
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'metric' : 'rmse'}

# Create parameters to search
gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40, 60, 80, 100],
    'num_leaves': [4, 6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['regression'],
    'colsample_bytree' : [0.65, 0.66, 0.7],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

mdl = lgb.LGBMRegressor(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = -1,
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'])

In [78]:
# sklearn tools for model training and assesment
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(mdl, gridParams,
                    verbose=0,
                    cv=5,
                    n_jobs=-1)

In [79]:
grid.fit(dev_X, y=dev_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_bin=512, max_depth=-1, min_child_samples=5,
       min_child_weight=1, min_split_gain=0.5, n_estimators=100, n_jobs=-1,
       num_leaves=31, objective='binary', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=1, subsample_for_bin=200,
       subsample_freq=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'colsample_bytree': [0.65, 0.66, 0.7], 'subsample': [0.7, 0.75], 'reg_lambda': [1, 1.2, 1.4], 'objective': ['regression'], 'num_leaves': [4, 6, 8, 12, 16], 'reg_alpha': [1, 1.2], 'n_estimators': [40, 60, 80, 100], 'learning_rate': [0.005], 'boosting_type': ['gbdt']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [87]:
#https://github.com/Microsoft/LightGBM/issues/1339
#https://www.kaggle.com/garethjns/microsoft-lightgbm-with-parameter-tuning-0-823
grid.best_params_

{'boosting_type': 'gbdt',
 'colsample_bytree': 0.7,
 'learning_rate': 0.005,
 'n_estimators': 100,
 'num_leaves': 16,
 'objective': 'regression',
 'reg_alpha': 1,
 'reg_lambda': 1,
 'subsample': 0.75}

In [92]:
mdl_cv = lgb.cv(params, lgtrain, nfold=5, stratified=False)

In [111]:
from bayes_opt import BayesianOptimization

def lgb_eval(n_estimators, num_leaves, colsample_bytree, subsample, min_child_weight):
    params = {'application':'regression', 'learning_rate':0.05, 'early_stopping_round':100, 'metric':'rmse'}
    params["n_estimators"] = int(round(n_estimators))
    params["num_leaves"] = int(round(num_leaves))
    params["colsample_bytree"] = max(min(colsample_bytree, 1), 0)
    params["subsample"] = max(min(subsample, 1), 0)
    params['min_child_weight'] = min_child_weight
    cv_result = lgb.cv(params, , nfold=5, seed=0, stratified=False, verbose_eval =200, metrics=['rmse'])
    return max(cv_result['rmse'])


lgbBO = BayesianOptimization(lgb_eval, {
    'n_estimators': (80, 120),
    'num_leaves': (12, 20),
    'colsample_bytree' : (0.68,0.75),
    'subsample' : (0.7,0.8),
    'min_child_weight': (5, 50)
    }, random_state=0)

In [112]:
lgbBO.maximize(init_points=5, n_iter=25)
# https://www.kaggle.com/sz8416/simple-bayesian-optimization-for-lightgbm
# problem in function to optimize

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   min_child_weight |   n_estimators |   num_leaves |   subsample | 


NameError: name 'train_data' is not defined

In [80]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10, 
          'subsample': 0.6, 
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(train_X, train_y)
    va_data = xgb.DMatrix(val_X, val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=100)
    
    dtest = xgb.DMatrix(test_X)
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [81]:
pred_test_xgb, model_xgb = run_xgb(dev_X, dev_y, val_X, val_y, X_test)
print("XGB Training Completed...")

[0]	train-rmse:14.0877	valid-rmse:14.0769
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:12.7685	valid-rmse:12.7564
[200]	train-rmse:11.5766	valid-rmse:11.5632
[300]	train-rmse:10.4999	valid-rmse:10.4853
[400]	train-rmse:9.52768	valid-rmse:9.51285
[500]	train-rmse:8.65058	valid-rmse:8.6359
[600]	train-rmse:7.85824	valid-rmse:7.84463
[700]	train-rmse:7.14349	valid-rmse:7.13189
[800]	train-rmse:6.49876	valid-rmse:6.48996
[900]	train-rmse:5.91707	valid-rmse:5.91197
[1000]	train-rmse:5.39237	valid-rmse:5.39154
[1100]	train-rmse:4.91949	valid-rmse:4.92438
[1200]	train-rmse:4.49361	valid-rmse:4.50471
[1300]	train-rmse:4.10984	valid-rmse:4.12821
[1400]	train-rmse:3.76504	valid-rmse:3.79227
[1500]	train-rmse:3.45482	valid-rmse:3.49131
[1600]	train-rmse:3.17624	valid-rmse:3.22403
[1700]	train-rmse:2.92645	valid-rmse:2.98643
[1800]	train-rmse:2.70319	valid-rmse:2.77623
[1900]	train

In [82]:
cb_model = CatBoostRegressor(iterations=500,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)

In [83]:
cb_model.fit(dev_X, dev_y,
             eval_set=(val_X, val_y),
             use_best_model=True,
             verbose=True)

0:	learn: 13.8849206	test: 13.8784128	best: 13.8784128 (0)	total: 2s	remaining: 16m 39s
50:	learn: 2.0342737	test: 2.0171864	best: 2.0171864 (50)	total: 1m 38s	remaining: 14m 26s
100:	learn: 1.6029678	test: 1.6073706	best: 1.6073706 (100)	total: 3m 16s	remaining: 12m 57s
150:	learn: 1.5250946	test: 1.5581991	best: 1.5581991 (150)	total: 4m 57s	remaining: 11m 28s
200:	learn: 1.4736060	test: 1.5357500	best: 1.5357500 (200)	total: 6m 36s	remaining: 9m 50s
250:	learn: 1.3910444	test: 1.5095707	best: 1.5095707 (250)	total: 8m 12s	remaining: 8m 8s
300:	learn: 1.3377256	test: 1.4954333	best: 1.4954333 (300)	total: 9m 47s	remaining: 6m 28s
350:	learn: 1.3047366	test: 1.4877374	best: 1.4877374 (350)	total: 11m 22s	remaining: 4m 49s
400:	learn: 1.2748954	test: 1.4846130	best: 1.4846130 (400)	total: 12m 57s	remaining: 3m 11s
450:	learn: 1.2454797	test: 1.4808275	best: 1.4808275 (450)	total: 14m 33s	remaining: 1m 34s
499:	learn: 1.2163238	test: 1.4753264	best: 1.4753264 (499)	total: 16m 8s	remaini

<catboost.core.CatBoostRegressor at 0x1a1fd31ba8>

In [84]:
pred_test_cat = np.expm1(cb_model.predict(X_test))

In [None]:
d = {'col1': [1, 2,3], 'col2': [3, 4]}
df = pd.DataFrame(data=d)