In [None]:
# Author: Chongzheng Zhao
# Kaggle Competition - New York City Taxi Fare Prediction
# FINAL BEAT 1454 TEAMS!
# We team NYCTAXI located at 30 out of 1484 teams, Top 2%!
# Final Score 2.86770
# Competition Official Website: https://www.kaggle.com/c/new-york-city-taxi-fare-prediction
# To see the leaderboard(competition result): https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/leaderboard
# Welcome to my Github profile: https://github.com/ChongzhengZhao/
# Welcome to my Kaggle Profile: https://www.kaggle.com/chongzhengzhao
# Welcome to my Linkedin Profile: https://www.linkedin.com/in/chongzhengzhao/
# Last updated: 30/11/2018

In [33]:
import numpy as np
import pandas as pd
import gc
import time
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import xgboost as xgb
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [34]:
train=pd.read_csv('train_model.csv')

In [35]:
train= train.sample(n=10000000)

In [36]:
train.shape

(10000000, 24)

In [37]:
test=pd.read_csv('test_model.csv')

In [38]:
feats = [f for f in train.columns if f not in ['key','pickup_datetime','fare_amount']]

In [None]:
folds = KFold(n_splits= 5, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
feature_importance_df = pd.DataFrame()
dtest=xgb.DMatrix(test[feats])
    
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], train['fare_amount'])):
    dtrain = xgb.DMatrix(train[feats].iloc[train_idx],train['fare_amount'].iloc[train_idx])
    dvalid = xgb.DMatrix(train[feats].iloc[valid_idx], train['fare_amount'].iloc[valid_idx])
    valid_y=train['fare_amount'].iloc[valid_idx]

       # xgb
    params = {'eval_metric': 'rmse',
              'objective': 'reg:linear',
              'booster':'gbtree',
              #'tree_method': 'auto',
              'nthread' : 4,
              'eta' : 0.03,
              'max_leaves': 40,
              'max_depth' : 7,
              'max_bin': 300,
              'min_child_weight' : 4,
              'subsample' : 0.8,
              'colsample_bytree' : 0.9,
              'colsample_bylevel' : 0.9,
              'reg_alpha' : 0.1,
              'reg_lambda' : 0.1,
              'gamma':0}
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        
    model=xgb.train(params, dtrain, 20000, watchlist, maximize=False, early_stopping_rounds = 500, verbose_eval=100)
    oof_preds[valid_idx] = model.predict(dvalid, ntree_limit=model.best_ntree_limit)
    sub_preds += model.predict(dtest,ntree_limit=model.best_ntree_limit) / folds.n_splits

    fold_importance_df = pd.DataFrame()
    fold_importance_df = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('Fold %2d rmse : %.6f' % (n_fold + 1,mean_squared_error(valid_y, oof_preds[valid_idx]) ** .5)) 
    del model, dtrain, dvalid
    gc.collect()

print('Full rmse %.6f' % mean_squared_error(train['fare_amount'], oof_preds)**.5)
# Write submission file and plot feature importance
sub_df = test[['key']].copy()
sub_df['fare_amount'] = sub_preds
sub_df[['key', 'fare_amount']].to_csv('submission_xgb.csv', index= False)

[0]	train-rmse:14.1105	valid-rmse:14.1126
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[100]	train-rmse:4.01269	valid-rmse:3.98765
[200]	train-rmse:3.80415	valid-rmse:3.78697
[300]	train-rmse:3.72848	valid-rmse:3.72576
[400]	train-rmse:3.67386	valid-rmse:3.68594
[500]	train-rmse:3.63393	valid-rmse:3.65911
[600]	train-rmse:3.60041	valid-rmse:3.63905
[700]	train-rmse:3.57438	valid-rmse:3.62524
[800]	train-rmse:3.55158	valid-rmse:3.61502
[900]	train-rmse:3.52806	valid-rmse:3.60459
[1000]	train-rmse:3.50815	valid-rmse:3.59692
[1100]	train-rmse:3.49031	valid-rmse:3.59111
[1200]	train-rmse:3.47451	valid-rmse:3.58602
[1300]	train-rmse:3.46049	valid-rmse:3.58195
[1400]	train-rmse:3.44719	valid-rmse:3.57837
[1500]	train-rmse:3.43444	valid-rmse:3.57483
[1600]	train-rmse:3.4219	valid-rmse:3.57212
[1700]	train-rmse:3.40989	valid-rmse:3.5699
[1800]	train-rmse:3.3992	valid-rmse:3.56827
[1900]	train-r

In [None]:
cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
plt.figure(figsize=(8, 10))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout
plt.show()

In [28]:
from bayes_opt import BayesianOptimization

In [29]:
feats = [f for f in train.columns if f not in ['key','pickup_datetime','fare_amount']]
X_train=train[feats]
y_train=train['fare_amount']
X_test=test[feats]
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [30]:
def xgb_evaluate(max_depth, subsample,gamma, colsample_bytree, max_leaves,max_bin, min_child_weight,colsample_bylevel,\
                reg_alpha,reg_lambda):
    params = {'eval_metric': 'rmse',\
              'objective': 'reg:linear',\
              'booster':'gbtree',\
              'max_depth': int(max_depth),\
              'subsample': subsample,\
              'eta': 0.03,\
              'gamma': gamma,\
              'colsample_bytree': colsample_bytree,\
             'max_leaves': int(max_leaves),\
              'max_bin':int(max_bin),\
              'min_child_weight':min_child_weight,\
              'colsample_bylevel':colsample_bylevel,\
              'reg_alpha':reg_alpha,\
              'reg_lambda':reg_lambda}
    cv_result = xgb.cv(params, dtrain, num_boost_round=200000, nfold=5)    
    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [32]:
xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (-1, 10), \
                                             'gamma': (0, 1),\
                                             'subsample': (0.6, 1.0),\
                                            'max_leaves': (20, 200),\
                                            'colsample_bytree': (0.6, 1.0),\
                                            'reg_lambda': (0, 1),\
                                            'reg_alpha': (0, 1),\
                                            'max_bin':(180,500),\
                                            'colsample_bylevel':(0.6,1.0),\
                                            'min_child_weight': (3, 20)})
# Use the expected improvement acquisition function to handle negative numbers
xgb_bo.maximize(init_points=3, n_iter=5, acq='ei')



[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bylevel |   colsample_bytree |     gamma |   max_bin |   max_depth |   max_leaves |   min_child_weight |   reg_alpha |   reg_lambda |   subsample | 
    1 | 00m03s | [35m  -4.65932[0m | [32m             0.8530[0m | [32m            0.9978[0m | [32m   0.7577[0m | [32m 436.9648[0m | [32m     6.0524[0m | [32m     92.2609[0m | [32m            5.4707[0m | [32m     0.3350[0m | [32m      0.2484[0m | [32m     0.8506[0m | 
    2 | 00m02s |   -4.68456 |              0.8690 |             0.7498 |    0.3695 |  364.7349 |      5.8246 |      57.6579 |            15.0731 |      0.8086 |       0.0897 |      0.9718 | 
    3 | 00m02s |   -4.69847 |              0.8420 |             0.7843 |    0.4752 |  193.6908 |      4.8



[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bylevel |   colsample_bytree |     gamma |   max_bin |   max_depth |   max_leaves |   min_child_weight |   reg_alpha |   reg_lambda |   subsample | 




    4 | 00m06s |   -9.74844 |              0.9536 |             0.8915 |    0.0296 |  277.9087 |      0.3343 |      58.8304 |            11.2454 |      0.3916 |       0.7779 |      0.6061 | 




    5 | 00m04s |   -4.86705 |              0.8981 |             0.8628 |    0.1689 |  350.5171 |      2.8242 |      49.2359 |             7.2682 |      0.0952 |       0.9924 |      0.7445 | 




    6 | 00m04s |   -9.74865 |              0.8979 |             0.7157 |    0.5696 |  378.7936 |     -0.7659 |      98.9150 |            15.5226 |      0.8186 |       0.0473 |      0.7983 | 




    7 | 00m04s |   -5.29935 |              0.8385 |             0.8096 |    0.2500 |  231.9922 |      1.4518 |      59.6688 |             9.5293 |      0.7211 |       0.9625 |      0.7660 | 




    8 | 00m04s |   -9.74857 |              0.8946 |             0.8119 |    0.9381 |  457.8497 |      0.8163 |     139.3985 |             6.4370 |      0.1945 |       0.0880 |      0.9847 | 
