In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

import pickle
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

# Import data and take a look at it

In [2]:
sample_df = pd.read_csv("train.csv")

In [3]:
sample_df.shape

(1458644, 11)

In [4]:
sample_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


# Data Preprocessing

In [5]:
sample_df["store_and_fwd_flag"].value_counts()

N    1450599
Y       8045
Name: store_and_fwd_flag, dtype: int64

In [6]:
#Convert character variables to numeric 
f = lambda x: 0 if x == 'N' else 1

sample_df["store_and_fwd_flag"] = sample_df["store_and_fwd_flag"].apply(lambda x: f(x))            

In [7]:
#Check result
sample_df["store_and_fwd_flag"].value_counts()

0    1450599
1       8045
Name: store_and_fwd_flag, dtype: int64

## Engineer features

In [8]:
#First, convert datetime strings into datetime
sample_df["dropoff_datetime"] = pd.to_datetime(sample_df["dropoff_datetime"], format='%Y-%m-%d %H:%M:%S')
sample_df["pickup_datetime"] = pd.to_datetime(sample_df["pickup_datetime"], format='%Y-%m-%d %H:%M:%S')

In [9]:
#Now construct other variables, like month, date, etc.
sample_df["pickup_month"] = sample_df["pickup_datetime"].dt.month
sample_df["pickup_day"] = sample_df["pickup_datetime"].dt.day
sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday #sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday_name
sample_df["pickup_hour"] = sample_df["pickup_datetime"].dt.hour
sample_df["pickup_minute"] = sample_df["pickup_datetime"].dt.minute

In [10]:
#Get latitude and longitude differences 
sample_df["latitude_difference"] = sample_df["dropoff_latitude"] - sample_df["pickup_latitude"]
sample_df["longitude_difference"] = sample_df["dropoff_longitude"] - sample_df["pickup_longitude"]

In [11]:
#Convert duration to minutes for easier interpretation
sample_df["trip_duration"] = sample_df["trip_duration"].apply(lambda x: round(x/60))

In [12]:
sample_df.head(1)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,8,3,14,0,17,24,-0.002335,0.017525


In [13]:
#Convert trip distance from longitude and latitude differences to Manhattan distance.
sample_df["trip_distance"] = 0.621371 * 6371 * (abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2)))))) + \
                                     abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2)))))))

In [14]:
sample_df.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,8,3,14,0,17,24,-0.002335,0.017525,1.372146
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,11,6,12,6,0,43,-0.007412,-0.019066,1.82944
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,35,1,19,1,11,35,-0.053852,-0.026306,5.538397
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,7,4,6,2,19,32,-0.013252,-0.002228,1.069567
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,7,3,26,5,13,30,-0.010689,0.00013,0.747485


## Add weather

In [15]:
weather_df = pd.read_csv("KNYC_Metars.csv")

In [16]:
weather_df.head()

Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions
0,2015-12-31 02:00:00,7.8,7.1,,0.89,1017.0,6.1,8.0,NNE,5.6,0.0,0.8,,Overcast
1,2015-12-31 03:00:00,7.2,5.9,,0.9,1016.5,5.6,12.9,Variable,7.4,0.0,0.3,,Overcast
2,2015-12-31 04:00:00,7.2,,,0.9,1016.7,5.6,12.9,Calm,0.0,0.0,0.0,,Overcast
3,2015-12-31 05:00:00,7.2,5.9,,0.86,1015.9,5.0,14.5,NW,7.4,0.0,0.0,,Overcast
4,2015-12-31 06:00:00,7.2,6.4,,0.9,1016.2,5.6,11.3,West,5.6,0.0,0.0,,Overcast


In [17]:
weather_df["Time"] = pd.to_datetime(weather_df["Time"])
weather_df["pickup_year"] = weather_df["Time"].dt.year
weather_df["pickup_month"] = weather_df["Time"].dt.month
weather_df["pickup_day"] = weather_df["Time"].dt.day
weather_df["pickup_hour"] = weather_df["Time"].dt.hour

In [18]:
weather_df.head(2)

Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions,pickup_year,pickup_month,pickup_day,pickup_hour
0,2015-12-31 02:00:00,7.8,7.1,,0.89,1017.0,6.1,8.0,NNE,5.6,0.0,0.8,,Overcast,2015,12,31,2
1,2015-12-31 03:00:00,7.2,5.9,,0.9,1016.5,5.6,12.9,Variable,7.4,0.0,0.3,,Overcast,2015,12,31,3


In [19]:
#Since I am looking at year 2016, I want to keep everything in this year, do this check just in case.
weather_df = weather_df[weather_df["pickup_year"] == 2016]
weather_df.head(2)

Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions,pickup_year,pickup_month,pickup_day,pickup_hour
22,2016-01-01 00:00:00,5.6,3.2,,0.58,1018.8,-2.2,16.1,WNW,11.1,0.0,0.0,,Overcast,2016,1,1,0
23,2016-01-01 01:00:00,5.6,4.0,,0.53,1018.5,-3.3,16.1,Variable,7.4,0.0,0.0,,Overcast,2016,1,1,1


In [20]:
#Merge weather data with my dataframe
sample_df = pd.merge(sample_df, weather_df[["Temp.", "pickup_month", "pickup_day", "pickup_hour", "Windchill", 
                                            "Humidity", "Pressure", "Dew Point", "Visibility", "Wind Dir", 
                                            "Wind Speed", "Gust Speed", "Precip", "Conditions"]], 
                                             how = "left", on = ["pickup_month", "pickup_day", "pickup_hour"])

In [21]:
sample_df.head(3)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance,Temp.,Windchill,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Conditions
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,8,3,14,0,17,24,-0.002335,0.017525,1.372146,4.4,-0.5,0.86,1017.5,2.2,8.0,ENE,27.8,57.4,0.3,Overcast
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,11,6,12,6,0,43,-0.007412,-0.019066,1.82944,28.9,,0.53,1006.6,18.3,16.1,West,7.4,0.0,0.0,Unknown
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,35,1,19,1,11,35,-0.053852,-0.026306,5.538397,-6.7,-14.3,0.46,1016.3,-16.7,16.1,West,24.1,46.3,0.0,Clear


In [22]:
#Look at weather conditions
sample_df["Conditions"].unique()

array(['Overcast', 'Unknown', 'Clear', 'Heavy Rain', nan, 'Haze',
       'Partly Cloudy', 'Mostly Cloudy', 'Light Rain', 'Light Snow',
       'Scattered Clouds', 'Snow', 'Rain', 'Heavy Snow',
       'Light Freezing Rain', 'Light Freezing Fog'], dtype=object)

In [23]:
#Codify weather conditions into buckets
sample_df["Conditions"] = sample_df["Conditions"].fillna('Unknown')

weather_dict = {'Overcast' : 0, 
                'Haze' : 0,
                'Partly Cloudy' : 0, 
                'Mostly Cloudy' : 0, 
                'Scattered Clouds' : 0, 
                'Light Freezing Fog' : 0,
                
                'Unknown' : 1,
                'Clear' : 2, 
                
                'Heavy Rain' : 3, 
                'Rain' : 3, 
                'Light Freezing Rain' : 3,
                'Light Rain' : 3, 
                
                'Heavy Snow' : 4,
                'Light Snow' : 4,
                'Snow' : 4}

In [24]:
#Transform the column
sample_df["Conditions"] = sample_df["Conditions"].apply(lambda x: weather_dict[x])            

In [25]:
#Look at wind directions
sample_df["Wind Dir"].unique()

array(['ENE', 'West', 'South', 'Variable', 'SW', 'Calm', 'North', 'WSW',
       'East', nan, 'WNW', 'NW', 'ESE', 'NE', 'SSW', 'SSE', 'SE', 'NNE',
       'NNW'], dtype=object)

In [26]:
#Codify wind directions
sample_df["Wind Dir"] = sample_df["Wind Dir"].fillna('Unknown')

wind_dir_dict = {'East' : 0,
                 'ENE' : 0, 
                 'ESE' : 0, 
                 
                 'West' : 1, 
                 'WSW' : 1,
                 'WNW' : 1,
                 
                 'South' : 2, 
                 'SSE' : 2,   
                 'SSW' : 2,
                 
                 'North' : 3, 
                 'NNE' : 3, 
                 'NNW' : 3,
                 
                 'Variable' : 4, 
                 'Calm' : 5, 
                 'SW' : 6, 
                 'NW' : 7, 
                 'NE' : 8, 
                 'SE' : 9, 
                 'Unknown' : 10
                }

In [27]:
#And transform the column
sample_df["Wind Dir"] = sample_df["Wind Dir"].apply(lambda x: wind_dir_dict[x]) 

In [28]:
#Check results
sample_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance,Temp.,Windchill,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Conditions
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,8,3,14,0,17,24,-0.002335,0.017525,1.372146,4.4,-0.5,0.86,1017.5,2.2,8.0,0,27.8,57.4,0.3,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,11,6,12,6,0,43,-0.007412,-0.019066,1.82944,28.9,,0.53,1006.6,18.3,16.1,1,7.4,0.0,0.0,1
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,35,1,19,1,11,35,-0.053852,-0.026306,5.538397,-6.7,-14.3,0.46,1016.3,-16.7,16.1,1,24.1,46.3,0.0,2
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,7,4,6,2,19,32,-0.013252,-0.002228,1.069567,7.2,3.3,0.39,1019.1,-6.1,16.1,2,25.9,35.2,0.0,2
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,7,3,26,5,13,30,-0.010689,0.00013,0.747485,9.4,,0.46,1026.9,-1.7,16.1,4,9.3,0.0,0.0,2


# Modeling

In [29]:
X = sample_df.drop(["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime"], axis=1)
y = sample_df["trip_duration"]

In [30]:
#Split the data into training, test, and valdiation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

In [31]:
#Define evaluation metric
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

In [32]:
#XGBoost parameters 
params = {
    'booster':            'gbtree',
    'fobj':               'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'verbose_eval' :      1,
    'feval':              'rmsle'
}
#"subsample" is the fraction of the training samples (randomly selected) that will be used to train each tree.
#"colsample_by_tree" is the fraction of features (randomly selected) that will be used to train each tree.
#"colsample_bylevel" is the fraction of features (randomly selected) that will be used in each node to train each tree.

In [33]:
nrounds = 2000

In [34]:
#Define train and validation sets
dtrain = lgb.Dataset(X_train, np.log(y_train+1))
dval = lgb.Dataset(X_val, np.log(y_val+1), reference=dtrain)

#this is for tracking the error
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [35]:
#Train model
bst = lgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                valid_sets = [dtrain, dval],
                valid_names = ['train', 'valid'],
                categorical_feature = [20, 24]
                )

New categorical_feature is [20, 24]


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2800
[LightGBM] [Info] Number of data points in the train set: 765787, number of used features: 25




[LightGBM] [Info] Start training from score 2.487283
[1]	train's l2: 0.45848	valid's l2: 0.457053
[2]	train's l2: 0.431951	valid's l2: 0.430452
[3]	train's l2: 0.414278	valid's l2: 0.412738
[4]	train's l2: 0.392002	valid's l2: 0.390404
[5]	train's l2: 0.372356	valid's l2: 0.370711
[6]	train's l2: 0.353949	valid's l2: 0.352265
[7]	train's l2: 0.337534	valid's l2: 0.335796
[8]	train's l2: 0.322397	valid's l2: 0.320608
[9]	train's l2: 0.311757	valid's l2: 0.309925
[10]	train's l2: 0.298949	valid's l2: 0.297082
[11]	train's l2: 0.287346	valid's l2: 0.285435
[12]	train's l2: 0.27687	valid's l2: 0.274932
[13]	train's l2: 0.267206	valid's l2: 0.265255
[14]	train's l2: 0.260245	valid's l2: 0.258299
[15]	train's l2: 0.252149	valid's l2: 0.250179
[16]	train's l2: 0.246225	valid's l2: 0.244239
[17]	train's l2: 0.239407	valid's l2: 0.237405
[18]	train's l2: 0.233104	valid's l2: 0.231074
[19]	train's l2: 0.227366	valid's l2: 0.225305
[20]	train's l2: 0.221921	valid's l2: 0.219855
[21]	train's l2: 0

[178]	train's l2: 0.13202	valid's l2: 0.130302
[179]	train's l2: 0.131915	valid's l2: 0.130199
[180]	train's l2: 0.131755	valid's l2: 0.130038
[181]	train's l2: 0.131698	valid's l2: 0.13
[182]	train's l2: 0.131639	valid's l2: 0.129947
[183]	train's l2: 0.131546	valid's l2: 0.12986
[184]	train's l2: 0.131492	valid's l2: 0.129809
[185]	train's l2: 0.131416	valid's l2: 0.129738
[186]	train's l2: 0.131366	valid's l2: 0.12969
[187]	train's l2: 0.131295	valid's l2: 0.129621
[188]	train's l2: 0.13112	valid's l2: 0.129445
[189]	train's l2: 0.131062	valid's l2: 0.129392
[190]	train's l2: 0.13101	valid's l2: 0.129342
[191]	train's l2: 0.130917	valid's l2: 0.129249
[192]	train's l2: 0.130856	valid's l2: 0.129209
[193]	train's l2: 0.130812	valid's l2: 0.129168
[194]	train's l2: 0.130717	valid's l2: 0.129079
[195]	train's l2: 0.130652	valid's l2: 0.129014
[196]	train's l2: 0.130613	valid's l2: 0.128979
[197]	train's l2: 0.130553	valid's l2: 0.12892
[198]	train's l2: 0.1304	valid's l2: 0.128767
[199

[358]	train's l2: 0.123721	valid's l2: 0.123065
[359]	train's l2: 0.123698	valid's l2: 0.123052
[360]	train's l2: 0.12365	valid's l2: 0.123007
[361]	train's l2: 0.123631	valid's l2: 0.122987
[362]	train's l2: 0.123574	valid's l2: 0.122936
[363]	train's l2: 0.123549	valid's l2: 0.122936
[364]	train's l2: 0.123501	valid's l2: 0.122887
[365]	train's l2: 0.123489	valid's l2: 0.122883
[366]	train's l2: 0.123474	valid's l2: 0.122875
[367]	train's l2: 0.123452	valid's l2: 0.122853
[368]	train's l2: 0.123426	valid's l2: 0.122852
[369]	train's l2: 0.12341	valid's l2: 0.122842
[370]	train's l2: 0.123398	valid's l2: 0.122833
[371]	train's l2: 0.123382	valid's l2: 0.122829
[372]	train's l2: 0.123364	valid's l2: 0.122817
[373]	train's l2: 0.123345	valid's l2: 0.1228
[374]	train's l2: 0.123329	valid's l2: 0.122789
[375]	train's l2: 0.123306	valid's l2: 0.122785
[376]	train's l2: 0.123287	valid's l2: 0.122772
[377]	train's l2: 0.123266	valid's l2: 0.122759
[378]	train's l2: 0.123249	valid's l2: 0.122

[531]	train's l2: 0.120066	valid's l2: 0.120538
[532]	train's l2: 0.120041	valid's l2: 0.120523
[533]	train's l2: 0.120001	valid's l2: 0.120489
[534]	train's l2: 0.119992	valid's l2: 0.120489
[535]	train's l2: 0.119967	valid's l2: 0.120473
[536]	train's l2: 0.119958	valid's l2: 0.120466
[537]	train's l2: 0.119938	valid's l2: 0.120448
[538]	train's l2: 0.11991	valid's l2: 0.120426
[539]	train's l2: 0.119895	valid's l2: 0.120413
[540]	train's l2: 0.119875	valid's l2: 0.120402
[541]	train's l2: 0.119855	valid's l2: 0.120401
[542]	train's l2: 0.119842	valid's l2: 0.120394
[543]	train's l2: 0.119817	valid's l2: 0.120374
[544]	train's l2: 0.119792	valid's l2: 0.12035
[545]	train's l2: 0.119771	valid's l2: 0.120335
[546]	train's l2: 0.119747	valid's l2: 0.120317
[547]	train's l2: 0.119722	valid's l2: 0.120299
[548]	train's l2: 0.119711	valid's l2: 0.120297
[549]	train's l2: 0.119692	valid's l2: 0.120296
[550]	train's l2: 0.119647	valid's l2: 0.120253
[551]	train's l2: 0.119628	valid's l2: 0.1

[714]	train's l2: 0.117112	valid's l2: 0.118811
[715]	train's l2: 0.117099	valid's l2: 0.118805
[716]	train's l2: 0.117077	valid's l2: 0.118802
[717]	train's l2: 0.117064	valid's l2: 0.118794
[718]	train's l2: 0.117048	valid's l2: 0.118781
[719]	train's l2: 0.11704	valid's l2: 0.118777
[720]	train's l2: 0.11703	valid's l2: 0.118776
[721]	train's l2: 0.117004	valid's l2: 0.118751
[722]	train's l2: 0.116988	valid's l2: 0.118739
[723]	train's l2: 0.116976	valid's l2: 0.11873
[724]	train's l2: 0.116962	valid's l2: 0.118719
[725]	train's l2: 0.116952	valid's l2: 0.118719
[726]	train's l2: 0.11694	valid's l2: 0.118718
[727]	train's l2: 0.116933	valid's l2: 0.118715
[728]	train's l2: 0.116923	valid's l2: 0.118714
[729]	train's l2: 0.116903	valid's l2: 0.118696
[730]	train's l2: 0.116883	valid's l2: 0.118678
[731]	train's l2: 0.116875	valid's l2: 0.118678
[732]	train's l2: 0.116848	valid's l2: 0.118653
[733]	train's l2: 0.116835	valid's l2: 0.118643
[734]	train's l2: 0.11682	valid's l2: 0.1186

[898]	train's l2: 0.11467	valid's l2: 0.117509
[899]	train's l2: 0.114658	valid's l2: 0.117504
[900]	train's l2: 0.114634	valid's l2: 0.117485
[901]	train's l2: 0.114629	valid's l2: 0.117483
[902]	train's l2: 0.114619	valid's l2: 0.11748
[903]	train's l2: 0.114608	valid's l2: 0.117474
[904]	train's l2: 0.114595	valid's l2: 0.117461
[905]	train's l2: 0.114579	valid's l2: 0.117449
[906]	train's l2: 0.11457	valid's l2: 0.117448
[907]	train's l2: 0.114559	valid's l2: 0.117442
[908]	train's l2: 0.114549	valid's l2: 0.117435
[909]	train's l2: 0.114541	valid's l2: 0.117437
[910]	train's l2: 0.114515	valid's l2: 0.117417
[911]	train's l2: 0.11451	valid's l2: 0.117416
[912]	train's l2: 0.114501	valid's l2: 0.117409
[913]	train's l2: 0.114483	valid's l2: 0.117403
[914]	train's l2: 0.114473	valid's l2: 0.117399
[915]	train's l2: 0.114467	valid's l2: 0.117396
[916]	train's l2: 0.114454	valid's l2: 0.117385
[917]	train's l2: 0.114442	valid's l2: 0.117376
[918]	train's l2: 0.114429	valid's l2: 0.117

[1070]	train's l2: 0.112806	valid's l2: 0.116744
[1071]	train's l2: 0.112798	valid's l2: 0.116745
[1072]	train's l2: 0.112793	valid's l2: 0.116742
[1073]	train's l2: 0.112785	valid's l2: 0.116736
[1074]	train's l2: 0.112777	valid's l2: 0.116733
[1075]	train's l2: 0.112774	valid's l2: 0.116732
[1076]	train's l2: 0.112766	valid's l2: 0.116732
[1077]	train's l2: 0.112757	valid's l2: 0.116728
[1078]	train's l2: 0.112746	valid's l2: 0.116721
[1079]	train's l2: 0.112734	valid's l2: 0.116713
[1080]	train's l2: 0.112721	valid's l2: 0.116706
[1081]	train's l2: 0.112711	valid's l2: 0.116707
[1082]	train's l2: 0.112702	valid's l2: 0.1167
[1083]	train's l2: 0.112681	valid's l2: 0.116685
[1084]	train's l2: 0.112672	valid's l2: 0.116684
[1085]	train's l2: 0.112661	valid's l2: 0.116676
[1086]	train's l2: 0.112658	valid's l2: 0.116675
[1087]	train's l2: 0.112647	valid's l2: 0.116669
[1088]	train's l2: 0.112639	valid's l2: 0.116664
[1089]	train's l2: 0.112626	valid's l2: 0.116656
[1090]	train's l2: 0.1

[1251]	train's l2: 0.111027	valid's l2: 0.116041
[1252]	train's l2: 0.111011	valid's l2: 0.116042
[1253]	train's l2: 0.111001	valid's l2: 0.11604
[1254]	train's l2: 0.110997	valid's l2: 0.116038
[1255]	train's l2: 0.110975	valid's l2: 0.116017
[1256]	train's l2: 0.110964	valid's l2: 0.116009
[1257]	train's l2: 0.110954	valid's l2: 0.116011
[1258]	train's l2: 0.110946	valid's l2: 0.116005
[1259]	train's l2: 0.110941	valid's l2: 0.116004
[1260]	train's l2: 0.110934	valid's l2: 0.116001
[1261]	train's l2: 0.110926	valid's l2: 0.116002
[1262]	train's l2: 0.110919	valid's l2: 0.116002
[1263]	train's l2: 0.11091	valid's l2: 0.116002
[1264]	train's l2: 0.110903	valid's l2: 0.116
[1265]	train's l2: 0.110894	valid's l2: 0.115996
[1266]	train's l2: 0.110886	valid's l2: 0.115992
[1267]	train's l2: 0.110881	valid's l2: 0.11599
[1268]	train's l2: 0.11087	valid's l2: 0.115989
[1269]	train's l2: 0.110863	valid's l2: 0.115987
[1270]	train's l2: 0.110856	valid's l2: 0.115986
[1271]	train's l2: 0.110849

[1432]	train's l2: 0.109451	valid's l2: 0.115316
[1433]	train's l2: 0.10944	valid's l2: 0.115308
[1434]	train's l2: 0.109431	valid's l2: 0.115302
[1435]	train's l2: 0.109419	valid's l2: 0.115296
[1436]	train's l2: 0.109411	valid's l2: 0.115294
[1437]	train's l2: 0.109405	valid's l2: 0.115292
[1438]	train's l2: 0.109393	valid's l2: 0.115289
[1439]	train's l2: 0.109386	valid's l2: 0.115285
[1440]	train's l2: 0.109375	valid's l2: 0.115274
[1441]	train's l2: 0.109367	valid's l2: 0.115273
[1442]	train's l2: 0.10936	valid's l2: 0.115275
[1443]	train's l2: 0.109351	valid's l2: 0.115269
[1444]	train's l2: 0.10934	valid's l2: 0.11526
[1445]	train's l2: 0.109324	valid's l2: 0.115244
[1446]	train's l2: 0.109321	valid's l2: 0.115242
[1447]	train's l2: 0.109313	valid's l2: 0.115238
[1448]	train's l2: 0.109299	valid's l2: 0.115228
[1449]	train's l2: 0.109294	valid's l2: 0.115226
[1450]	train's l2: 0.109279	valid's l2: 0.115215
[1451]	train's l2: 0.10927	valid's l2: 0.11521
[1452]	train's l2: 0.10926

[1612]	train's l2: 0.108111	valid's l2: 0.114827
[1613]	train's l2: 0.108099	valid's l2: 0.114823
[1614]	train's l2: 0.108095	valid's l2: 0.114822
[1615]	train's l2: 0.108087	valid's l2: 0.114818
[1616]	train's l2: 0.108082	valid's l2: 0.114814
[1617]	train's l2: 0.108075	valid's l2: 0.11481
[1618]	train's l2: 0.10807	valid's l2: 0.114809
[1619]	train's l2: 0.108052	valid's l2: 0.114797
[1620]	train's l2: 0.108049	valid's l2: 0.114797
[1621]	train's l2: 0.108039	valid's l2: 0.11479
[1622]	train's l2: 0.108026	valid's l2: 0.114785
[1623]	train's l2: 0.108019	valid's l2: 0.114784
[1624]	train's l2: 0.108013	valid's l2: 0.11478
[1625]	train's l2: 0.107999	valid's l2: 0.11478
[1626]	train's l2: 0.10799	valid's l2: 0.114775
[1627]	train's l2: 0.107986	valid's l2: 0.114776
[1628]	train's l2: 0.107982	valid's l2: 0.114776
[1629]	train's l2: 0.107978	valid's l2: 0.114775
[1630]	train's l2: 0.107971	valid's l2: 0.114773
[1631]	train's l2: 0.107964	valid's l2: 0.114767
[1632]	train's l2: 0.10795

[1789]	train's l2: 0.106824	valid's l2: 0.114422
[1790]	train's l2: 0.106821	valid's l2: 0.114422
[1791]	train's l2: 0.106806	valid's l2: 0.11442
[1792]	train's l2: 0.106796	valid's l2: 0.114421
[1793]	train's l2: 0.10679	valid's l2: 0.114419
[1794]	train's l2: 0.106782	valid's l2: 0.114412
[1795]	train's l2: 0.106773	valid's l2: 0.114406
[1796]	train's l2: 0.106768	valid's l2: 0.114405
[1797]	train's l2: 0.106763	valid's l2: 0.114403
[1798]	train's l2: 0.10676	valid's l2: 0.114403
[1799]	train's l2: 0.106758	valid's l2: 0.114402
[1800]	train's l2: 0.106752	valid's l2: 0.114399
[1801]	train's l2: 0.10675	valid's l2: 0.114399
[1802]	train's l2: 0.106743	valid's l2: 0.114397
[1803]	train's l2: 0.10674	valid's l2: 0.114397
[1804]	train's l2: 0.106733	valid's l2: 0.114398
[1805]	train's l2: 0.106721	valid's l2: 0.114398
[1806]	train's l2: 0.10671	valid's l2: 0.114391
[1807]	train's l2: 0.106701	valid's l2: 0.114383
[1808]	train's l2: 0.106694	valid's l2: 0.114383
[1809]	train's l2: 0.10668

[1960]	train's l2: 0.105751	valid's l2: 0.114174
[1961]	train's l2: 0.105742	valid's l2: 0.114169
[1962]	train's l2: 0.105733	valid's l2: 0.114165
[1963]	train's l2: 0.105728	valid's l2: 0.114166
[1964]	train's l2: 0.105718	valid's l2: 0.114165
[1965]	train's l2: 0.105713	valid's l2: 0.114164
[1966]	train's l2: 0.105707	valid's l2: 0.114161
[1967]	train's l2: 0.105701	valid's l2: 0.114159
[1968]	train's l2: 0.105698	valid's l2: 0.114157
[1969]	train's l2: 0.105695	valid's l2: 0.114156
[1970]	train's l2: 0.105689	valid's l2: 0.114155
[1971]	train's l2: 0.105678	valid's l2: 0.114146
[1972]	train's l2: 0.105672	valid's l2: 0.114142
[1973]	train's l2: 0.10567	valid's l2: 0.114142
[1974]	train's l2: 0.105666	valid's l2: 0.114143
[1975]	train's l2: 0.105655	valid's l2: 0.114134
[1976]	train's l2: 0.10565	valid's l2: 0.114133
[1977]	train's l2: 0.105644	valid's l2: 0.114131
[1978]	train's l2: 0.10564	valid's l2: 0.114129
[1979]	train's l2: 0.105633	valid's l2: 0.114124
[1980]	train's l2: 0.10

In [36]:
#Test predictions
pred = np.exp(bst.predict(X_test)) - 1

In [37]:
#Use mean absolute error to get a basic estimate of the error
mae = (abs(pred - y_test)).mean()
mae

4.9521537796449415

In [38]:
filename = "lightGBM_model.sav"
pickle.dump(bst, open(filename, 'wb'))