In [1]:
# Import all Libraries
'''
Problem statement: Predict total fare for each flight for carriers L1, L2 and L3 on a given date

'''
import os
import numpy as np
import pandas as pd
from datetime import datetime

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from sklearn import metrics
# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
# Standard plotly imports
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [2]:
train_folder = './training/'
test_folder = './test/'

In [3]:
service_index_df = pd.read_csv(os.path.join(train_folder,'service_index.csv'))
train_schedules_df = pd.read_csv(os.path.join(train_folder,'train_schedules.csv'))
train_fares_df = pd.read_csv(os.path.join(train_folder,'train_fares.csv'))

In [4]:
# service_index_df = pd.read_csv(os.path.join(train_folder,'service_index.csv'))
test_schedules_df = pd.read_csv(os.path.join(test_folder,'test_schedules.csv'))
test_fares_df = pd.read_csv(os.path.join(test_folder,'test_fares_data.csv'))

In [5]:
service_index_df = service_index_df[service_index_df['carrier'].isin(['L1', 'L2','L3'])]
train_schedules_df = train_schedules_df[train_schedules_df['carrier'].isin(['L1', 'L2','L3'])]
train_fares_df = train_fares_df[train_fares_df['carrier'].isin(['L1', 'L2','L3'])]


In [6]:
# test dataset
test_schedules_df = test_schedules_df[test_schedules_df['carrier'].isin(['L1', 'L2','L3'])]
test_fares_df = test_fares_df[test_fares_df['carrier'].isin(['L1', 'L2','L3'])]

In [7]:
service_index_df.drop('Unnamed: 0',1,inplace=True)
train_schedules_df.drop('Unnamed: 0',1,inplace=True)
train_fares_df.drop('Unnamed: 0',1,inplace=True)

In [8]:
#test dataset
test_schedules_df.drop('Unnamed: 0',1,inplace=True)
test_fares_df.drop('Unnamed: 0',1,inplace=True)

In [9]:
# convert mm/dd/yyyy to yyyy-mm-dd
test_fares_df['observation_date']=test_fares_df['observation_date'].apply(lambda x :datetime.strptime(x, '%m/%d/%Y').strftime('%Y-%m-%d')) 
test_fares_df['flt_departure_dt']=test_fares_df['flt_departure_dt'].apply(lambda x :datetime.strptime(x, '%m/%d/%Y').strftime('%Y-%m-%d')) 

In [86]:
train_fares_df.dtypes

origin              object
destination         object
carrier             object
flt_num              int64
flt_departure_dt    object
observation_date    object
total_fare           int64
origin_city         object
destination_city    object
dtype: object

In [87]:
train_fares_df.shape

(1493689, 9)

In [88]:
train_schedules_df.dtypes

carrier                     object
flt_num                      int64
origin                      object
destination                 object
flt_departure_dt            object
flt_departure_local_time    object
flt_arrival_local_time      object
flt_departure_gmt           object
flt_arrival_gmt             object
dtype: object

In [89]:
train_schedules_df.shape

(35374, 9)

In [91]:
service_index_df.dtypes

yr                int64
mo                int64
origin           object
destination      object
carrier          object
scaled_demand     int64
scaled_share      int64
dtype: object

In [90]:
service_index_df.shape

(11546, 7)

In [14]:
train_merged_df = pd.merge(train_fares_df,train_schedules_df, on=['flt_num','flt_departure_dt','carrier','origin','destination'])

In [15]:
#test dataset
test_merged_df = pd.merge(test_fares_df,test_schedules_df, on=['flt_num','flt_departure_dt','carrier','origin','destination'])

In [16]:
train_merged_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt
0,Airport4,Airport43,L1,5911,2018-01-03,2017-11-29,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0
1,Airport4,Airport43,L1,5911,2018-01-03,2017-12-12,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0
2,Airport4,Airport43,L1,5911,2018-01-03,2017-12-23,651,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0
3,Airport4,Airport43,L1,5911,2018-01-03,2017-11-30,714,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0
4,Airport4,Airport43,L1,5911,2018-01-03,2017-12-02,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0


In [17]:
train_merged_df['departure_yr']=train_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[0]))
train_merged_df['departure_mo']=train_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[1]))
train_merged_df['departure_dt']=train_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[2]))

In [18]:
train_merged_df['yr']=train_merged_df['observation_date'].apply(lambda x: int(x.split('-')[0]))
train_merged_df['mo']=train_merged_df['observation_date'].apply(lambda x: int(x.split('-')[1]))
train_merged_df['dt']=train_merged_df['observation_date'].apply(lambda x: int(x.split('-')[2]))

In [19]:
# # test dataset

test_merged_df['departure_yr']=test_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[0]))
test_merged_df['departure_mo']=test_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[1]))
test_merged_df['departure_dt']=test_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[2]))


test_merged_df['yr']=test_merged_df['observation_date'].apply(lambda x: int(x.split('-')[0]))
test_merged_df['mo']=test_merged_df['observation_date'].apply(lambda x: int(x.split('-')[1]))
test_merged_df['dt']=test_merged_df['observation_date'].apply(lambda x: int(x.split('-')[2]))

In [21]:
train_df = pd.merge(train_merged_df, service_index_df, on=['yr','mo','carrier','origin','destination'])

In [22]:
test_df = pd.merge(test_merged_df, service_index_df, on=['yr','mo','carrier','origin','destination'])

In [23]:
train_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,departure_yr,departure_mo,departure_dt,yr,mo,dt,scaled_demand,scaled_share
0,Airport4,Airport43,L1,5911,2018-01-03,2017-12-12,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,12,1649,322
1,Airport4,Airport43,L1,5911,2018-01-03,2017-12-23,651,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,23,1649,322
2,Airport4,Airport43,L1,5911,2018-01-03,2017-12-02,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,2,1649,322
3,Airport4,Airport43,L1,5911,2018-01-03,2017-12-06,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,6,1649,322
4,Airport4,Airport43,L1,5911,2018-01-03,2017-12-03,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,3,1649,322


In [24]:
test_merged_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,departure_yr,departure_mo,departure_dt,yr,mo,dt
0,Airport4,Airport43,L2,7465,2019-01-01,2018-11-27,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,11,27
1,Airport4,Airport43,L2,7465,2019-01-01,2018-11-28,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,11,28
2,Airport4,Airport43,L2,7465,2019-01-01,2018-12-07,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,12,7
3,Airport4,Airport43,L2,7465,2019-01-01,2018-12-05,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,12,5
4,Airport4,Airport43,L2,7465,2019-01-01,2018-12-01,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,12,1


In [25]:
train_df['flt_departure_dt']=train_df['flt_departure_dt'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))

train_df['observation_date']=train_df['observation_date'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))

train_df['flt_departure_local_time']=train_df['flt_departure_local_time'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

train_df['flt_arrival_local_time']=train_df['flt_arrival_local_time'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

train_df['flt_departure_gmt']=train_df['flt_departure_gmt'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
                                                                                
train_df['flt_arrival_gmt']=train_df['flt_arrival_gmt'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
                                                                                  

In [26]:
# test dataset

test_df['flt_departure_dt']=test_df['flt_departure_dt'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))

test_df['observation_date']=test_df['observation_date'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))

test_df['flt_departure_local_time']=test_df['flt_departure_local_time'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

test_df['flt_arrival_local_time']=test_df['flt_arrival_local_time'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

test_df['flt_departure_gmt']=test_df['flt_departure_gmt'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
                                                                                
test_df['flt_arrival_gmt']=test_df['flt_arrival_gmt'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
                                                                                  

In [27]:
train_df['flight_travel_duration'] = train_df['flt_arrival_gmt']-train_df['flt_departure_gmt']

In [28]:
train_df['flight_travel_duration'] = train_df['flight_travel_duration'].apply(lambda x: x.seconds/3600)

In [29]:
test_df['flight_travel_duration'] = test_df['flt_arrival_gmt']-test_df['flt_departure_gmt']
test_df['flight_travel_duration'] = test_df['flight_travel_duration'].apply(lambda x: x.seconds/3600)

In [33]:
train_df.tail()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,departure_yr,departure_mo,departure_dt,yr,mo,dt,scaled_demand,scaled_share,flight_travel_duration
1492090,Airport43,Airport4,L2,7092,2018-12-16,2018-12-04,844,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,4,1618,306,2.766667
1492091,Airport43,Airport4,L2,7092,2018-12-16,2018-12-08,1202,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,8,1618,306,2.766667
1492092,Airport43,Airport4,L2,7092,2018-12-16,2018-12-06,844,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,6,1618,306,2.766667
1492093,Airport43,Airport4,L2,7092,2018-12-16,2018-12-05,844,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,5,1618,306,2.766667
1492094,Airport43,Airport4,L2,7092,2018-12-16,2018-12-07,1202,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,7,1618,306,2.766667


In [129]:
len(train_df.columns)

27

In [38]:
train_df['flt_departure_dayofyear'] = train_df.flt_departure_dt.dt.dayofyear
train_df['flt_departure_weekday'] = train_df.flt_departure_dt.dt.weekday
train_df['flt_departure_week'] = train_df.flt_departure_dt.dt.week
train_df['flt_departure_quarter'] = train_df.flt_departure_dt.dt.quarter

train_df['observation_date_dayofyear'] = train_df.observation_date.dt.dayofyear
train_df['observation_date_weekday'] = train_df.observation_date.dt.weekday
train_df['observation_date_quarter'] = train_df.observation_date.dt.quarter
train_df['observation_date_week'] = train_df.observation_date.dt.week

In [40]:
test_df['flt_departure_dayofyear'] = test_df.flt_departure_dt.dt.dayofyear
test_df['flt_departure_weekday'] = test_df.flt_departure_dt.dt.weekday
test_df['flt_departure_week'] = test_df.flt_departure_dt.dt.week
test_df['flt_departure_quarter'] = test_df.flt_departure_dt.dt.quarter

test_df['observation_date_dayofyear'] = test_df.observation_date.dt.dayofyear
test_df['observation_date_weekday'] = test_df.observation_date.dt.weekday
test_df['observation_date_week'] = test_df.observation_date.dt.week
test_df['observation_date_quarter'] = test_df.observation_date.dt.quarter

In [41]:
def get_part_of_day(hour):
    return (
    "morning" if 5 <= hour <= 11
    else
    "afternoon" if 12 <= hour <= 17
    else
    "evening" if 18 <= hour <= 22
    else
    "night"
        )

In [42]:
train_df['_flt_departure_local_time'] = train_df.flt_departure_local_time.apply(lambda x: get_part_of_day(x.hour))
train_df['_flt_arrival_local_time'] = train_df.flt_arrival_local_time.apply(lambda x: get_part_of_day(x.hour))
train_df['_flt_departure_gmt'] = train_df.flt_departure_gmt.apply(lambda x: get_part_of_day(x.hour))
train_df['_flt_arrival_gmt'] = train_df.flt_arrival_gmt.apply(lambda x: get_part_of_day(x.hour))

In [43]:
# test dataset

test_df['_flt_departure_local_time'] = test_df.flt_departure_local_time.apply(lambda x: get_part_of_day(x.hour))
test_df['_flt_arrival_local_time'] = test_df.flt_arrival_local_time.apply(lambda x: get_part_of_day(x.hour))
test_df['_flt_departure_gmt'] = test_df.flt_departure_gmt.apply(lambda x: get_part_of_day(x.hour))
test_df['_flt_arrival_gmt'] = test_df.flt_arrival_gmt.apply(lambda x: get_part_of_day(x.hour))

In [44]:
cols_to_drop = ['flt_departure_dt','observation_date','flt_departure_local_time','flt_arrival_local_time',
                'flt_departure_gmt','flt_arrival_gmt']

In [45]:
[train_df.drop(col,1,inplace=True) for col in cols_to_drop]

[None, None, None, None, None, None]

In [46]:
[test_df.drop(col,1,inplace=True) for col in cols_to_drop]

[None, None, None, None, None, None]

In [78]:
train_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,origin_city,destination_city,departure_yr,departure_mo,departure_dt,yr,mo,dt,scaled_demand,scaled_share,flight_travel_duration,flt_departure_dayofyear,flt_departure_weekday,flt_departure_week,flt_departure_quarter,observation_date_dayofyear,observation_date_weekday,observation_date_week,observation_date_quarter,_flt_departure_local_time,_flt_arrival_local_time,_flt_departure_gmt,_flt_arrival_gmt
0,Airport60,Airport30,L1,7629,City56,City27,2018,4,14,2018,3,22,6755,280,1.6,104,5,15,2,81,3,12,1,evening,night,morning,morning
1,Airport26,Airport30,L1,6759,City24,City27,2018,10,15,2018,9,19,2545,388,3.166667,288,0,42,4,262,2,38,3,afternoon,afternoon,evening,night
2,Airport31,Airport30,L2,8903,City28,City27,2018,12,30,2018,11,19,8086,221,1.216667,364,6,52,4,323,0,47,4,evening,night,morning,morning
3,Airport30,Airport60,L1,5330,City27,City56,2018,5,19,2018,5,5,7255,301,1.633333,139,5,20,2,125,5,18,2,morning,afternoon,evening,evening
4,Airport4,Airport20,L1,5889,City4,City19,2018,4,27,2018,4,1,2084,365,3.616667,117,4,17,2,91,6,13,2,afternoon,afternoon,afternoon,evening


In [121]:
train_df.flt_departure_dayofyear.value_counts().iplot(kind='bar',xTitle='Day of the Year',yTitle='Flight Frequency',title='Flight Frequency over the year')

In [122]:
train_df.flt_departure_week.value_counts().iplot(kind='bar',xTitle='week of the Year',yTitle='Flight Frequency',title='Flight Frequency over the year')

In [96]:
train_df.origin.value_counts().iplot(kind='bar',xTitle='Origin Airports', yTitle='Count',title='Frequency Distribution of Origin Airports')

In [95]:
train_df.destination.value_counts().iplot(kind='bar')

In [104]:
train_df.flt_num.value_counts()[:10]

18349    19586
15278    17672
5236     17104
14581    17060
5927     16447
1396     16359
3120     16076
2297     16074
2837     15506
2724     15153
Name: flt_num, dtype: int64

In [105]:
train_df.flt_num.value_counts()[:10].iplot(kind='bar',xTitle='Flight Number', yTitle='Count',title='Frequency Distribution of Top-10 Flight Numbers')

In [51]:
train_df =train_df.sample(frac=1).reset_index(drop=True)

In [52]:
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [53]:
target = train_df['total_fare']
train_df.drop('total_fare',1,inplace=True)

In [134]:
len(X.columns)

60

In [54]:
X = pd.get_dummies(train_df, prefix_sep='_', drop_first=True)

In [55]:
X.head()

Unnamed: 0,flt_num,departure_yr,departure_mo,departure_dt,yr,mo,dt,scaled_demand,scaled_share,flight_travel_duration,flt_departure_dayofyear,flt_departure_weekday,flt_departure_week,flt_departure_quarter,observation_date_dayofyear,observation_date_weekday,observation_date_week,observation_date_quarter,origin_Airport20,origin_Airport26,origin_Airport30,origin_Airport31,origin_Airport4,origin_Airport43,origin_Airport60,destination_Airport20,destination_Airport26,destination_Airport30,destination_Airport31,destination_Airport4,destination_Airport43,destination_Airport60,carrier_L2,carrier_L3,origin_city_City19,origin_city_City24,origin_city_City27,origin_city_City28,origin_city_City39,origin_city_City4,origin_city_City56,destination_city_City19,destination_city_City24,destination_city_City27,destination_city_City28,destination_city_City39,destination_city_City4,destination_city_City56,_flt_departure_local_time_evening,_flt_departure_local_time_morning,_flt_departure_local_time_night,_flt_arrival_local_time_evening,_flt_arrival_local_time_morning,_flt_arrival_local_time_night,_flt_departure_gmt_evening,_flt_departure_gmt_morning,_flt_departure_gmt_night,_flt_arrival_gmt_evening,_flt_arrival_gmt_morning,_flt_arrival_gmt_night
0,7629,2018,4,14,2018,3,22,6755,280,1.6,104,5,15,2,81,3,12,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0
1,6759,2018,10,15,2018,9,19,2545,388,3.166667,288,0,42,4,262,2,38,3,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,8903,2018,12,30,2018,11,19,8086,221,1.216667,364,6,52,4,323,0,47,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0
3,5330,2018,5,19,2018,5,5,7255,301,1.633333,139,5,20,2,125,5,18,2,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0
4,5889,2018,4,27,2018,4,1,2084,365,3.616667,117,4,17,2,91,6,13,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [56]:
X_ = pd.get_dummies(test_df, prefix_sep='_', drop_first=True)

In [57]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [58]:
# trainX, testX, trainy, testy = train_test_split(train_df, target, test_size=0.1, random_state=42)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.1, random_state=42)

In [70]:
rf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [67]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 50, 100],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [68]:
# Fit the grid search to the data
grid_search_result = grid_search.fit(X, target)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [69]:
rf = RandomForestRegressor(bootstrap=True,max_depth=10,n_estimators= 200,n_jobs=-1)

In [71]:
y_pred=rf.predict(X_test)

In [60]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [72]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 226.63628488469635
Mean Squared Error: 133886.1453108795
Root Mean Squared Error: 365.9045576525107


## Evaluate gridsearch model

In [62]:
gridsearch = evaluate(rf,X_test,y_test)

Model Performance
Average Error: 73.9413 degrees.
Accuracy = 90.92%.


##  Evaluate default model

In [73]:
base_model = RandomForestRegressor(n_estimators = 100, random_state = 42)
base_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [138]:
!pip install pickle

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m
[31mERROR: No matching distribution found for pickle[0m
You should consider upgrading via the '/Users/subir/pythonenv/default/bin/python3 -m pip install --upgrade pip' command.[0m


In [140]:
import _pickle as cPickle


with open('randomforest_basemodel.pkl', 'wb') as f:
    cPickle.dump(base_model, f)


In [75]:
y_pred=base_model.predict(X_test)

In [123]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 54.305118158433196
Mean Squared Error: 25137.295538932667
Root Mean Squared Error: 158.54745516384887


In [141]:
import lightgbm as lgb
lgb_model = lgb.Booster(model_file='light_gbm.txt')
lgb_pred = lgb_model.predict(X_test)

In [142]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, lgb_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, lgb_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, lgb_pred)))

Mean Absolute Error: 105.43088835414349
Mean Squared Error: 27593.566681149423
Root Mean Squared Error: 166.1131141155009


In [149]:
y_joint = lgb_pred *0.5+ y_pred*0.5

In [150]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_joint))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_joint))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_joint)))

Mean Absolute Error: 72.3935681065824
Mean Squared Error: 20147.442044263644
Root Mean Squared Error: 141.94168536502463


In [None]:
## catboost
'''
Mean Absolute Error: 81.4103043710028
Mean Squared Error: 28821.818370900706
Root Mean Squared Error: 169.76989830620948
'''

## rf

'''
Mean Absolute Error: 54.305118158433196
Mean Squared Error: 25137.295538932667
Root Mean Squared Error: 158.54745516384887

'''


## lgb

'''
Mean Absolute Error: 78.85253053135875
Mean Squared Error: 26549.951343445744
Root Mean Squared Error: 162.94155806130536
'''