In [1]:
# Import all Libraries
'''
Problem statement: Predict total fare for each flight for carriers L1, L2 and L3 on a given date

'''
import os
import numpy as np
import pandas as pd
from datetime import datetime

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
# Standard plotly imports
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [65]:
train_folder = './training/'
test_folder = './test/'

In [66]:
service_index_df = pd.read_csv(os.path.join(train_folder,'service_index.csv'))
train_schedules_df = pd.read_csv(os.path.join(train_folder,'train_schedules.csv'))
train_fares_df = pd.read_csv(os.path.join(train_folder,'train_fares.csv'))

In [67]:
# service_index_df = pd.read_csv(os.path.join(train_folder,'service_index.csv'))
test_schedules_df = pd.read_csv(os.path.join(test_folder,'test_schedules.csv'))
test_fares_df = pd.read_csv(os.path.join(test_folder,'test_fares_data.csv'))

In [68]:
test_fares_df_org = test_fares_df.copy()

In [69]:
service_index_df = service_index_df[service_index_df['carrier'].isin(['L1', 'L2','L3'])]
train_schedules_df = train_schedules_df[train_schedules_df['carrier'].isin(['L1', 'L2','L3'])]
train_fares_df = train_fares_df[train_fares_df['carrier'].isin(['L1', 'L2','L3'])]


In [70]:
# test dataset
test_schedules_df = test_schedules_df[test_schedules_df['carrier'].isin(['L1', 'L2','L3'])]
test_fares_df = test_fares_df[test_fares_df['carrier'].isin(['L1', 'L2','L3'])]

In [71]:
service_index_df.drop('Unnamed: 0',1,inplace=True)
train_schedules_df.drop('Unnamed: 0',1,inplace=True)
train_fares_df.drop('Unnamed: 0',1,inplace=True)

In [72]:
#test dataset
test_schedules_df.drop('Unnamed: 0',1,inplace=True)
test_fares_df.drop('Unnamed: 0',1,inplace=True)

In [73]:
# convert mm/dd/yyyy to yyyy-mm-dd
test_fares_df['observation_date']=test_fares_df['observation_date'].apply(lambda x :datetime.strptime(x, '%m/%d/%Y').strftime('%Y-%m-%d')) 
test_fares_df['flt_departure_dt']=test_fares_df['flt_departure_dt'].apply(lambda x :datetime.strptime(x, '%m/%d/%Y').strftime('%Y-%m-%d')) 

In [12]:
train_fares_df.dtypes

origin              object
destination         object
carrier             object
flt_num              int64
flt_departure_dt    object
observation_date    object
total_fare           int64
origin_city         object
destination_city    object
dtype: object

In [13]:
train_fares_df.shape

(1493689, 9)

In [14]:
train_schedules_df.dtypes

carrier                     object
flt_num                      int64
origin                      object
destination                 object
flt_departure_dt            object
flt_departure_local_time    object
flt_arrival_local_time      object
flt_departure_gmt           object
flt_arrival_gmt             object
dtype: object

In [15]:
train_schedules_df.shape

(35374, 9)

In [16]:
service_index_df.dtypes

yr                int64
mo                int64
origin           object
destination      object
carrier          object
scaled_demand     int64
scaled_share      int64
dtype: object

In [17]:
service_index_df.shape

(11546, 7)

In [74]:
train_merged_df = pd.merge(train_fares_df,train_schedules_df, on=['flt_num','flt_departure_dt','carrier','origin','destination'])

In [75]:
#test dataset
test_merged_df = pd.merge(test_fares_df,test_schedules_df, on=['flt_num','flt_departure_dt','carrier','origin','destination'])

In [76]:
train_merged_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt
0,Airport4,Airport43,L1,5911,2018-01-03,2017-11-29,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0
1,Airport4,Airport43,L1,5911,2018-01-03,2017-12-12,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0
2,Airport4,Airport43,L1,5911,2018-01-03,2017-12-23,651,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0
3,Airport4,Airport43,L1,5911,2018-01-03,2017-11-30,714,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0
4,Airport4,Airport43,L1,5911,2018-01-03,2017-12-02,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0


In [21]:
train_merged_df['departure_yr']=train_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[0]))
train_merged_df['departure_mo']=train_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[1]))
train_merged_df['departure_dt']=train_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[2]))

In [22]:
train_merged_df['yr']=train_merged_df['observation_date'].apply(lambda x: int(x.split('-')[0]))
train_merged_df['mo']=train_merged_df['observation_date'].apply(lambda x: int(x.split('-')[1]))
train_merged_df['dt']=train_merged_df['observation_date'].apply(lambda x: int(x.split('-')[2]))

In [23]:
# # test dataset

test_merged_df['departure_yr']=test_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[0]))
test_merged_df['departure_mo']=test_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[1]))
test_merged_df['departure_dt']=test_merged_df['flt_departure_dt'].apply(lambda x: int(x.split('-')[2]))


test_merged_df['yr']=test_merged_df['observation_date'].apply(lambda x: int(x.split('-')[0]))
test_merged_df['mo']=test_merged_df['observation_date'].apply(lambda x: int(x.split('-')[1]))
test_merged_df['dt']=test_merged_df['observation_date'].apply(lambda x: int(x.split('-')[2]))

In [24]:
train_df = pd.merge(train_merged_df, service_index_df, on=['yr','mo','carrier','origin','destination'])

In [25]:
test_df = pd.merge(test_merged_df, service_index_df, on=['yr','mo','carrier','origin','destination'])

In [26]:
train_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,departure_yr,departure_mo,departure_dt,yr,mo,dt,scaled_demand,scaled_share
0,Airport4,Airport43,L1,5911,2018-01-03,2017-12-12,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,12,1649,322
1,Airport4,Airport43,L1,5911,2018-01-03,2017-12-23,651,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,23,1649,322
2,Airport4,Airport43,L1,5911,2018-01-03,2017-12-02,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,2,1649,322
3,Airport4,Airport43,L1,5911,2018-01-03,2017-12-06,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,6,1649,322
4,Airport4,Airport43,L1,5911,2018-01-03,2017-12-03,538,City4,City39,2018-01-03 17:00:00.0,2018-01-03 19:37:00.0,2018-01-03 23:00:00.0,2018-01-04 01:37:00.0,2018,1,3,2017,12,3,1649,322


In [27]:
test_merged_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,departure_yr,departure_mo,departure_dt,yr,mo,dt
0,Airport4,Airport43,L2,7465,2019-01-01,2018-11-27,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,11,27
1,Airport4,Airport43,L2,7465,2019-01-01,2018-11-28,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,11,28
2,Airport4,Airport43,L2,7465,2019-01-01,2018-12-07,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,12,7
3,Airport4,Airport43,L2,7465,2019-01-01,2018-12-05,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,12,5
4,Airport4,Airport43,L2,7465,2019-01-01,2018-12-01,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0,2019,1,1,2018,12,1


In [28]:
train_df['flt_departure_dt']=train_df['flt_departure_dt'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))

train_df['observation_date']=train_df['observation_date'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))

train_df['flt_departure_local_time']=train_df['flt_departure_local_time'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

train_df['flt_arrival_local_time']=train_df['flt_arrival_local_time'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

train_df['flt_departure_gmt']=train_df['flt_departure_gmt'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
                                                                                
train_df['flt_arrival_gmt']=train_df['flt_arrival_gmt'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
                                                                                  

In [29]:
# test dataset

test_df['flt_departure_dt']=test_df['flt_departure_dt'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))

test_df['observation_date']=test_df['observation_date'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))

test_df['flt_departure_local_time']=test_df['flt_departure_local_time'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

test_df['flt_arrival_local_time']=test_df['flt_arrival_local_time'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))

test_df['flt_departure_gmt']=test_df['flt_departure_gmt'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
                                                                                
test_df['flt_arrival_gmt']=test_df['flt_arrival_gmt'].apply(lambda x :datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
                                                                                  

In [30]:
train_df['flight_travel_duration'] = train_df['flt_arrival_gmt']-train_df['flt_departure_gmt']

In [31]:
train_df['flight_travel_duration'] = train_df['flight_travel_duration'].apply(lambda x: x.seconds/3600)

In [32]:
test_df['flight_travel_duration'] = test_df['flt_arrival_gmt']-test_df['flt_departure_gmt']
test_df['flight_travel_duration'] = test_df['flight_travel_duration'].apply(lambda x: x.seconds/3600)

In [33]:
train_df.tail()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt,departure_yr,departure_mo,departure_dt,yr,mo,dt,scaled_demand,scaled_share,flight_travel_duration
1492090,Airport43,Airport4,L2,7092,2018-12-16,2018-12-04,844,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,4,1618,306,2.766667
1492091,Airport43,Airport4,L2,7092,2018-12-16,2018-12-08,1202,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,8,1618,306,2.766667
1492092,Airport43,Airport4,L2,7092,2018-12-16,2018-12-06,844,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,6,1618,306,2.766667
1492093,Airport43,Airport4,L2,7092,2018-12-16,2018-12-05,844,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,5,1618,306,2.766667
1492094,Airport43,Airport4,L2,7092,2018-12-16,2018-12-07,1202,City39,City4,2018-12-16 20:25:00,2018-12-16 23:11:00,2018-12-17 02:25:00,2018-12-17 05:11:00,2018,12,16,2018,12,7,1618,306,2.766667


In [34]:
len(train_df.columns)

22

In [35]:
train_df['flt_departure_dayofyear'] = train_df.flt_departure_dt.dt.dayofyear
train_df['flt_departure_weekday'] = train_df.flt_departure_dt.dt.weekday
train_df['flt_departure_week'] = train_df.flt_departure_dt.dt.week
train_df['flt_departure_quarter'] = train_df.flt_departure_dt.dt.quarter

train_df['observation_date_dayofyear'] = train_df.observation_date.dt.dayofyear
train_df['observation_date_weekday'] = train_df.observation_date.dt.weekday
train_df['observation_date_quarter'] = train_df.observation_date.dt.quarter
train_df['observation_date_week'] = train_df.observation_date.dt.week

In [36]:
test_df['flt_departure_dayofyear'] = test_df.flt_departure_dt.dt.dayofyear
test_df['flt_departure_weekday'] = test_df.flt_departure_dt.dt.weekday
test_df['flt_departure_week'] = test_df.flt_departure_dt.dt.week
test_df['flt_departure_quarter'] = test_df.flt_departure_dt.dt.quarter

test_df['observation_date_dayofyear'] = test_df.observation_date.dt.dayofyear
test_df['observation_date_weekday'] = test_df.observation_date.dt.weekday
test_df['observation_date_week'] = test_df.observation_date.dt.week
test_df['observation_date_quarter'] = test_df.observation_date.dt.quarter

In [37]:
def get_part_of_day(hour):
    return (
    "morning" if 5 <= hour <= 11
    else
    "afternoon" if 12 <= hour <= 17
    else
    "evening" if 18 <= hour <= 22
    else
    "night"
        )

In [38]:
train_df['_flt_departure_local_time'] = train_df.flt_departure_local_time.apply(lambda x: get_part_of_day(x.hour))
train_df['_flt_arrival_local_time'] = train_df.flt_arrival_local_time.apply(lambda x: get_part_of_day(x.hour))
train_df['_flt_departure_gmt'] = train_df.flt_departure_gmt.apply(lambda x: get_part_of_day(x.hour))
train_df['_flt_arrival_gmt'] = train_df.flt_arrival_gmt.apply(lambda x: get_part_of_day(x.hour))

In [39]:
# test dataset

test_df['_flt_departure_local_time'] = test_df.flt_departure_local_time.apply(lambda x: get_part_of_day(x.hour))
test_df['_flt_arrival_local_time'] = test_df.flt_arrival_local_time.apply(lambda x: get_part_of_day(x.hour))
test_df['_flt_departure_gmt'] = test_df.flt_departure_gmt.apply(lambda x: get_part_of_day(x.hour))
test_df['_flt_arrival_gmt'] = test_df.flt_arrival_gmt.apply(lambda x: get_part_of_day(x.hour))

In [40]:
cols_to_drop = ['flt_departure_dt','observation_date','flt_departure_local_time','flt_arrival_local_time',
                'flt_departure_gmt','flt_arrival_gmt']

In [41]:
[train_df.drop(col,1,inplace=True) for col in cols_to_drop]

[None, None, None, None, None, None]

In [42]:
[test_df.drop(col,1,inplace=True) for col in cols_to_drop]

[None, None, None, None, None, None]

In [43]:
train_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,total_fare,origin_city,destination_city,departure_yr,departure_mo,departure_dt,yr,mo,dt,scaled_demand,scaled_share,flight_travel_duration,flt_departure_dayofyear,flt_departure_weekday,flt_departure_week,flt_departure_quarter,observation_date_dayofyear,observation_date_weekday,observation_date_quarter,observation_date_week,_flt_departure_local_time,_flt_arrival_local_time,_flt_departure_gmt,_flt_arrival_gmt
0,Airport4,Airport43,L1,5911,538,City4,City39,2018,1,3,2017,12,12,1649,322,2.616667,3,2,1,1,346,1,4,50,afternoon,evening,night,night
1,Airport4,Airport43,L1,5911,651,City4,City39,2018,1,3,2017,12,23,1649,322,2.616667,3,2,1,1,357,5,4,51,afternoon,evening,night,night
2,Airport4,Airport43,L1,5911,538,City4,City39,2018,1,3,2017,12,2,1649,322,2.616667,3,2,1,1,336,5,4,48,afternoon,evening,night,night
3,Airport4,Airport43,L1,5911,538,City4,City39,2018,1,3,2017,12,6,1649,322,2.616667,3,2,1,1,340,2,4,49,afternoon,evening,night,night
4,Airport4,Airport43,L1,5911,538,City4,City39,2018,1,3,2017,12,3,1649,322,2.616667,3,2,1,1,337,6,4,48,afternoon,evening,night,night


In [44]:
target = train_df['total_fare']
train_df.drop('total_fare',1,inplace=True)

In [45]:
X = pd.get_dummies(train_df, prefix_sep='_', drop_first=True)

In [46]:
X.head()

Unnamed: 0,flt_num,departure_yr,departure_mo,departure_dt,yr,mo,dt,scaled_demand,scaled_share,flight_travel_duration,flt_departure_dayofyear,flt_departure_weekday,flt_departure_week,flt_departure_quarter,observation_date_dayofyear,observation_date_weekday,observation_date_quarter,observation_date_week,origin_Airport20,origin_Airport26,origin_Airport30,origin_Airport31,origin_Airport4,origin_Airport43,origin_Airport60,destination_Airport20,destination_Airport26,destination_Airport30,destination_Airport31,destination_Airport4,destination_Airport43,destination_Airport60,carrier_L2,carrier_L3,origin_city_City19,origin_city_City24,origin_city_City27,origin_city_City28,origin_city_City39,origin_city_City4,origin_city_City56,destination_city_City19,destination_city_City24,destination_city_City27,destination_city_City28,destination_city_City39,destination_city_City4,destination_city_City56,_flt_departure_local_time_evening,_flt_departure_local_time_morning,_flt_departure_local_time_night,_flt_arrival_local_time_evening,_flt_arrival_local_time_morning,_flt_arrival_local_time_night,_flt_departure_gmt_evening,_flt_departure_gmt_morning,_flt_departure_gmt_night,_flt_arrival_gmt_evening,_flt_arrival_gmt_morning,_flt_arrival_gmt_night
0,5911,2018,1,3,2017,12,12,1649,322,2.616667,3,2,1,1,346,1,4,50,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1
1,5911,2018,1,3,2017,12,23,1649,322,2.616667,3,2,1,1,357,5,4,51,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1
2,5911,2018,1,3,2017,12,2,1649,322,2.616667,3,2,1,1,336,5,4,48,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1
3,5911,2018,1,3,2017,12,6,1649,322,2.616667,3,2,1,1,340,2,4,49,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1
4,5911,2018,1,3,2017,12,3,1649,322,2.616667,3,2,1,1,337,6,4,48,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1


In [47]:
X_ = pd.get_dummies(test_df, prefix_sep='_', drop_first=True)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.1, random_state=42)

In [49]:
import lightgbm as lgb
lgb_model = lgb.Booster(model_file='light_gbm.txt')
lgb_pred = lgb_model.predict(X_test)

In [50]:
import pickle
with open('randomforest_basemodel.pkl', 'rb') as f:
    rf = pickle.load(f)

In [51]:
rf_pred = rf.predict(X_test)

In [52]:
from catboost import CatBoostRegressor
cb_model = CatBoostRegressor()      # parameters not required.
cb_model.load_model('cb_best')

<catboost.core.CatBoostRegressor at 0x1318d5c50>

In [53]:
cb_pred = cb_model.predict(X_test)

In [54]:
'''
Finding optimal weights for model to get min RMSE on test set
'''

from itertools import permutations 
perm = permutations([0.4,0.4,0.2]) 
for i in list(perm):
    y = i[0]*rf_pred+i[1]*lgb_pred+i[2]*cb_pred
    print(i)
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y)))
    print('\n')


(0.4, 0.4, 0.2)
Root Mean Squared Error: 91.0538645626682


(0.4, 0.2, 0.4)
Root Mean Squared Error: 91.2079353458314


(0.4, 0.4, 0.2)
Root Mean Squared Error: 91.0538645626682


(0.4, 0.2, 0.4)
Root Mean Squared Error: 91.2079353458314


(0.2, 0.4, 0.4)
Root Mean Squared Error: 90.54036487181907


(0.2, 0.4, 0.4)
Root Mean Squared Error: 90.54036487181907




In [56]:
perm = permutations([0.5,0.3,0.2]) 
for i in list(perm):
    y = i[0]*rf_pred+i[1]*lgb_pred+i[2]*cb_pred
    print(i)
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y)))
    print('\n')



(0.5, 0.3, 0.2)
Root Mean Squared Error: 92.44639343307023


(0.5, 0.2, 0.3)
Root Mean Squared Error: 92.48577920017516


(0.3, 0.5, 0.2)
Root Mean Squared Error: 90.66354093525123


(0.3, 0.2, 0.5)
Root Mean Squared Error: 91.00697898090715


(0.2, 0.5, 0.3)
Root Mean Squared Error: 90.65196358695214


(0.2, 0.3, 0.5)
Root Mean Squared Error: 90.95541421488804




In [57]:
def ensemble(test):
    return 0.2*rf.predict(test)+0.4*lgb_model.predict(test)+0.4*cb_model.predict(test)
    
y_ensmbl = ensemble(X_test)

In [58]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_ensmbl))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_ensmbl))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_ensmbl)))

Mean Absolute Error: 46.89670755257803
Mean Squared Error: 8197.557671122127
Root Mean Squared Error: 90.54036487181907


In [59]:
target_y_test = ensemble(X_)

In [60]:
X_.shape , target_y_test.shape

((20397, 60), (20397,))

In [123]:
test_df.shape

(20397, 28)

In [78]:
test_merged_df.head()

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,origin_city,destination_city,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt
0,Airport4,Airport43,L2,7465,2019-01-01,2018-11-27,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0
1,Airport4,Airport43,L2,7465,2019-01-01,2018-11-28,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0
2,Airport4,Airport43,L2,7465,2019-01-01,2018-12-07,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0
3,Airport4,Airport43,L2,7465,2019-01-01,2018-12-05,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0
4,Airport4,Airport43,L2,7465,2019-01-01,2018-12-01,City4,City39,2019-01-01 08:04:00.0,2019-01-01 10:49:00.0,2019-01-01 14:04:00.0,2019-01-01 16:49:00.0


In [79]:
test_merged_df['total_fare']=target_y_test

In [80]:
test_merged_df.to_csv('test_fares_data_subir_verma.csv',index=False)

In [86]:
max(target),max(target_y_test)

(6354, 4725.621308840489)

In [87]:
np.mean(target),np.mean(target_y_test)

(734.6453275428173, 1711.2662028536988)

In [88]:
min(target),min(target_y_test)

(258, 401.0735701061491)