In [155]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
import time
from sklearn.metrics import mean_squared_error

In [138]:
df = pd.read_csv('data/Merged_Table_1.csv')

In [139]:
df

Unnamed: 0.1,Unnamed: 0,index,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,origin_city_name,dest_city_name,crs_dep_time,dep_delay,taxi_out,...,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance,Trip_count_PerDay,Trip_count_Permonth,Trip_count_PerYear,origin_city_condn,dest_city_condn
0,0,0,2018-01-01,WN,5705,"Tampa, FL","Albany, NY",15,28.0,12.0,...,0,160,153.0,134.0,1130,1.001351,30.875000,61.750000,low_rainny,sunny
1,1,1,2018-01-01,WN,988,"New Orleans, LA","Houston, TX",6,-2.0,9.0,...,0,75,69.0,57.0,302,17.332432,534.416667,1068.833333,cloundy,sunny
2,2,2,2018-01-01,WN,1236,"New Orleans, LA","Orlando, FL",5,-1.0,8.0,...,0,100,86.0,70.0,551,5.255405,162.041667,324.083333,cloundy,low_rainny
3,3,3,2018-01-01,WN,233,"New Orleans, LA","Oakland, CA",15,16.0,8.0,...,0,290,283.0,271.0,1903,0.959459,29.583333,59.166667,cloundy,sunny
4,4,4,2018-01-01,WN,446,"New Orleans, LA","St. Louis, MO",18,3.0,6.0,...,0,105,125.0,83.0,604,1.981081,61.083333,122.166667,cloundy,sunny
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14595,14595,14595,2019-12-31,WN,5451,"New York, NY","Kansas City, MO",18,94.0,7.0,...,0,205,165.0,155.0,1107,5.506757,169.791667,339.583333,cloundy,sunny
14596,14596,14596,2019-12-31,WN,2038,"New York, NY","Chicago, IL",11,-4.0,13.0,...,0,160,165.0,108.0,725,54.166216,1670.125000,3340.250000,cloundy,low_snow
14597,14597,14597,2019-12-31,WN,3255,"New York, NY","New Orleans, LA",12,0.0,16.0,...,0,215,198.0,179.0,1183,8.470270,261.166667,522.333333,cloundy,sunny
14598,14598,14598,2019-12-31,WN,4880,"New York, NY","Tampa, FL",8,10.0,12.0,...,0,185,174.0,158.0,1010,12.725676,392.375000,784.750000,cloundy,sunny


In [140]:
columns = ['mkt_carrier_fl_num','dest_city_condn','origin_city_condn','Unnamed: 0','index','Trip_count_PerDay','Trip_count_Permonth','Trip_count_PerYear']

In [141]:
df.drop(columns=columns, inplace=True)

In [142]:
df.columns

Index(['fl_date', 'mkt_unique_carrier', 'origin_city_name', 'dest_city_name',
       'crs_dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on',
       'taxi_in', 'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time',
       'distance'],
      dtype='object')

In [143]:
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder,OneHotEncoder


In [144]:
le = OneHotEncoder(
    categories='auto',  # Categories per feature
    drop=None, # Whether to drop one of the features
    sparse=True, # Will return sparse matrix if set True
    handle_unknown='error' # Whether to raise an error 
)

In [145]:
cols = ['fl_date', 'mkt_unique_carrier','origin_city_name','dest_city_name']

In [146]:
class MultiColumnLabelEncoder:

    def __init__(self, columns=None):
        self.columns = columns # array of column names to encode


    def fit(self, X, y=None):
        self.encoders = {}
        columns = X.columns if self.columns is None else self.columns
        for col in columns:
            self.encoders[col] = LabelEncoder().fit(X[col])
        return self


    def transform(self, X):
        output = X.copy()
        columns = X.columns if self.columns is None else self.columns
        for col in columns:
            output[col] = self.encoders[col].transform(X[col])
        return output


    def fit_transform(self, X, y=None):
        return self.fit(X,y).transform(X)


    def inverse_transform(self, X):
        output = X.copy()
        columns = X.columns if self.columns is None else self.columns
        for col in columns:
            output[col] = self.encoders[col].inverse_transform(X[col])
        return output



In [147]:
df.head(5)

Unnamed: 0,fl_date,mkt_unique_carrier,origin_city_name,dest_city_name,crs_dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance
0,2018-01-01,WN,"Tampa, FL","Albany, NY",15,28.0,12.0,1610.0,1824.0,7.0,18,1831.0,21.0,0,0,160,153.0,134.0,1130
1,2018-01-01,WN,"New Orleans, LA","Houston, TX",6,-2.0,9.0,632.0,729.0,3.0,7,732.0,-8.0,0,0,75,69.0,57.0,302
2,2018-01-01,WN,"New Orleans, LA","Orlando, FL",5,-1.0,8.0,522.0,732.0,8.0,7,740.0,-15.0,0,0,100,86.0,70.0,551
3,2018-01-01,WN,"New Orleans, LA","Oakland, CA",15,16.0,8.0,1554.0,1825.0,4.0,18,1829.0,9.0,0,0,290,283.0,271.0,1903
4,2018-01-01,WN,"New Orleans, LA","St. Louis, MO",18,3.0,6.0,1814.0,1937.0,36.0,19,2013.0,23.0,0,0,105,125.0,83.0,604


In [148]:
multi = MultiColumnLabelEncoder(columns=['fl_date','mkt_unique_carrier','origin_city_name','dest_city_name'])

In [149]:
basemodel = multi.fit_transform(df)

In [150]:
basemodel.head(5)

Unnamed: 0,fl_date,mkt_unique_carrier,origin_city_name,dest_city_name,crs_dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance
0,0,9,269,4,15,28.0,12.0,1610.0,1824.0,7.0,18,1831.0,21.0,0,0,160,153.0,134.0,1130
1,0,9,194,125,6,-2.0,9.0,632.0,729.0,3.0,7,732.0,-8.0,0,0,75,69.0,57.0,302
2,0,9,194,210,5,-1.0,8.0,522.0,732.0,8.0,7,740.0,-15.0,0,0,100,86.0,70.0,551
3,0,9,194,205,15,16.0,8.0,1554.0,1825.0,4.0,18,1829.0,9.0,0,0,290,283.0,271.0,1903
4,0,9,194,274,18,3.0,6.0,1814.0,1937.0,36.0,19,2013.0,23.0,0,0,105,125.0,83.0,604


In [151]:
Sc = StandardScaler()
X = basemodel.drop(columns = ['arr_delay', 'dep_delay'])
y = basemodel['arr_delay']
X = Sc.fit_transform(X)

# # Splitting Data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.80,test_size=0.20, random_state=42)

In [152]:
# setting the base estimators as the base models and final estimator with a linear regression with 5 cross folds.
base_models = [
    ('Linear Regression',LinearRegression()),
    ('SVR',SVR()),
    ('Random Forest',RandomForestRegressor()),
    ]
stacked = StackingRegressor(
    estimators = base_models,
    final_estimator = LinearRegression(),
    cv = 5)

In [153]:
# Base Model 
for name, model in base_models:
    start_time = time.time()
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    end_time = time.time()
    
    r2 = model.score(X_test, y_test)
    print("-------{}-------".format(name))
    print("Coefficient of determination: {}".format(r2))
    print("Computation Time: {}".format(end_time - start_time))
    print("----------------------------------\n")
start_time = time.time()

-------Linear Regression-------
Coefficient of determination: 0.0644774873062447
Computation Time: 0.010107040405273438
----------------------------------

-------SVR-------
Coefficient of determination: 0.029225960440592225
Computation Time: 4.891067981719971
----------------------------------

-------Random Forest-------
Coefficient of determination: 0.5915243801714816
Computation Time: 8.039703130722046
----------------------------------



In [156]:
# Stacked Model 
start_time = time.time()
stacked.fit(X_train, y_train)    
stacked_prediction = stacked.predict(X_test)
end_time = time.time()
stacked_r2 = stacked.score(X_test, y_test)
stacked_rmse = mean_squared_error(y_test, stacked_prediction, squared = False)
print("-------Stacked Ensemble-------")
print("Coefficient of determination: {}".format(stacked_r2))
print("Computation Time: {}".format(end_time - start_time))
print("----------------------------------")

-------Stacked Ensemble-------
Coefficient of determination: 0.6158905532923278
Computation Time: 59.99668502807617
----------------------------------
