# Data Prep & Model 2
In this notebook we will build on the data from Model 1

In this model we will not use any one hot encoded variables, but ordinal variables instead

In [20]:
# Import sys so we can import custom packages without error
import sys
sys.path.append('../')

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.modules.data_preprocessing import time_of_day, feature_categorizer, make_regions

import warnings
warnings.filterwarnings('ignore')

In [2]:
flights = pd.read_csv('../data/flights_M1.csv')

In [3]:
flights.head(2)

Unnamed: 0,mkt_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,arr_delay,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region
0,WN,SAT,BNA,705,915,-2.0,130.0,1.0,822.0,cloudy,cloudy,2018,7,10,0,0,West-South Central,East-South Central
1,UA,ORD,MCO,800,1148,-9.0,168.0,1.0,1005.0,rain,rain,2019,10,10,0,0,East-North Central,South-Atlantic


# Split the data into Train and Test
there are a few features we want to create that should only be created from the test data

In [50]:
X = flights.drop(['arr_delay'], axis = 1)
y = flights['arr_delay']

In [51]:
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [52]:
X_train_M2 = X_train

## Create Features based on only train data

### Create mean delay by airport feature to use in place of origin airport

In [4]:
mean_delay_orig_airport = flights[['origin', 'arr_delay']].groupby(by = 'origin').mean().reset_index()

In [5]:
#rename aggregate column
mean_delay_orig_airport.rename({'arr_delay':'mean_arr_delay_orig_airport'},axis = 1, inplace= True)

In [55]:
#merge this mean value to flights_M1 with origin as key
X_train_M2 = pd.merge(X_train_M2, mean_delay_orig_airport, how = 'left', on = ['origin'])

In [6]:
#mean delay for dest airport
mean_delay_dest_airport = flights[['dest', 'arr_delay']].groupby(by = 'dest').mean().reset_index()
mean_delay_dest_airport.rename({'arr_delay':'mean_arr_delay_dest_airport'},axis = 1, inplace= True)

In [57]:
#merge this mean value to flights_M1 with dest as key
X_train_M2 = pd.merge(X_train_M2, mean_delay_dest_airport, how = 'left', on = ['dest'])

In [7]:
#export these features to use on test data
mean_delay_orig_airport.to_csv('data/Features/mean_delay_orig_airport.csv', header=True, index=False)

In [8]:
mean_delay_dest_airport.to_csv('data/Features/mean_delay_dest_airport.csv', header=True, index=False)

### Create mean delay by carrier feature

In [9]:
#mean delay for carrier
mean_delay_carrier = flights[['mkt_unique_carrier', 'arr_delay']].groupby(by = 'mkt_unique_carrier').mean().reset_index()

In [10]:
mean_delay_carrier.rename({'arr_delay':'mean_arr_delay_carrier'},axis = 1, inplace= True)

In [11]:
#merge this mean value to flights_M1 with mkt unique carrier as key
X_train_M2 = pd.merge(X_train_M2, mean_delay_carrier, how = 'left', on = ['mkt_unique_carrier'])

NameError: name 'X_train_M2' is not defined

In [None]:
X_train_M2.head(1)

In [62]:
#now we can drop these categorical feature
X_train_M2.drop(['mkt_unique_carrier', 'origin', 'dest'], axis = 1, inplace = True)

In [12]:
mean_delay_carrier.to_csv('data/Features/mean_delay_carrier.csv', header=True, index=False)

### Weather & Region categorical features to ordinal int
In model 1, this feature was one-hot encoded

In [63]:
X_train_M2.dtypes    

crs_dep_time                     int64
crs_arr_time                     int64
crs_elapsed_time               float64
flights                        float64
distance                       float64
orig_weather_categ              object
dest_weather_categ              object
year                             int64
month                            int64
day                              int64
dep_time_of_day                  int64
arr_time_of_day                  int64
orig_region                     object
dest_region                     object
mean_arr_delay_orig_airport    float64
mean_arr_delay_dest_airport    float64
mean_arr_delay_carrier         float64
dtype: object

In [64]:
# turn weather into ordinal
feature_categorizer(X_train_M2[['orig_weather_categ', 'dest_weather_categ']], X_train_M2)

In [65]:
feature_categorizer(X_train_M2[['orig_region', 'dest_region']], X_train_M2)

In [66]:
X_train_M2.head(1)

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport,mean_arr_delay_carrier
0,801,850,109.0,1.0,533.0,0,0,2018,12,16,0,0,0,0,-1.652778,7.98615,8.249349


# Apply same process to test set

keep only the features we would have 1 week before the flight:

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance'],
      dtype='object')

In [67]:
X_test_M2 = X_test

In [68]:
X_test_M2.head(2)

Unnamed: 0,mkt_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region
6252,UA,COS,DEN,738,840,62.0,1.0,73.0,sunny,sunny,2019,12,25,0,0,South-West,South-West
4684,F9,HSV,MCO,1917,2207,110.0,1.0,535.0,rain,sunny,2019,10,22,3,1,East-South Central,South-Atlantic


### Assign mean delay by airport feature from train data set

In [69]:
#merge this mean value to test set with origin as key
X_test_M2 = pd.merge(X_test_M2, mean_delay_orig_airport, how = 'left', on = ['origin'])

In [70]:
mean_delay_dest_airport.head(2)

Unnamed: 0,dest,mean_arr_delay_dest_airport
0,ABE,20.5
1,ABI,9.6


In [71]:
#merge this mean value to test set with dest as key
X_test_M2 = pd.merge(X_test_M2, mean_delay_dest_airport, how = 'left', on = ['dest'])

In [72]:
X_test_M2.head(2)

Unnamed: 0,mkt_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport
0,UA,COS,DEN,738,840,62.0,1.0,73.0,sunny,sunny,2019,12,25,0,0,South-West,South-West,1.5625,7.98615
1,F9,HSV,MCO,1917,2207,110.0,1.0,535.0,rain,sunny,2019,10,22,3,1,East-South Central,South-Atlantic,8.142857,6.542373


### Assign mean delay by carrier feature (created from train set)

In [73]:
#merge this mean value to test set with mkt unique carrier as key
X_test_M2 = pd.merge(X_test_M2, mean_delay_carrier, how = 'left', on = ['mkt_unique_carrier'])

In [74]:
X_test_M2.head(1)

Unnamed: 0,mkt_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport,mean_arr_delay_carrier
0,UA,COS,DEN,738,840,62.0,1.0,73.0,sunny,sunny,2019,12,25,0,0,South-West,South-West,1.5625,7.98615,8.249349


In [75]:
#now we can drop the 'mkt_unique_carrier' categorical feature
X_test_M2.drop(['mkt_unique_carrier', 'origin', 'dest'], axis = 1, inplace = True)

In [76]:
X_test_M2.head(1)

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport,mean_arr_delay_carrier
0,738,840,62.0,1.0,73.0,sunny,sunny,2019,12,25,0,0,South-West,South-West,1.5625,7.98615,8.249349


In [77]:
X_test_M2.dtypes

crs_dep_time                     int64
crs_arr_time                     int64
crs_elapsed_time               float64
flights                        float64
distance                       float64
orig_weather_categ              object
dest_weather_categ              object
year                             int64
month                            int64
day                              int64
dep_time_of_day                  int64
arr_time_of_day                  int64
orig_region                     object
dest_region                     object
mean_arr_delay_orig_airport    float64
mean_arr_delay_dest_airport    float64
mean_arr_delay_carrier         float64
dtype: object

In [78]:
X_test_M2.head(2)

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport,mean_arr_delay_carrier
0,738,840,62.0,1.0,73.0,sunny,sunny,2019,12,25,0,0,South-West,South-West,1.5625,7.98615,8.249349
1,1917,2207,110.0,1.0,535.0,rain,sunny,2019,10,22,3,1,East-South Central,South-Atlantic,8.142857,6.542373,16.312883


### Convert weather & region categories to ordinal variables

In [79]:
# turn weather into ordinal
feature_categorizer(X_test_M2[['orig_weather_categ', 'dest_weather_categ']],X_test_M2)

In [80]:
feature_categorizer(X_test_M2[['orig_region', 'dest_region']], X_test_M2)

In [81]:
X_test_M2.head(1)

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport,mean_arr_delay_carrier
0,738,840,62.0,1.0,73.0,0,0,2019,12,25,0,0,0,0,1.5625,7.98615,8.249349


## Export Model 2 as csv

In [None]:
#BRANDON RUN THESE PLZZZ

In [84]:
X_train_M2.to_csv('../data/X_train_M2.csv', header=True, index=False)

In [85]:
X_test_M2.to_csv('../data/X_test_M2.csv', header=True, index=False)

In [86]:
y_train.to_csv('../data/y_train_M2.csv', header=True, index=False)

In [87]:
y_test.to_csv('../data/y_test_M2.csv', header=True, index=False)

In [40]:
X_train_M2.shape

(8000, 17)

In [88]:
X_test_M2.shape

(2000, 17)

In [89]:
X_train_M2.columns

Index(['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance', 'orig_weather_categ', 'dest_weather_categ', 'year', 'month',
       'day', 'dep_time_of_day', 'arr_time_of_day', 'orig_region',
       'dest_region', 'mean_arr_delay_orig_airport',
       'mean_arr_delay_dest_airport', 'mean_arr_delay_carrier'],
      dtype='object')

In [90]:
X_test_M2.columns

Index(['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance', 'orig_weather_categ', 'dest_weather_categ', 'year', 'month',
       'day', 'dep_time_of_day', 'arr_time_of_day', 'orig_region',
       'dest_region', 'mean_arr_delay_orig_airport',
       'mean_arr_delay_dest_airport', 'mean_arr_delay_carrier'],
      dtype='object')

### Run Model 2

In [91]:
from src.modules.data_preprocessing import regression

In [92]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

In [93]:
regressor_list = [LinearRegression, SVR, RandomForestRegressor]

In [94]:
regression(X_train_M2, X_test_M2, y_train, y_test, regressor_list)

Regressor: <class 'sklearn.linear_model._base.LinearRegression'> 
 MSE = 2423.013414368402 
 R2 = -0.03145121931070882 
 ----------------------------
Regressor: <class 'sklearn.svm._classes.SVR'> 
 MSE = 2396.197858828824 
 R2 = -0.02003612053586301 
 ----------------------------
Regressor: <class 'sklearn.ensemble._forest.RandomForestRegressor'> 
 MSE = 2603.2027809500005 
 R2 = -0.10815592955510533 
 ----------------------------
