# Data Prep & Flight Prediction
In this notebook, we apply the pre-processing and Model 3 to the test data.

In [8]:
import pandas as pd
import datetime as dt
from sklearn.model_selection import train_test_split
from src.modules.data_preprocessing import time_of_day, feature_categorizer, make_regions

import warnings
warnings.filterwarnings('ignore')

In [39]:
# Load the test-set flights; the path is relative to the current working directory.
flights_test = pd.read_csv('data/flights_test.csv')

In [40]:
flights_test.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,...,SFO,"San Francisco, CA",1810,1945,N,95,1,363,sunny,cloudy
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,...,SFO,"San Francisco, CA",1150,1320,N,90,1,363,sunny,cloudy


## Clean Features
Note: The weather "forecast" has already been added to the DataFrame; this was done in the Flights_test_weather notebook.

In [41]:
pd.isnull(flights_test).sum()

fl_date                 0
mkt_unique_carrier      0
branded_code_share      0
mkt_carrier             0
mkt_carrier_fl_num      0
op_unique_carrier       0
tail_num              124
op_carrier_fl_num       0
origin_airport_id       0
origin                  0
origin_city_name        0
dest_airport_id         0
dest                    0
dest_city_name          0
crs_dep_time            0
crs_arr_time            0
dup                     0
crs_elapsed_time        0
flights                 0
distance                0
orig_weather_categ    851
dest_weather_categ    852
dtype: int64

In [42]:
# Replace every missing value with 0, modifying the frame in place.
# NOTE(review): the only columns with gaps (tail_num, orig_weather_categ,
# dest_weather_categ) hold text, so the weather columns end up mixing the
# integer 0 with category strings - confirm the downstream encoder expects that.
flights_test.fillna(0, inplace = True)

In [43]:
# Remove the carrier / flight-number / airport-id columns and the 'dup' flag in place.
flights_test.drop(
    columns=[
        'branded_code_share',
        'mkt_carrier_fl_num',
        'mkt_carrier',
        'op_unique_carrier',
        'tail_num',
        'op_carrier_fl_num',
        'origin_airport_id',
        'dest_airport_id',
        'dup',
    ],
    inplace=True,
)

In [44]:
flights_test.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,origin,origin_city_name,dest,dest_city_name,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ
0,2020-01-01,WN,ONT,"Ontario, CA",SFO,"San Francisco, CA",1810,1945,95,1,363,sunny,cloudy
1,2020-01-01,WN,ONT,"Ontario, CA",SFO,"San Francisco, CA",1150,1320,90,1,363,sunny,cloudy


### Add time of day features

In [45]:
# Build one time-of-day label per flight for departure and for arrival.
# time_of_day is called once per row (axis=1); `method` picks departure vs arrival
# (presumably based on crs_dep_time / crs_arr_time - confirm in src.modules.data_preprocessing).
flights_test['dep_time_of_day'] = flights_test.apply(
    lambda row: time_of_day(row, method='dep'),
    axis=1,
)
flights_test['arr_time_of_day'] = flights_test.apply(
    lambda row: time_of_day(row, method='arr'),
    axis=1,
)

In [46]:
# Encode the two time-of-day columns as integers.
# The return value is discarded, yet the preview that follows shows both columns
# as ints, so the helper appears to modify flights_test (the second argument)
# in place - confirm in src.modules.data_preprocessing.
feature_categorizer(flights_test[['dep_time_of_day','arr_time_of_day']], flights_test)

In [48]:
flights_test.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,origin,origin_city_name,dest,dest_city_name,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,dep_time_of_day,arr_time_of_day
0,2020-01-01,WN,ONT,"Ontario, CA",SFO,"San Francisco, CA",1810,1945,95,1,363,sunny,cloudy,0,0
1,2020-01-01,WN,ONT,"Ontario, CA",SFO,"San Francisco, CA",1150,1320,90,1,363,sunny,cloudy,1,1


### Split the flight date into year, month, and day

In [49]:
flights_test.dtypes

fl_date               object
mkt_unique_carrier    object
origin                object
origin_city_name      object
dest                  object
dest_city_name        object
crs_dep_time           int64
crs_arr_time           int64
crs_elapsed_time       int64
flights                int64
distance               int64
orig_weather_categ    object
dest_weather_categ    object
dep_time_of_day        int64
arr_time_of_day        int64
dtype: object

In [50]:
# fl_date is loaded as plain strings (dtype object); parse it to datetimes so the
# .dt accessor can be used on it.
flights_test['fl_date'] = pd.to_datetime(flights_test['fl_date'])

In [51]:
# Break the parsed flight date into separate year / month / day columns.
flight_dates = flights_test['fl_date'].dt
flights_test['year'] = flight_dates.year
flights_test['month'] = flight_dates.month
flights_test['day'] = flight_dates.day

In [52]:
# The date now lives in the year / month / day columns, so remove the original column in place.
flights_test.drop(columns=['fl_date'], inplace=True)

In [53]:
flights_test.head(1)

Unnamed: 0,mkt_unique_carrier,origin,origin_city_name,dest,dest_city_name,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,dep_time_of_day,arr_time_of_day,year,month,day
0,WN,ONT,"Ontario, CA",SFO,"San Francisco, CA",1810,1945,95,1,363,sunny,cloudy,0,0,2020,1,1


## Add Regions

In [54]:
def o_state_creator(row):
    """Return the state code taken from a row's ``origin_city_name``.

    The value is expected to look like ``"City, ST"``. It is split on ``", "``
    and the second piece is returned as a ``str``. An ``IndexError`` is raised
    when the separator is absent.
    """
    name_parts = row["origin_city_name"].split(', ')
    return str(name_parts[1])
def d_state_creator(row):
    """Return the state code taken from a row's ``dest_city_name``.

    The value is expected to look like ``"City, ST"``. It is split on ``", "``
    and the second piece is returned as a ``str``. An ``IndexError`` is raised
    when the separator is absent.
    """
    name_parts = row["dest_city_name"].split(', ')
    return str(name_parts[1])

In [55]:
# Derive the origin state code for every row of the frame (one call per row).
flights_test["orig_state_id"] = flights_test.apply(o_state_creator, axis="columns")

In [56]:
# Derive the destination state code for every row of the frame (one call per row).
flights_test["dest_state_id"] = flights_test.apply(d_state_creator, axis="columns")

In [57]:
flights_test.head(1)

Unnamed: 0,mkt_unique_carrier,origin,origin_city_name,dest,dest_city_name,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,dep_time_of_day,arr_time_of_day,year,month,day,orig_state_id,dest_state_id
0,WN,ONT,"Ontario, CA",SFO,"San Francisco, CA",1810,1945,95,1,363,sunny,cloudy,0,0,2020,1,1,CA,CA


In [58]:
# Map each state id column to a region.
# Nothing is assigned here, yet the later preview shows new orig_region /
# dest_region columns, so make_regions appears to add them to flights_test
# in place - confirm in src.modules.data_preprocessing.
make_regions(flights_test, 'orig_state_id')
make_regions(flights_test, 'dest_state_id')

In [59]:
# Remove the city-name and state-id columns in place.
flights_test.drop(
    columns=[
        'origin_city_name',
        'dest_city_name',
        'orig_state_id',
        'dest_state_id',
    ],
    inplace=True,
)

In [60]:
flights_test.head(1)

Unnamed: 0,mkt_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,dep_time_of_day,arr_time_of_day,year,month,day,orig_region,dest_region
0,WN,ONT,SFO,1810,1945,95,1,363,sunny,cloudy,0,0,2020,1,1,Pacific,Pacific


## Apply "Historical Data" Features
These features are based on the 2018-2019 training data

### Apply mean-delay-by-airport features to use in place of the origin and destination airports

In [61]:
# Load the per-airport mean arrival delay lookup tables, one keyed by origin
# airport and one keyed by destination airport.
mean_delay_orig_airport = pd.read_csv('data/Features/mean_delay_orig_airport.csv')
mean_delay_dest_airport = pd.read_csv('data/Features/mean_delay_dest_airport.csv')

In [62]:
# Attach the origin airport's mean arrival delay to every flight, keyed on 'origin'.
# The left join keeps all test flights; airports missing from the lookup table get NaN.
# validate='many_to_one' makes pandas raise a MergeError if the lookup table holds
# duplicate 'origin' keys, which would otherwise silently duplicate flight rows.
flights_test = pd.merge(
    flights_test,
    mean_delay_orig_airport,
    how='left',
    on=['origin'],
    validate='many_to_one',
)

In [63]:
# Attach the destination airport's mean arrival delay to every flight, keyed on 'dest'.
# The left join keeps all test flights; airports missing from the lookup table get NaN.
# validate='many_to_one' makes pandas raise a MergeError if the lookup table holds
# duplicate 'dest' keys, which would otherwise silently duplicate flight rows.
flights_test = pd.merge(
    flights_test,
    mean_delay_dest_airport,
    how='left',
    on=['dest'],
    validate='many_to_one',
)

In [64]:
flights_test.head(1)

Unnamed: 0,mkt_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,dep_time_of_day,arr_time_of_day,year,month,day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport
0,WN,ONT,SFO,1810,1945,95,1,363,sunny,cloudy,0,0,2020,1,1,Pacific,Pacific,-3.666667,5.147465


### Apply mean delay by carrier feature

In [65]:
# Load the per-carrier mean arrival delay lookup table.
mean_delay_carrier = pd.read_csv('data/Features/mean_delay_carrier.csv')

In [66]:
# Attach the carrier's mean arrival delay to every flight in flights_test, keyed on
# 'mkt_unique_carrier'. The left join keeps all test flights; carriers missing from
# the lookup table get NaN.
# validate='many_to_one' makes pandas raise a MergeError if the lookup table holds
# duplicate carrier keys, which would otherwise silently duplicate flight rows.
flights_test = pd.merge(
    flights_test,
    mean_delay_carrier,
    how='left',
    on=['mkt_unique_carrier'],
    validate='many_to_one',
)

In [67]:
flights_test.head(1)

Unnamed: 0,mkt_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,dep_time_of_day,arr_time_of_day,year,month,day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport,mean_arr_delay_carrier
0,WN,ONT,SFO,1810,1945,95,1,363,sunny,cloudy,0,0,2020,1,1,Pacific,Pacific,-3.666667,5.147465,3.940227


In [68]:
# The carrier and airport codes have been replaced by their mean-delay features,
# so these categorical columns can now be dropped in place.
flights_test.drop(
    columns=['mkt_unique_carrier', 'origin', 'dest'],
    inplace=True,
)

### Convert weather & region categorical features to ordinal integers

In [69]:
flights_test.dtypes

crs_dep_time                     int64
crs_arr_time                     int64
crs_elapsed_time                 int64
flights                          int64
distance                         int64
orig_weather_categ              object
dest_weather_categ              object
dep_time_of_day                  int64
arr_time_of_day                  int64
year                             int64
month                            int64
day                              int64
orig_region                     object
dest_region                     object
mean_arr_delay_orig_airport    float64
mean_arr_delay_dest_airport    float64
mean_arr_delay_carrier         float64
dtype: object

In [70]:
# Encode the weather and region categorical columns as integers, one
# origin/destination pair of columns per call.
# NOTE(review): in the previews, row 0 has 'sunny' -> 0 in orig_weather_categ and
# 'cloudy' -> 0 in dest_weather_categ, so the codes look column-specific - confirm
# they match the encoding used when the model was trained.
feature_categorizer(flights_test[['orig_weather_categ', 'dest_weather_categ']], flights_test)
feature_categorizer(flights_test[['orig_region', 'dest_region']], flights_test)

In [71]:
flights_test.head(1)

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,dep_time_of_day,arr_time_of_day,year,month,day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport,mean_arr_delay_carrier
0,1810,1945,95,1,363,0,0,0,0,2020,1,1,0,0,-3.666667,5.147465,3.940227


## Export df as csv to use in prediction model

In [72]:
# Write the prepared feature frame to CSV with a header row and without the index column.
flights_test.to_csv('data/flights_test_M3.csv', header=True, index=False)

In [73]:
flights_test.columns

Index(['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance', 'orig_weather_categ', 'dest_weather_categ',
       'dep_time_of_day', 'arr_time_of_day', 'year', 'month', 'day',
       'orig_region', 'dest_region', 'mean_arr_delay_orig_airport',
       'mean_arr_delay_dest_airport', 'mean_arr_delay_carrier'],
      dtype='object')

In [75]:
flights_test.shape

(150623, 17)