# Import Flight Datasets & Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Let pandas display up to 120 columns so the wide, enriched frames are not truncated.
pd.set_option("display.max_columns", 120)

In [2]:
# Read the three cleaned flight datasets: the sampled training set, the
# sampled testing set, and the separate test file.
flights_sample_train, flights_sample_test, flights_test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/Cleaned-flights_sample_training.csv',
        '../../data/raw/Cleaned-flights_sample_testing.csv',
        '../../data/raw/Cleaned-flights_test.csv',
    )
]

Let's keep only the first 7 days of January 2020 in the test set, as requested.

In [3]:
# Shape (rows, columns) of the training sample as loaded, before any enrichment.
flights_sample_train.shape

(150117, 27)

In [4]:
# Restrict the test set to flights dated 1-7 January 2020.
in_first_week_of_jan_2020 = (
    flights_test['Flight Year'].eq(2020)
    & flights_test['Flight Month'].eq(1)
    & flights_test['Flight Day'].le(7)
)
flights_test = flights_test[in_first_week_of_jan_2020]

# Add enrichment data

## Weather

In [5]:
# Weather enrichment table; it is joined to the flights below on
# Year / Month / Day / iata_code.
enr_weather = pd.read_csv("../../data/processed/flights_enrichment_weather.csv")

Add the weather of the departure airports

In [6]:
# Left-join the weather table onto every flight set, matching the flight date
# and the ORIGIN airport code against the weather table's date/airport keys.
origin_weather_left_keys = ['Flight Year', 'Flight Month', 'Flight Day', 'Origin Airport (IATA Code)']
origin_weather_right_keys = ['Year', 'Month', 'Day', 'iata_code']

flights_sample_train, flights_sample_test, flights_test = [
    frame.merge(
        enr_weather,
        how='left',
        left_on=origin_weather_left_keys,
        right_on=origin_weather_right_keys,
    )
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]


Add the weather of the destination airports

In [7]:
# Second weather join, this time on the DESTINATION airport. Columns that now
# exist on both sides are disambiguated with _origin (already present) and
# _destination (newly joined).
destination_weather_left_keys = ['Flight Year', 'Flight Month', 'Flight Day', 'Destination Airport (IATA Code)']
destination_weather_right_keys = ['Year', 'Month', 'Day', 'iata_code']

flights_sample_train, flights_sample_test, flights_test = [
    frame.merge(
        enr_weather,
        how='left',
        left_on=destination_weather_left_keys,
        right_on=destination_weather_right_keys,
        suffixes=('_origin', '_destination'),
    )
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]

In [8]:
# The weather table's own date/airport/station columns were only needed for
# the two joins above; remove both suffixed copies from every flight set.
weather_join_leftovers = [
    'Year_origin', 'Month_origin', 'Day_origin', 'iata_code_origin', 'Station_origin',
    'Year_destination', 'Month_destination', 'Day_destination', 'iata_code_destination', 'Station_destination',
]

flights_sample_train, flights_sample_test, flights_test = [
    frame.drop(weather_join_leftovers, axis=1)
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]

## Additional airport details

In [9]:
# Load the airport reference table, give its columns readable names, discard
# the fields that are not used, and keep only rows that carry an IATA code
# (the key used for the joins that follow).
airport_column_names = {"type":"Airport Type", "iata_code": "Airport (IATA Code)", "latitude_deg": "Latitude", "longitude_deg": "Longitude", "local_region": "Region", "country_name": "Country", "elevation_ft": "Elevation (ft)"}

enr_airport = (
    pd.read_csv("../../data/processed/flights_enrichment_airportLocation.csv")
    .rename(columns=airport_column_names)
    .drop(columns=["local_code", "name", "Region", "Country"])
    .dropna(subset=["Airport (IATA Code)"])
)

Add to departure airports

In [10]:
# Attach the airport details of each flight's ORIGIN airport.
flights_sample_train, flights_sample_test, flights_test = [
    frame.merge(
        enr_airport,
        how='left',
        left_on='Origin Airport (IATA Code)',
        right_on='Airport (IATA Code)',
    )
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]

Add to destination airports

In [11]:
# Attach the airport details of each flight's DESTINATION airport. The airport
# columns already present from the origin join receive pandas' default _x
# suffix, and the newly joined ones receive _y.
flights_sample_train, flights_sample_test, flights_test = [
    frame.merge(
        enr_airport,
        how='left',
        left_on='Destination Airport (IATA Code)',
        right_on='Airport (IATA Code)',
    )
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]

Some final cleanup:

In [12]:
# Of the joined airport details only the airport type is kept: drop the join
# keys, coordinates and elevation for both sides, then relabel the remaining
# _x/_y airport-type columns as origin/destination.
airport_join_leftovers = ['Airport (IATA Code)_x', 'Airport (IATA Code)_y', 'Latitude_x', 'Latitude_y', 'Longitude_x', 'Longitude_y', 'Elevation (ft)_x', 'Elevation (ft)_y']
airport_type_names = {'Airport Type_x': 'Airport Type_origin', 'Airport Type_y': 'Airport Type_destination'}

flights_sample_train, flights_sample_test, flights_test = [
    frame.drop(airport_join_leftovers, axis=1).rename(columns=airport_type_names)
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]

## Airport Busyness Score

In [13]:
# Busyness enrichment table; it is joined to the flights below through its 'ID' column.
busyness_scores = pd.read_csv('../../data/processed/flights_enrichment_busyness_score.csv')

Create an ID column (flight date + airport code) for both the departure and the arrival airport of each flight

In [14]:
# Build the "<year>-<month>-<day>-<airport>" keys used to look up the busyness
# score of the origin airport (OriginID) and the destination airport (ArrivalID).
for frame in (flights_sample_train, flights_sample_test, flights_test):
    flight_date_key = (
        frame['Flight Year'].astype(str) + "-" + frame['Flight Month'].astype(str) + "-" + frame['Flight Day'].astype(str)
    )
    frame['OriginID'] = flight_date_key + "-" + frame['Origin Airport (IATA Code)'].astype(str)
    frame['ArrivalID'] = flight_date_key + "-" + frame['Destination Airport (IATA Code)'].astype(str)

Add the data to the flights

In [15]:
# Join the busyness table twice per flight set, first for the origin airport
# and then for the destination airport, and remove the two copies of the
# busyness table's 'ID' key that the joins leave behind.
flights_sample_train, flights_sample_test, flights_test = [
    frame
    .merge(busyness_scores, how='left', left_on='OriginID', right_on='ID')
    .merge(busyness_scores, how='left', left_on='ArrivalID', right_on='ID')
    .drop(['ID_x', 'ID_y'], axis=1)
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]

Let's do a bit of cleaning

In [16]:
# Tidy up after the two busyness joins: remove the suffixed date/airport
# columns, relabel the busyness metrics (_x -> origin, _y -> destination),
# and discard the temporary OriginID / ArrivalID join keys.
# NOTE(review): both the _x and _y copies of the date columns are dropped here,
# yet unsuffixed 'Flight Year' / 'Flight Month' / 'Flight Day' are selected
# later in the notebook. Presumably those come from the second busyness join;
# confirm they always match the original flight dates.
busyness_join_leftovers = ['Flight Year_y', 'Flight Month_y', 'Flight Day_y', 'Airport (IATA Code)_y', 'Flight Year_x', 'Flight Month_x', 'Flight Day_x', 'Airport (IATA Code)_x']
busyness_role_names = {'Flights Count_x': 'Flights Count_origin', 'Average Flights Per Day_x': 'Average Flights Per Day_origin', 'Busyness Score_x': 'Busyness Score_origin', 'Flights Count_y': 'Flights Count_destination', 'Average Flights Per Day_y': 'Average Flights Per Day_destination', 'Busyness Score_y': 'Busyness Score_destination'}

flights_sample_train, flights_sample_test, flights_test = [
    frame
    .drop(busyness_join_leftovers, axis=1)
    .rename(columns=busyness_role_names)
    .drop(['OriginID', 'ArrivalID'], axis=1)
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]


In [17]:
# Remove exact duplicate rows from the sampled test set.
# NOTE(review): only flights_sample_test is de-duplicated in this cell; confirm
# that the training sample and flights_test do not need the same treatment.
flights_sample_test = flights_sample_test.drop_duplicates()

## Avg Payload and passengers

In [18]:
# Load the per-route payload/passenger enrichment table, rename its raw
# columns to readable labels, and drop the leftover "Unnamed: 0" column.
passenger_column_names = {"averagepayload_lbs": "Average payload (lbs)", "availableseats": "Average number of available seats", "distanceinterval_x500mi": "Distance interval (x500mi)", "aircraftgroup": "Aircraft group", "aircrafttype": "Aircraft type", "aircraftconfiguration": "Aircraft configuration", "serviceclass": "Service class"}

passengers = (
    pd.read_csv("../../data/processed/flights_enrichment_avgpayload_passengers.csv", index_col=False)
    .rename(columns=passenger_column_names)
    .drop(columns=["Unnamed: 0"])
)

In [19]:
# A missing freight or mail proportion is replaced with 0.
for proportion_column in ("Proportion of freight to the payload", "Proportion of mail to the payload"):
    passengers[proportion_column] = passengers[proportion_column].fillna(0)

In [20]:
# Inspect the dtypes and non-null counts of the passengers enrichment table.
passengers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170839 entries, 0 to 170838
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   routeid                               170839 non-null  object 
 1   Proportion of freight to the payload  170839 non-null  float64
 2   Proportion of mail to the payload     170839 non-null  float64
 3   Proportion of filled seats            124271 non-null  float64
 4   Average payload (lbs)                 170221 non-null  float64
 5   Average number of available seats     144940 non-null  float64
 6   Distance interval (x500mi)            68999 non-null   float64
 7   Aircraft group                        170839 non-null  object 
 8   Aircraft type                         170839 non-null  object 
 9   Aircraft configuration                170839 non-null  object 
 10  Service class                         170839 non-null  object 
dtype

In [21]:
# Build the "<operator>-<origin>-<destination>" key on which the passengers
# enrichment table is joined.
for frame in (flights_sample_train, flights_sample_test, flights_test):
    frame['routeid'] = (
        frame['Operator - Unique Carrier Code'] + '-' + frame['Origin Airport (IATA Code)'] + '-' + frame['Destination Airport (IATA Code)']
    )

In [22]:
# Left-join the passengers/payload enrichment on routeid, then drop the
# temporary routeid key from every flight set.
flights_sample_train, flights_sample_test, flights_test = [
    frame
    .merge(passengers, on="routeid", how="left", suffixes=("_1", "_2"))
    .drop(columns=["routeid"])
    for frame in (flights_sample_train, flights_sample_test, flights_test)
]

## Reordering the columns

In [23]:
# Column order shared by all three flight sets: flight identity, aircraft and
# payload details, then the origin-airport block, then the destination-airport block.
flight_feature_columns = [
    'Marketer - Unique Carrier Code', 'Operator - Unique Carrier Code', 'Different Marketer & Operator Carrier Code',
    'Tail Number', 'Flight Number', 'Flight Year', 'Flight Month', 'Flight Day', 'Flight Weekday',
    'Aircraft group', 'Aircraft type', 'Aircraft configuration', 'Service class',
    'Proportion of freight to the payload', 'Proportion of mail to the payload', 'Proportion of filled seats',
    'Average payload (lbs)', 'Average number of available seats', 'Distance interval (x500mi)',
    'Distance (miles)', 'Scheduled Elapsed Time',
    'Origin Airport (IATA Code)', 'Airport Type_origin', 'Flights Count_origin',
    'Average Flights Per Day_origin', 'Busyness Score_origin',
    'Scheduled Departure Time (local time)', 'Scheduled hour of departure',
    'Precipitation (mm)_origin', 'Snowfall (mm)_origin', 'Maximum Temperature (*C)_origin',
    'Avg Pressure for the day (hPa)_origin', 'Avg Wind Speed (m/s)_origin', 'Avg Humidity (%)_origin',
    'Fog_origin', 'Thunder_origin', 'Smoke_or_Haze_origin',
    'Destination Airport (IATA Code)', 'Airport Type_destination', 'Flights Count_destination',
    'Average Flights Per Day_destination', 'Busyness Score_destination',
    'Scheduled Arrival Time (local time)', 'Scheduled hour of arrival',
    'Precipitation (mm)_destination', 'Snowfall (mm)_destination', 'Maximum Temperature (*C)_destination',
    'Avg Pressure for the day (hPa)_destination', 'Avg Wind Speed (m/s)_destination', 'Avg Humidity (%)_destination',
    'Fog_destination', 'Thunder_destination', 'Smoke_or_Haze_destination',
]

# Delay / cancellation / diversion columns. They are selected for the two
# sampled sets only; flights_test is reduced to the feature columns.
flight_outcome_columns = [
    'Departure Delay (minutes)', 'Arrival Delay (minutes)', 'Carrier Delay (minutes)',
    'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)',
    'Late Aircraft Delay (minutes)', 'cancelled', 'cancellation_code', 'diverted',
]

flights_sample_train = flights_sample_train[flight_feature_columns + flight_outcome_columns]

flights_sample_test = flights_sample_test[flight_feature_columns + flight_outcome_columns]

flights_test = flights_test[flight_feature_columns]


# Variable Transformation

In [24]:
# Summary statistics of the enriched training sample.
flights_sample_train.describe()

Unnamed: 0,Different Marketer & Operator Carrier Code,Flight Number,Flight Year,Flight Month,Flight Day,Flight Weekday,Proportion of freight to the payload,Proportion of mail to the payload,Proportion of filled seats,Average payload (lbs),Average number of available seats,Distance interval (x500mi),Distance (miles),Scheduled Elapsed Time,Flights Count_origin,Average Flights Per Day_origin,Busyness Score_origin,Scheduled hour of departure,Precipitation (mm)_origin,Snowfall (mm)_origin,Maximum Temperature (*C)_origin,Avg Pressure for the day (hPa)_origin,Avg Wind Speed (m/s)_origin,Avg Humidity (%)_origin,Fog_origin,Thunder_origin,Smoke_or_Haze_origin,Flights Count_destination,Average Flights Per Day_destination,Busyness Score_destination,Scheduled hour of arrival,Precipitation (mm)_destination,Snowfall (mm)_destination,Maximum Temperature (*C)_destination,Avg Pressure for the day (hPa)_destination,Avg Wind Speed (m/s)_destination,Avg Humidity (%)_destination,Fog_destination,Thunder_destination,Smoke_or_Haze_destination,Departure Delay (minutes),Arrival Delay (minutes),Carrier Delay (minutes),Weather Delay (minutes),National Air System Delay (minutes),Security Delay (minutes),Late Aircraft Delay (minutes),cancelled,cancellation_code,diverted
count,150117.0,150117.0,150117.0,150117.0,150117.0,150117.0,150117.0,150117.0,150116.0,150117.0,150117.0,148455.0,150117.0,150117.0,150117.0,150117.0,150117.0,150117.0,148307.0,138667.0,144081.0,121468.0,136359.0,121684.0,148366.0,148366.0,148366.0,150117.0,150117.0,150117.0,150117.0,148273.0,138375.0,143963.0,121203.0,135967.0,121399.0,148344.0,148344.0,148344.0,150117.0,150117.0,23423.0,23423.0,23423.0,23423.0,23423.0,150117.0,0.0,150117.0
mean,0.370724,2708.9231,2018.506179,6.60658,15.731956,2.946675,0.004215,0.002244,0.810872,30676.257107,119.097048,2.049557,774.88521,138.547187,797.369505,787.640805,1.01918,11.530879,6.267828,1.967476,21.820818,984.95216,3.711583,83.830627,0.373401,0.108886,0.141508,799.915939,790.248653,1.020055,11.517849,6.325825,2.012668,21.836662,985.200288,3.715122,83.863953,0.37229,0.10986,0.138853,3.283053,-2.124296,10.008325,1.062844,11.876532,0.051787,14.219741,0.0,,0.0
std,0.483,1858.359755,0.499963,3.404033,8.77602,2.000596,0.038174,0.006503,0.08371,13701.13056,47.361568,1.207517,592.001976,72.379495,652.097556,637.489201,0.142165,6.465143,14.021532,14.844437,10.483141,47.472245,1.615264,15.952721,0.557577,0.311497,0.348546,652.412505,638.041766,0.149952,6.651172,14.116894,15.131107,10.4875,47.382215,1.617427,15.962394,0.555469,0.312716,0.345794,17.689078,21.175551,16.621417,6.405749,16.560992,1.236181,20.26228,0.0,,0.0
min,0.0,1.0,2018.0,1.0,1.0,0.0,0.0,0.0,0.089862,7219.638889,30.0,1.0,31.0,20.0,1.0,0.436268,0.055477,0.0,0.0,0.0,-32.1,750.1,0.0,0.0,0.0,0.0,0.0,1.0,0.396846,0.048212,0.0,0.0,0.0,-31.7,752.5,0.0,11.0,0.0,0.0,0.0,-57.0,-80.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
25%,0.0,1112.0,2018.0,4.0,8.0,1.0,0.0,0.0,0.77059,17951.824851,71.916667,1.0,342.0,87.0,258.0,269.352168,0.964512,6.0,0.0,0.0,14.4,981.7,2.6,79.0,0.0,0.0,0.0,258.0,269.352168,0.965435,6.0,0.0,0.0,14.4,982.1,2.6,79.0,0.0,0.0,0.0,-6.0,-15.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
50%,0.0,2293.0,2019.0,7.0,16.0,3.0,0.000695,0.0,0.825695,34600.0,142.195652,2.0,606.0,120.0,702.0,773.09724,1.031445,11.0,0.3,0.0,23.3,999.7,3.4,89.0,0.0,0.0,0.0,710.0,773.09724,1.031647,12.0,0.3,0.0,23.3,999.7,3.4,89.0,0.0,0.0,0.0,-3.0,-7.0,0.0,0.0,3.0,0.0,0.0,0.0,,0.0
75%,1.0,4268.0,2019.0,10.0,23.0,5.0,0.003818,0.000215,0.868874,38266.666667,154.978797,3.0,1009.0,169.0,1076.0,1037.202365,1.084083,17.0,5.8,0.0,30.0,1013.5,4.6,94.0,1.0,0.0,0.0,1079.0,1037.202365,1.08427,17.0,5.8,0.0,30.0,1013.5,4.6,94.0,1.0,0.0,0.0,4.0,5.0,15.0,0.0,19.0,0.0,23.0,0.0,,0.0
max,1.0,9391.0,2019.0,12.0,31.0,6.0,3.61958,0.184502,0.972009,125000.0,344.0,11.0,5095.0,1567.0,2533.0,2159.176084,6.652098,23.0,602.0,599.0,53.9,1040.6,14.7,100.0,2.0,1.0,1.0,2533.0,2159.176084,7.559603,23.0,412.8,599.0,53.9,1040.6,17.0,100.0,2.0,1.0,1.0,134.0,86.0,86.0,86.0,86.0,76.0,86.0,0.0,,0.0


### Turn relevant object values into dummy variables

In [25]:
## Flight_sample_train
# For each categorical column, duplicate it under a "...2" name and one-hot
# encode the duplicate. The original column is kept and the dummy columns are
# appended as "<column>2_<value>".
train_dummy_sources = [
    ('Operator - Unique Carrier Code', 'Operator - Unique Carrier Code2'),
    ('Aircraft group', 'Aircraft group2'),
    ('Aircraft type', 'Aircraft type2'),
    ('Aircraft configuration', 'Aircraft configuration2'),
    ('Service class', 'Service class2'),
    ('Airport Type_origin', 'Airport Type_origin2'),
    ('Airport Type_destination', 'Airport Type_destination2'),
]

for source_column, copy_column in train_dummy_sources:
    flights_sample_train[copy_column] = flights_sample_train[source_column]
    flights_sample_train = pd.get_dummies(flights_sample_train, columns=[copy_column], drop_first=False)

In [26]:
## Flight_sample_test
# For each categorical column, duplicate it under a "...2" name and one-hot
# encode the duplicate. The original column is kept and the dummy columns are
# appended as "<column>2_<value>".
# NOTE(review): the dummy columns are derived from this frame's own category
# values, so they may not line up with the training frame's dummy columns.
# Confirm that downstream code aligns the two.
sample_test_dummy_sources = [
    ('Operator - Unique Carrier Code', 'Operator - Unique Carrier Code2'),
    ('Aircraft group', 'Aircraft group2'),
    ('Aircraft type', 'Aircraft type2'),
    ('Aircraft configuration', 'Aircraft configuration2'),
    ('Service class', 'Service class2'),
    ('Airport Type_origin', 'Airport Type_origin2'),
    ('Airport Type_destination', 'Airport Type_destination2'),
]

for source_column, copy_column in sample_test_dummy_sources:
    flights_sample_test[copy_column] = flights_sample_test[source_column]
    flights_sample_test = pd.get_dummies(flights_sample_test, columns=[copy_column], drop_first=False)

In [27]:
## Flight_test
# For each categorical column, duplicate it under a "...2" name and one-hot
# encode the duplicate. The original column is kept and the dummy columns are
# appended as "<column>2_<value>".
# NOTE(review): the dummy columns are derived from this frame's own category
# values, so they may not line up with the training frame's dummy columns.
# Confirm that downstream code aligns the two.
flights_test_dummy_sources = [
    ('Operator - Unique Carrier Code', 'Operator - Unique Carrier Code2'),
    ('Aircraft group', 'Aircraft group2'),
    ('Aircraft type', 'Aircraft type2'),
    ('Aircraft configuration', 'Aircraft configuration2'),
    ('Service class', 'Service class2'),
    ('Airport Type_origin', 'Airport Type_origin2'),
    ('Airport Type_destination', 'Airport Type_destination2'),
]

for source_column, copy_column in flights_test_dummy_sources:
    flights_test[copy_column] = flights_test[source_column]
    flights_test = pd.get_dummies(flights_test, columns=[copy_column], drop_first=False)

### Scale

### Handling NaNs (To be completed)

In [28]:
#dataCleaning(flights_sample, code=True, tips=False, orientation=False, formatIssues=False, missingValues=True, duplicateValues=False, outliers=False)

# Export to CSV

In [29]:
# Write each enriched flight set to its own CSV file, without the index column.
enriched_exports = (
    (flights_sample_train, '../../data/processed/Enriched-flights_sample_train.csv'),
    (flights_sample_test, '../../data/processed/Enriched-flights_sample_test.csv'),
    (flights_test, '../../data/processed/Enriched-flights_test.csv'),
)

for enriched_frame, export_path in enriched_exports:
    enriched_frame.to_csv(export_path, index=False)

In [30]:
# Shape (rows, columns) of the training sample after enrichment and dummy encoding.
flights_sample_train.shape

(150117, 141)

# Playing around

In [31]:
def getOLS(x,y,Option = 0):
    """
    Fit an ordinary least squares regression of ``y`` on ``x`` with statsmodels.

    An intercept column is added to ``x`` with ``sm.add_constant`` before the
    model is fitted.

    Parameters
    ----------
    x : predictors. Recommended form: ``df.drop(['y_column'], axis = 1)``,
        i.e. start from the entire frame minus the response and drop further
        columns from there as needed.
    y : response. Recommended form: ``df['y_column']``.
    Option : int, default 0
        ``0`` prints the regression summary. Any other value fits the model
        without printing anything.

    Returns
    -------
    The fitted results object produced by ``sm.OLS(y, x).fit()``, so callers
    can inspect or reuse the fit. Existing callers that ignore the return
    value are unaffected.
    """
    # statsmodels is only needed here, so it is imported locally.
    import statsmodels.api as sm

    design_matrix = sm.add_constant(x)
    model = sm.OLS(y, design_matrix).fit()

    if Option == 0:
        print(model.summary())

    return model

In [32]:
def runPCA (data, componentsNumber):
    """
    Scale ``data`` and return its projection onto principal components.

    The input is first passed through ``sklearn.preprocessing.scale`` with
    ``with_std=True``. A PCA is then fitted on the scaled data and the same
    scaled data is transformed with it.

    Parameters
    ----------
    data : the data to reduce; handed to ``scale`` unchanged.
    componentsNumber : forwarded as ``n_components`` to ``sklearn``'s ``PCA``.

    Returns
    -------
    The output of ``PCA.transform`` on the scaled data.
    """
    from sklearn.preprocessing import scale
    from sklearn.decomposition import PCA

    # Standard scaling; a min/max scaler would be an alternative.
    scaled_data = scale(data, with_std=True)

    reducer = PCA(n_components=componentsNumber)
    return reducer.fit(scaled_data).transform(scaled_data)

In [33]:
# Predictors: everything in the training frame except identifiers, the raw
# categorical columns, a few schedule/distance columns, and all delay /
# cancellation / diversion columns. Missing values are filled with 0.
columns_excluded_from_X = [
    'Marketer - Unique Carrier Code', 'Operator - Unique Carrier Code', 'Tail Number', 'Flight Number',
    'Aircraft group', 'Aircraft type', 'Aircraft configuration', 'Service class', 'Distance (miles)',
    'Origin Airport (IATA Code)', 'Airport Type_origin', 'Scheduled Departure Time (local time)',
    'Destination Airport (IATA Code)', 'Airport Type_destination', 'Scheduled Arrival Time (local time)',
    'Departure Delay (minutes)', 'Arrival Delay (minutes)', 'Carrier Delay (minutes)',
    'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)',
    'Late Aircraft Delay (minutes)', 'cancelled', 'cancellation_code', 'diverted',
]
X = flights_sample_train.drop(columns=columns_excluded_from_X).fillna(0)

# Response: arrival delay in minutes, with missing values filled with 0.
y = flights_sample_train['Arrival Delay (minutes)'].fillna(0)


getOLS(X, y)
#getOLS(runPCA(X,.99),y)


                               OLS Regression Results                              
Dep. Variable:     Arrival Delay (minutes)   R-squared:                       0.055
Model:                                 OLS   Adj. R-squared:                  0.055
Method:                      Least Squares   F-statistic:                     84.39
Date:                     Fri, 02 Dec 2022   Prob (F-statistic):               0.00
Time:                             04:51:48   Log-Likelihood:            -6.6702e+05
No. Observations:                   150117   AIC:                         1.334e+06
Df Residuals:                       150012   BIC:                         1.335e+06
Df Model:                              104                                         
Covariance Type:                 nonrobust                                         
                                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------