# Import Flight Datasets & Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_columns", 120)

In [2]:
# Read the three cleaned flight tables in one pass: the sampled training
# split, the sampled testing split, and the separate test file.
flights_sample_train, flights_sample_test, flights_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/Cleaned-flights_sample_training.csv',
        '../../data/raw/Cleaned-flights_sample_testing.csv',
        '../../data/raw/Cleaned-flights_test.csv',
    )
)

Let's keep only the first 7 days of January 2020 for the test set, as requested.

In [3]:
# Restrict the test set to flights dated 1-7 January 2020.
in_first_week_of_jan_2020 = (
    flights_test['Flight Year'].eq(2020)
    & flights_test['Flight Month'].eq(1)
    & flights_test['Flight Day'].le(7)
)
flights_test = flights_test[in_first_week_of_jan_2020]

# STEP 3 - Feature Engineering

## 3.1 - Importing enrichment data

### 3.1a: Import weather info

In [4]:
# Weather enrichment table; it is joined to the flights on its
# Year / Month / Day / iata_code columns in the following cells.
weather_csv_path = "../../data/processed/flights_enrichment_weather.csv"
enr_weather = pd.read_csv(weather_csv_path)

Add the weather of the departure airports

In [5]:
# Left-join the weather table onto every flight table, matching the flight
# date and origin airport against the weather table's date and IATA code.
origin_weather_join = {
    'how': 'left',
    'left_on': ['Flight Year', 'Flight Month', 'Flight Day', 'Origin Airport (IATA Code)'],
    'right_on': ['Year', 'Month', 'Day', 'iata_code'],
}
flights_sample_train = flights_sample_train.merge(enr_weather, **origin_weather_join)
flights_sample_test = flights_sample_test.merge(enr_weather, **origin_weather_join)
flights_test = flights_test.merge(enr_weather, **origin_weather_join)


Add the weather of the destination airports

In [6]:
# Second weather join, this time keyed on the destination airport. The weather
# columns already present from the origin join collide with the incoming ones,
# so the suffixes label the existing copies "_origin" and the new copies
# "_destination".
destination_weather_join = {
    'how': 'left',
    'left_on': ['Flight Year', 'Flight Month', 'Flight Day', 'Destination Airport (IATA Code)'],
    'right_on': ['Year', 'Month', 'Day', 'iata_code'],
    'suffixes': ('_origin', '_destination'),
}
flights_sample_train = flights_sample_train.merge(enr_weather, **destination_weather_join)
flights_sample_test = flights_sample_test.merge(enr_weather, **destination_weather_join)
flights_test = flights_test.merge(enr_weather, **destination_weather_join)

In [7]:
# The weather table's join keys and its Station column are no longer needed
# once both joins are done; remove the origin and destination copies of each.
weather_helper_columns = [
    base_name + side
    for side in ('_origin', '_destination')
    for base_name in ('Year', 'Month', 'Day', 'iata_code', 'Station')
]
flights_sample_test = flights_sample_test.drop(columns=weather_helper_columns)
flights_sample_train = flights_sample_train.drop(columns=weather_helper_columns)
flights_test = flights_test.drop(columns=weather_helper_columns)

### 3.1b: Import additional airport details

In [8]:
# Load the airport lookup table and tidy it in a single chain.
enr_airport = (
    pd.read_csv("../../data/processed/flights_enrichment_airportLocation.csv")
    # Give the raw columns readable names.
    .rename(columns={
        "type": "Airport Type",
        "iata_code": "Airport (IATA Code)",
        "latitude_deg": "Latitude",
        "longitude_deg": "Longitude",
        "local_region": "Region",
        "country_name": "Country",
        "elevation_ft": "Elevation (ft)",
    })
    # Drop irrelevant columns ("Region" and "Country" are the names assigned
    # by the rename just above).
    .drop(columns=["local_code", "name", "Region", "Country"])
    # Discard rows that have no IATA code.
    .dropna(subset=["Airport (IATA Code)"])
)

In [9]:
# Display the cleaned airport lookup table as the cell's output.
enr_airport

Unnamed: 0,Airport (IATA Code),Latitude,Longitude,Elevation (ft),Airport Type
0,LHR,51.470600,-0.461941,83.0,large_airport
1,LAX,33.942501,-118.407997,125.0,large_airport
2,ORD,41.978600,-87.904800,672.0,large_airport
3,JFK,40.639447,-73.779317,13.0,large_airport
4,ATL,33.636700,-84.428101,1026.0,large_airport
...,...,...,...,...,...
35564,MLV,-13.064900,142.453600,276.0,small_airport
35577,MEP,2.389148,103.874183,10.0,small_airport
35596,ETE,12.933000,36.167000,2650.0,small_airport
35687,MDO,59.449902,-146.307007,100.0,small_airport


Add to departure airports

In [10]:
# Left-join the airport attributes onto each flight table via the origin
# airport's IATA code.
origin_airport_join = {
    'how': 'left',
    'left_on': 'Origin Airport (IATA Code)',
    'right_on': 'Airport (IATA Code)',
}
flights_sample_train = flights_sample_train.merge(enr_airport, **origin_airport_join)
flights_sample_test = flights_sample_test.merge(enr_airport, **origin_airport_join)
flights_test = flights_test.merge(enr_airport, **origin_airport_join)

Add to destination airports

In [11]:
# Same join keyed on the destination airport. No suffixes are passed, so
# pandas' defaults apply to the colliding airport columns: "_x" marks the
# copies already in the flight table (origin) and "_y" the incoming ones
# (destination).
destination_airport_join = {
    'how': 'left',
    'left_on': 'Destination Airport (IATA Code)',
    'right_on': 'Airport (IATA Code)',
}
flights_sample_train = flights_sample_train.merge(enr_airport, **destination_airport_join)
flights_sample_test = flights_sample_test.merge(enr_airport, **destination_airport_join)
flights_test = flights_test.merge(enr_airport, **destination_airport_join)

Some final cleanup:

In [12]:
# Of the joined airport attributes only the airport type is kept: the IATA
# code, latitude, longitude and elevation copies are removed, and the two
# airport-type columns get explicit origin / destination names.
airport_columns_to_drop = [
    base_name + side
    for base_name in ('Airport (IATA Code)', 'Latitude', 'Longitude', 'Elevation (ft)')
    for side in ('_x', '_y')
]
airport_type_names = {
    'Airport Type_x': 'Airport Type_origin',
    'Airport Type_y': 'Airport Type_destination',
}

flights_sample_train = (
    flights_sample_train
    .drop(columns=airport_columns_to_drop)
    .rename(columns=airport_type_names)
)

flights_sample_test = (
    flights_sample_test
    .drop(columns=airport_columns_to_drop)
    .rename(columns=airport_type_names)
)

flights_test = (
    flights_test
    .drop(columns=airport_columns_to_drop)
    .rename(columns=airport_type_names)
)

### 3.1c: Add Airport Busyness Score

In [13]:
# Airport busyness enrichment; joined to the flights through its 'ID' column.
busyness_csv_path = '../../data/processed/flights_enrichment_busyness_score.csv'
busyness_scores = pd.read_csv(busyness_csv_path)

Create a date–airport ID column (year-month-day-IATA code) for the departure airport and for the arrival airport of each flight

In [14]:
def _daily_airport_id(frame, airport_column):
    """Return a "<year>-<month>-<day>-<airport>" string key for each row of ``frame``.

    ``airport_column`` names the column holding the airport code to append.
    """
    key = frame['Flight Year'].astype(str)
    for column in ('Flight Month', 'Flight Day', airport_column):
        key = key + "-" + frame[column].astype(str)
    return key


# Add both join keys to every flight table.
for frame in (flights_sample_train, flights_sample_test, flights_test):
    frame['OriginID'] = _daily_airport_id(frame, 'Origin Airport (IATA Code)')
    frame['ArrivalID'] = _daily_airport_id(frame, 'Destination Airport (IATA Code)')

Add the data to the flights

In [15]:
def _attach_busyness(frame):
    """Left-join ``busyness_scores`` onto ``frame`` twice and drop the 'ID' copies.

    The first join matches on 'OriginID', the second on 'ArrivalID'. No
    suffixes are passed, so colliding column names receive pandas' default
    "_x" / "_y" suffixes.
    """
    for id_column in ('OriginID', 'ArrivalID'):
        frame = frame.merge(busyness_scores, how='left', left_on=id_column, right_on='ID')
    return frame.drop(columns=['ID_x', 'ID_y'])


flights_sample_train = _attach_busyness(flights_sample_train)

flights_sample_test = _attach_busyness(flights_sample_test)

flights_test = _attach_busyness(flights_test)

Let's do a bit of cleaning

In [16]:
def _tidy_busyness_columns(frame):
    """Clean up a flight table after the two busyness-score merges.

    Keeps the flight's own date columns, removes the copies that came from
    ``busyness_scores``, gives the busyness metrics explicit origin /
    destination names, and drops the temporary join keys.

    Returns a new DataFrame; ``frame`` itself is not modified.
    """
    date_columns = ['Flight Year', 'Flight Month', 'Flight Day', 'Flight Weekday']

    # busyness_scores has its own date columns, so after the two merges each
    # date column exists three times:
    #   "<name>_x" - the flight's own value,
    #   "<name>_y" - copy from the origin busyness row,
    #   "<name>"   - copy from the destination busyness row (NaN when the
    #                destination had no match in the left join).
    # Dropping "_x" and "_y" would leave only the destination copy, so drop
    # both busyness copies and restore the flight's own values instead.
    busyness_date_copies = [name + '_y' for name in date_columns] + date_columns
    frame = frame.drop(
        columns=busyness_date_copies + ['Airport (IATA Code)_x', 'Airport (IATA Code)_y']
    )

    frame = frame.rename(columns={
        # restore the original flight date columns
        **{name + '_x': name for name in date_columns},
        # busyness metrics: _x came from the origin join, _y from the destination join
        'Flights Count_x': 'Flights Count_origin',
        'Average Flights Per Day_x': 'Average Flights Per Day_origin',
        'Busyness Score_x': 'Busyness Score_origin',
        'Flights Count_y': 'Flights Count_destination',
        'Average Flights Per Day_y': 'Average Flights Per Day_destination',
        'Busyness Score_y': 'Busyness Score_destination',
    })

    # The ID columns were only needed as join keys.
    return frame.drop(columns=['OriginID', 'ArrivalID'])


flights_sample_test = _tidy_busyness_columns(flights_sample_test)
flights_sample_train = _tidy_busyness_columns(flights_sample_train)
flights_test = _tidy_busyness_columns(flights_test)


### 3.1d: Add payload and passenger details

In [17]:
# Load the per-route payload / passenger enrichment and clean it in one chain.
passengers = (
    pd.read_csv("../../data/processed/flights_enrichment_avgpayload_passengers.csv", index_col=False)
    # Readable column names.
    .rename(columns={
        "averagepayload_lbs": "Average payload (lbs)",
        "availableseats": "Average number of available seats",
        "distanceinterval_x500mi": "Distance interval (x500mi)",
        "aircraftgroup": "Aircraft group",
        "aircrafttype": "Aircraft type",
        "aircraftconfiguration": "Aircraft configuration",
        "serviceclass": "Service class",
    })
    # Remove the unnamed column present in the CSV.
    .drop(columns=["Unnamed: 0"])
)

In [18]:
# Print the column names, non-null counts and dtypes of the enrichment table.
passengers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170839 entries, 0 to 170838
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   routeid                               170839 non-null  object 
 1   Proportion of freight to the payload  48469 non-null   float64
 2   Proportion of mail to the payload     11770 non-null   float64
 3   Proportion of filled seats            124271 non-null  float64
 4   Average payload (lbs)                 170221 non-null  float64
 5   Average number of available seats     144940 non-null  float64
 6   Distance interval (x500mi)            68999 non-null   float64
 7   Aircraft group                        170839 non-null  object 
 8   Aircraft type                         170839 non-null  object 
 9   Aircraft configuration                170839 non-null  object 
 10  Service class                         170839 non-null  object 
dtype

In [19]:
# Build the "<operator>-<origin>-<destination>" routeid key on every flight
# table; it is the join key for the passenger / payload enrichment.
for frame in (flights_sample_train, flights_sample_test, flights_test):
    frame['routeid'] = (
        frame['Operator - Unique Carrier Code']
        + '-'
        + frame['Origin Airport (IATA Code)']
        + '-'
        + frame['Destination Airport (IATA Code)']
    )

In [20]:
# Attach the route-level enrichment to each flight table, then discard the
# temporary routeid join key.
# NOTE(review): if `passengers` holds more than one row per routeid, this left
# join repeats the matching flight rows - confirm routeid is unique there.
passengers_join = {'on': "routeid", 'how': "left", 'suffixes': ("_1", "_2")}

flights_sample_train = (
    flights_sample_train
    .merge(passengers, **passengers_join)
    .drop(columns=["routeid"])
)
flights_sample_test = (
    flights_sample_test
    .merge(passengers, **passengers_join)
    .drop(columns=["routeid"])
)
flights_test = (
    flights_test
    .merge(passengers, **passengers_join)
    .drop(columns=["routeid"])
)

### Reordering the columns

In [21]:
# Column order shared by all three tables: flight identity and date, aircraft
# and payload enrichment, then the origin block and the destination block.
feature_columns = [
    'Marketer - Unique Carrier Code',
    'Operator - Unique Carrier Code',
    'Different Marketer & Operator Carrier Code',
    'Tail Number',
    'Flight Number',
    'Flight Year',
    'Flight Month',
    'Flight Day',
    'Flight Weekday',
    'Aircraft group',
    'Aircraft type',
    'Aircraft configuration',
    'Service class',
    'Proportion of freight to the payload',
    'Proportion of mail to the payload',
    'Proportion of filled seats',
    'Average payload (lbs)',
    'Average number of available seats',
    'Distance interval (x500mi)',
    'Distance (miles)',
    'Scheduled Elapsed Time',
    'Origin Airport (IATA Code)',
    'Airport Type_origin',
    'Flights Count_origin',
    'Average Flights Per Day_origin',
    'Busyness Score_origin',
    'Scheduled Departure Time (local time)',
    'Scheduled hour of departure',
    'Precipitation (mm)_origin',
    'Snowfall (mm)_origin',
    'Maximum Temperature (*C)_origin',
    'Avg Pressure for the day (hPa)_origin',
    'Avg Wind Speed (m/s)_origin',
    'Avg Humidity (%)_origin',
    'Fog_origin',
    'Thunder_origin',
    'Smoke_or_Haze_origin',
    'Destination Airport (IATA Code)',
    'Airport Type_destination',
    'Flights Count_destination',
    'Average Flights Per Day_destination',
    'Busyness Score_destination',
    'Scheduled Arrival Time (local time)',
    'Scheduled hour of arrival',
    'Precipitation (mm)_destination',
    'Snowfall (mm)_destination',
    'Maximum Temperature (*C)_destination',
    'Avg Pressure for the day (hPa)_destination',
    'Avg Wind Speed (m/s)_destination',
    'Avg Humidity (%)_destination',
    'Fog_destination',
    'Thunder_destination',
    'Smoke_or_Haze_destination',
]

# Delay / cancellation / diversion columns, selected only for the two sample
# tables below.
outcome_columns = [
    'Departure Delay (minutes)',
    'Arrival Delay (minutes)',
    'Carrier Delay (minutes)',
    'Weather Delay (minutes)',
    'National Air System Delay (minutes)',
    'Security Delay (minutes)',
    'Late Aircraft Delay (minutes)',
    'cancelled',
    'cancellation_code',
    'diverted',
]

flights_sample_train = flights_sample_train[feature_columns + outcome_columns]

flights_sample_test = flights_sample_test[feature_columns + outcome_columns]

# The test table is reduced to the feature columns only.
flights_test = flights_test[feature_columns]


## 3.2 - Variable Transformation

### Drop

### Bin

### Scale

### Dummy Variables

### Rename

### Handling NaNs (To be completed)

In [22]:
# TODO: NaN handling is not implemented yet. The call below is a disabled
# placeholder; neither `dataCleaning` nor `flights_sample` is defined in the
# cells shown in this notebook.
#dataCleaning(flights_sample, code=True, tips=False, orientation=False, formatIssues=False, missingValues=True, duplicateValues=False, outliers=False)

# Export to CSV

In [23]:
# Write the three enriched tables to CSV, without the index column.
for enriched_frame, output_path in (
    (flights_sample_train, '../../data/processed/Enriched-flights_sample_train.csv'),
    (flights_sample_test, '../../data/processed/Enriched-flights_sample_test.csv'),
    (flights_test, '../../data/processed/Enriched-flights_test.csv'),
):
    enriched_frame.to_csv(output_path, index=False)