# Import Flight Datasets & Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show up to 120 columns when a DataFrame is displayed.
pd.options.display.max_columns = 120

In [2]:
# Load the three cleaned flight extracts: the sampled train/test splits and the
# separate flights_test file.
flights_sample_train, flights_sample_test, flights_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/Cleaned-flights_sample_training.csv',
        '../../data/raw/Cleaned-flights_sample_testing.csv',
        '../../data/raw/Cleaned-flights_test.csv',
    )
)

Let's keep only the first 7 days of January 2020 for the test set, as requested.

In [3]:
# Keep only flights dated 1-7 January 2020.
flights_test = flights_test.loc[
    lambda flights: flights['Flight Year'].eq(2020)
    & flights['Flight Month'].eq(1)
    & flights['Flight Day'].le(7)
]

# Add enrichment data

## Weather

In [4]:
# Weather enrichment table; the merges below key it on Year / Month / Day / iata_code.
enr_weather = pd.read_csv(filepath_or_buffer="../../data/processed/flights_enrichment_weather.csv")

Add the weather of the departure airports

In [5]:
# Attach the weather recorded at the departure airport on the flight date.
# Left join: every flight row is kept, with NaN weather where no record matches.
origin_flight_keys = ['Flight Year', 'Flight Month', 'Flight Day', 'Origin Airport (IATA Code)']
origin_weather_keys = ['Year', 'Month', 'Day', 'iata_code']

flights_sample_train, flights_sample_test, flights_test = (
    flights.merge(
        enr_weather,
        how='left',
        left_on=origin_flight_keys,
        right_on=origin_weather_keys,
    )
    for flights in (flights_sample_train, flights_sample_test, flights_test)
)


Add the weather of the destination airports

In [6]:
# Attach the weather recorded at the arrival airport on the flight date.
# Column names shared by both sides of this merge receive the
# '_origin' (left) / '_destination' (right) suffixes.
destination_flight_keys = ['Flight Year', 'Flight Month', 'Flight Day', 'Destination Airport (IATA Code)']
destination_weather_keys = ['Year', 'Month', 'Day', 'iata_code']

flights_sample_train, flights_sample_test, flights_test = (
    flights.merge(
        enr_weather,
        how='left',
        left_on=destination_flight_keys,
        right_on=destination_weather_keys,
        suffixes=('_origin', '_destination'),
    )
    for flights in (flights_sample_train, flights_sample_test, flights_test)
)

In [7]:
# The weather table's own date / airport / station columns were only needed for
# the joins; remove both the origin and destination copies.
weather_join_leftovers = [
    'Year_origin', 'Month_origin', 'Day_origin', 'iata_code_origin', 'Station_origin',
    'Year_destination', 'Month_destination', 'Day_destination', 'iata_code_destination', 'Station_destination',
]

flights_sample_test, flights_sample_train, flights_test = (
    flights.drop(columns=weather_join_leftovers)
    for flights in (flights_sample_test, flights_sample_train, flights_test)
)

## Additional airport details

In [8]:
# Load the airport-location enrichment and prepare it in one pass:
#   1. give the columns readable names,
#   2. discard the columns that are not used downstream,
#   3. discard airports without an IATA code (they cannot be joined to a flight).
enr_airport = (
    pd.read_csv("../../data/processed/flights_enrichment_airportLocation.csv")
    .rename(columns={
        "type": "Airport Type",
        "iata_code": "Airport (IATA Code)",
        "latitude_deg": "Latitude",
        "longitude_deg": "Longitude",
        "local_region": "Region",
        "country_name": "Country",
        "elevation_ft": "Elevation (ft)",
    })
    .drop(columns=["local_code", "name", "Region", "Country"])
    .dropna(subset=["Airport (IATA Code)"])
)

Add to departure airports

In [10]:
# Attach the airport details of the departure airport (left join keeps every flight).
flights_sample_train, flights_sample_test, flights_test = (
    pd.merge(
        flights,
        enr_airport,
        how='left',
        left_on='Origin Airport (IATA Code)',
        right_on='Airport (IATA Code)',
    )
    for flights in (flights_sample_train, flights_sample_test, flights_test)
)

Add to destination airports

In [11]:
# Attach the airport details of the arrival airport. Column names shared by both
# sides of this merge get pandas' default '_x' (left) / '_y' (right) suffixes.
flights_sample_train, flights_sample_test, flights_test = (
    flights.merge(
        enr_airport,
        how='left',
        left_on='Destination Airport (IATA Code)',
        right_on='Airport (IATA Code)',
    )
    for flights in (flights_sample_train, flights_sample_test, flights_test)
)

Some final cleanup:

In [12]:
# Of the airport details, only the airport type is kept: drop the join keys,
# coordinates and elevation, then relabel the '_x' / '_y' airport-type columns
# as origin / destination.
airport_columns_to_drop = [
    'Airport (IATA Code)_x', 'Airport (IATA Code)_y',
    'Latitude_x', 'Latitude_y',
    'Longitude_x', 'Longitude_y',
    'Elevation (ft)_x', 'Elevation (ft)_y',
]
airport_type_labels = {
    'Airport Type_x': 'Airport Type_origin',
    'Airport Type_y': 'Airport Type_destination',
}

flights_sample_train, flights_sample_test, flights_test = (
    flights.drop(columns=airport_columns_to_drop).rename(columns=airport_type_labels)
    for flights in (flights_sample_train, flights_sample_test, flights_test)
)

## Airport Busyness Score

In [13]:
# Airport busyness lookup; it is joined to the flights on its 'ID' column below.
busyness_scores = pd.read_csv(filepath_or_buffer='../../data/processed/flights_enrichment_busyness_score.csv')

Create an ID column for each departure / arrival airport

In [15]:
def _add_airport_day_ids(flights):
    """Add 'OriginID' and 'ArrivalID' columns to ``flights`` in place.

    Each ID is the string "<year>-<month>-<day>-<IATA code>", built from the
    flight date and the origin (resp. destination) airport code.
    """
    day_prefix = (
        flights['Flight Year'].astype(str) + "-"
        + flights['Flight Month'].astype(str) + "-"
        + flights['Flight Day'].astype(str) + "-"
    )
    flights['OriginID'] = day_prefix + flights['Origin Airport (IATA Code)'].astype(str)
    flights['ArrivalID'] = day_prefix + flights['Destination Airport (IATA Code)'].astype(str)


_add_airport_day_ids(flights_sample_train)
_add_airport_day_ids(flights_sample_test)
_add_airport_day_ids(flights_test)

Add the data to the flights

In [19]:
# Look up the busyness figures twice per flight: once for the departure airport
# and day (OriginID), once for the arrival airport and day (ArrivalID). The two
# copies of the lookup's 'ID' key are dropped straight away.
flights_sample_train, flights_sample_test, flights_test = (
    flights
    .merge(busyness_scores, how='left', left_on='OriginID', right_on='ID')
    .merge(busyness_scores, how='left', left_on='ArrivalID', right_on='ID')
    .drop(columns=['ID_x', 'ID_y'])
    for flights in (flights_sample_train, flights_sample_test, flights_test)
)

Let's do a bit of cleaning

In [22]:
def _tidy_after_busyness_merge(flights):
    """Remove the busyness lookup's helper columns and label its metrics by side.

    After the origin and destination busyness merges the frame holds three
    copies of each date column:

    * ``<col>_x`` - the flight's own value (suffixed by the first merge),
    * ``<col>_y`` - the copy brought in by the origin lookup,
    * ``<col>``   - the un-suffixed copy brought in by the destination lookup,
      which is NaN whenever the destination airport/day has no busyness row.

    The flight's own values are kept and restored to their original names;
    previously both suffixed copies were dropped, which silently left the
    destination lookup's copy standing in for the flight date.

    Returns a new DataFrame; ``flights`` itself is not modified.
    """
    date_cols = ['Flight Year', 'Flight Month', 'Flight Day']

    # Everything that came from the lookup table and is not a busyness metric.
    lookup_leftovers = (
        date_cols
        + [f'{col}_y' for col in date_cols]
        + ['Airport (IATA Code)_x', 'Airport (IATA Code)_y']
    )

    # '_x' metrics come from the origin merge, '_y' from the destination merge.
    metric_labels = {
        'Flights Count_x': 'Flights Count_origin',
        'Average Flights Per Day_x': 'Average Flights Per Day_origin',
        'Busyness Score_x': 'Busyness Score_origin',
        'Flights Count_y': 'Flights Count_destination',
        'Average Flights Per Day_y': 'Average Flights Per Day_destination',
        'Busyness Score_y': 'Busyness Score_destination',
    }

    return (
        flights
        .drop(columns=lookup_leftovers)
        # Restore the flight's own date columns to their original names.
        .rename(columns={f'{col}_x': col for col in date_cols})
        .rename(columns=metric_labels)
        # The join IDs have served their purpose.
        .drop(columns=['OriginID', 'ArrivalID'])
    )


flights_sample_test = _tidy_after_busyness_merge(flights_sample_test)
flights_sample_train = _tidy_after_busyness_merge(flights_sample_train)
flights_test = _tidy_after_busyness_merge(flights_test)


In [25]:
# Remove fully identical rows from the sampled test set, keeping the first occurrence.
# NOTE(review): only this frame is de-duplicated; flights_sample_train and
# flights_test are left as they are - confirm that this is intended.
flights_sample_test = flights_sample_test.drop_duplicates(keep='first')

## Avg Payload and passengers

In [28]:
# Load the per-route payload / passenger enrichment, give its columns readable
# names and discard the leftover "Unnamed: 0" column from the CSV.
passengers = (
    pd.read_csv("../../data/processed/flights_enrichment_avgpayload_passengers.csv", index_col=False)
    .rename(columns={
        "averagepayload_lbs": "Average payload (lbs)",
        "availableseats": "Average number of available seats",
        "distanceinterval_x500mi": "Distance interval (x500mi)",
        "aircraftgroup": "Aircraft group",
        "aircrafttype": "Aircraft type",
        "aircraftconfiguration": "Aircraft configuration",
        "serviceclass": "Service class",
    })
    .drop(columns=["Unnamed: 0"])
)

In [29]:
# Inspect column dtypes and non-null counts of the passengers enrichment table.
passengers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170839 entries, 0 to 170838
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   routeid                               170839 non-null  object 
 1   Proportion of freight to the payload  48469 non-null   float64
 2   Proportion of mail to the payload     11770 non-null   float64
 3   Proportion of filled seats            124271 non-null  float64
 4   Average payload (lbs)                 170221 non-null  float64
 5   Average number of available seats     144940 non-null  float64
 6   Distance interval (x500mi)            68999 non-null   float64
 7   Aircraft group                        170839 non-null  object 
 8   Aircraft type                         170839 non-null  object 
 9   Aircraft configuration                170839 non-null  object 
 10  Service class                         170839 non-null  object 
dtype

In [30]:
def _add_routeid(flights):
    """Add a 'routeid' column ("<operator>-<origin>-<destination>") in place.

    The column is the join key used to attach the passengers enrichment.
    """
    flights['routeid'] = (
        flights['Operator - Unique Carrier Code']
        + '-'
        + flights['Origin Airport (IATA Code)']
        + '-'
        + flights['Destination Airport (IATA Code)']
    )


_add_routeid(flights_sample_train)
_add_routeid(flights_sample_test)
_add_routeid(flights_test)

In [31]:
# Attach the payload / passenger figures by route (left join keeps every flight),
# then discard the temporary 'routeid' join key.
flights_sample_train, flights_sample_test, flights_test = (
    flights
    .merge(passengers, on="routeid", how="left", suffixes=("_1", "_2"))
    .drop(columns=["routeid"])
    for flights in (flights_sample_train, flights_sample_test, flights_test)
)

## Reordering the columns

In [32]:
# Columns shared by all three datasets, in their final order.
feature_columns = [
    # Flight identity and date
    'Marketer - Unique Carrier Code',
    'Operator - Unique Carrier Code',
    'Different Marketer & Operator Carrier Code',
    'Tail Number',
    'Flight Number',
    'Flight Year',
    'Flight Month',
    'Flight Day',
    'Flight Weekday',
    # Aircraft / payload enrichment
    'Aircraft group',
    'Aircraft type',
    'Aircraft configuration',
    'Service class',
    'Proportion of freight to the payload',
    'Proportion of mail to the payload',
    'Proportion of filled seats',
    'Average payload (lbs)',
    'Average number of available seats',
    'Distance interval (x500mi)',
    'Distance (miles)',
    'Scheduled Elapsed Time',
    # Origin airport
    'Origin Airport (IATA Code)',
    'Airport Type_origin',
    'Flights Count_origin',
    'Average Flights Per Day_origin',
    'Busyness Score_origin',
    'Scheduled Departure Time (local time)',
    'Scheduled hour of departure',
    'Precipitation (mm)_origin',
    'Snowfall (mm)_origin',
    'Maximum Temperature (*C)_origin',
    'Avg Pressure for the day (hPa)_origin',
    'Avg Wind Speed (m/s)_origin',
    'Avg Humidity (%)_origin',
    'Fog_origin',
    'Thunder_origin',
    'Smoke_or_Haze_origin',
    # Destination airport
    'Destination Airport (IATA Code)',
    'Airport Type_destination',
    'Flights Count_destination',
    'Average Flights Per Day_destination',
    'Busyness Score_destination',
    'Scheduled Arrival Time (local time)',
    'Scheduled hour of arrival',
    'Precipitation (mm)_destination',
    'Snowfall (mm)_destination',
    'Maximum Temperature (*C)_destination',
    'Avg Pressure for the day (hPa)_destination',
    'Avg Wind Speed (m/s)_destination',
    'Avg Humidity (%)_destination',
    'Fog_destination',
    'Thunder_destination',
    'Smoke_or_Haze_destination',
]

# Outcome columns, selected for the two sampled datasets only.
outcome_columns = [
    'Departure Delay (minutes)',
    'Arrival Delay (minutes)',
    'Carrier Delay (minutes)',
    'Weather Delay (minutes)',
    'National Air System Delay (minutes)',
    'Security Delay (minutes)',
    'Late Aircraft Delay (minutes)',
    'cancelled',
    'cancellation_code',
    'diverted',
]

flights_sample_train = flights_sample_train[feature_columns + outcome_columns]

flights_sample_test = flights_sample_test[feature_columns + outcome_columns]

# flights_test is reduced to the feature columns only.
flights_test = flights_test[feature_columns]


# Variable Transformation

In [33]:
# Summary statistics of the enriched training sample, as a starting point for the transformations below.
flights_sample_train.describe()

Unnamed: 0,Different Marketer & Operator Carrier Code,Flight Number,Flight Year,Flight Month,Flight Day,Flight Weekday,Proportion of freight to the payload,Proportion of mail to the payload,Proportion of filled seats,Average payload (lbs),Average number of available seats,Distance interval (x500mi),Distance (miles),Scheduled Elapsed Time,Flights Count_origin,Average Flights Per Day_origin,Busyness Score_origin,Scheduled hour of departure,Precipitation (mm)_origin,Snowfall (mm)_origin,Maximum Temperature (*C)_origin,Avg Pressure for the day (hPa)_origin,Avg Wind Speed (m/s)_origin,Avg Humidity (%)_origin,Fog_origin,Thunder_origin,Smoke_or_Haze_origin,Flights Count_destination,Average Flights Per Day_destination,Busyness Score_destination,Scheduled hour of arrival,Precipitation (mm)_destination,Snowfall (mm)_destination,Maximum Temperature (*C)_destination,Avg Pressure for the day (hPa)_destination,Avg Wind Speed (m/s)_destination,Avg Humidity (%)_destination,Fog_destination,Thunder_destination,Smoke_or_Haze_destination,Departure Delay (minutes),Arrival Delay (minutes),Carrier Delay (minutes),Weather Delay (minutes),National Air System Delay (minutes),Security Delay (minutes),Late Aircraft Delay (minutes),cancelled,cancellation_code,diverted
count,150186.0,150186.0,150186.0,150186.0,150186.0,150186.0,101830.0,48217.0,150185.0,150186.0,150186.0,148524.0,150186.0,150186.0,150186.0,150186.0,150186.0,150186.0,148376.0,138732.0,144150.0,121530.0,136428.0,121746.0,148435.0,16166.0,21003.0,150186.0,150186.0,150186.0,150186.0,148342.0,138436.0,144029.0,121256.0,136030.0,121452.0,148413.0,16309.0,20604.0,150186.0,150186.0,150186.0,150186.0,150186.0,150186.0,150186.0,150186.0,0.0,150186.0
mean,0.371013,2709.544791,2018.505946,6.605762,15.731766,2.946779,0.006213379,0.006985877,0.810856,30670.764031,119.073558,2.049278,774.746588,138.533698,797.402694,787.668325,1.019182,11.531661,6.26866,1.967325,21.820026,984.958483,3.711833,83.834089,0.373463,1.0,1.0,799.880202,790.210436,1.020055,11.518404,6.326539,2.013183,21.836741,985.204984,3.715255,83.866474,0.37236,1.0,1.0,3.281691,-2.125311,1.560898,0.165761,1.852969,0.008077,2.21771,0.0,,0.0
std,0.483078,1858.231741,0.499966,3.403742,8.776042,2.000475,0.04621574,0.009926675,0.083715,13700.551611,47.363779,1.207365,591.922235,72.369006,652.136016,637.514883,0.142159,6.465447,14.025136,14.84273,10.483108,47.46229,1.61532,15.950562,0.557608,0.0,0.0,652.452218,638.077602,0.149934,6.651541,14.11718,15.129469,10.487281,47.373724,1.617355,15.961175,0.555538,0.0,0.0,17.685426,21.172493,7.501426,2.55892,7.833571,0.488542,9.520809,0.0,,0.0
min,0.0,1.0,2018.0,1.0,1.0,0.0,5.797774e-07,3.771074e-07,0.089862,7219.638889,30.0,1.0,31.0,20.0,1.0,0.436268,0.055477,0.0,0.0,0.0,-32.1,750.1,0.0,0.0,0.0,1.0,1.0,1.0,0.396846,0.048212,0.0,0.0,0.0,-31.7,752.5,0.0,11.0,0.0,1.0,1.0,-57.0,-80.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
25%,0.0,1112.0,2018.0,4.0,8.0,1.0,0.0006474505,0.0003134704,0.770572,17951.824851,71.886886,1.0,342.0,87.0,258.0,269.352168,0.964512,6.0,0.0,0.0,14.4,981.7,2.6,79.0,0.0,1.0,1.0,257.25,269.352168,0.965435,6.0,0.0,0.0,14.4,982.1,2.6,79.0,0.0,1.0,1.0,-6.0,-15.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
50%,0.0,2294.0,2019.0,7.0,16.0,3.0,0.00211876,0.00317993,0.825655,34600.0,142.189654,2.0,606.0,120.0,702.0,773.09724,1.031501,11.0,0.3,0.0,23.3,999.7,3.4,89.0,0.0,1.0,1.0,709.5,773.09724,1.031647,12.0,0.3,0.0,23.3,999.7,3.4,89.0,0.0,1.0,1.0,-3.0,-7.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
75%,1.0,4268.75,2019.0,10.0,23.0,5.0,0.005889189,0.01026604,0.868873,38266.666667,154.978797,3.0,1009.0,169.0,1076.0,1037.202365,1.084083,17.0,5.8,0.0,30.0,1013.5,4.6,94.0,1.0,1.0,1.0,1079.0,1037.202365,1.08427,17.0,5.8,0.0,30.0,1013.5,4.6,94.0,1.0,1.0,1.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
max,1.0,9391.0,2019.0,12.0,31.0,6.0,3.61958,0.1845016,0.972009,125000.0,344.0,11.0,5095.0,1567.0,2533.0,2159.176084,6.652098,23.0,602.0,599.0,53.9,1040.6,14.7,100.0,2.0,1.0,1.0,2533.0,2159.176084,7.559603,23.0,412.8,599.0,53.9,1040.6,17.0,100.0,2.0,1.0,1.0,134.0,86.0,86.0,86.0,86.0,76.0,86.0,0.0,,0.0


### Drop

### Bin

### Scale

### Dummy Variables

### Rename

### Handling NaNs (To be completed)

In [31]:
# TODO: NaN handling is not implemented yet. The call below is disabled; it refers to
# `dataCleaning` and `flights_sample`, neither of which is defined in the visible part
# of this notebook.
#dataCleaning(flights_sample, code=True, tips=False, orientation=False, formatIssues=False, missingValues=True, duplicateValues=False, outliers=False)

# Export to CSV

In [34]:
# Write the three enriched datasets to the processed-data folder, without the index column.
for enriched_csv_path, enriched_flights in (
    ('../../data/processed/Enriched-flights_sample_train.csv', flights_sample_train),
    ('../../data/processed/Enriched-flights_sample_test.csv', flights_sample_test),
    ('../../data/processed/Enriched-flights_test.csv', flights_test),
):
    enriched_flights.to_csv(enriched_csv_path, index=False)

In [35]:
# Final check of the exported flights_test frame: column list, dtypes and non-null counts.
flights_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150623 entries, 0 to 150622
Data columns (total 53 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Marketer - Unique Carrier Code              150623 non-null  object 
 1   Operator - Unique Carrier Code              150623 non-null  object 
 2   Different Marketer & Operator Carrier Code  150623 non-null  int64  
 3   Tail Number                                 150499 non-null  object 
 4   Flight Number                               150623 non-null  int64  
 5   Flight Year                                 150623 non-null  int64  
 6   Flight Month                                150623 non-null  int64  
 7   Flight Day                                  150623 non-null  int64  
 8   Flight Weekday                              150623 non-null  int64  
 9   Aircraft group                              150585 non-null  object 
 