In [1]:
import pandas as pd
import numpy as np

# Import the tables

This csv is a pull of all the flights (date with arrival/departure airport)

In [2]:
# Load the raw pull of flights: one row per flight with its date, origin and destination.
flights = pd.read_csv('../../data/raw/date_origin_dest.csv')

# Parse the date strings a single time and reuse the result for every derived
# column (the table has millions of rows, so re-parsing per column is wasteful).
flight_dates = pd.DatetimeIndex(flights['fl_date'])

# Create a column with the weekday/day/month/year of the flight
flights['Flight Weekday'] = flight_dates.weekday   # 0: Monday, 1: Tuesday, etc.
flights['Flight Day'] = flight_dates.day
flights['Flight Month'] = flight_dates.month
flights['Flight Year'] = flight_dates.year

# The parsed index is only a temporary helper; release it so it does not
# linger in the kernel's memory.
del flight_dates

# Drop the raw date column and rename the origin/destination columns
flights = (
    flights
    .drop(columns='fl_date')
    .rename(columns={
        'origin': 'Origin Airport (IATA Code)',
        'dest': 'Destination Airport (IATA Code)',
    })
)

In [3]:
# Sanity check: (rows, columns) of the flights table after splitting the date
flights.shape

(15927485, 6)

What do we do with this? We count how many times an airport is an arrival/departure per day.

This will allow us to assess the busyness of every airport

Let's bring the provided 2020 data from the flights_test as well. We have it, let's use it

In [4]:
# Load the cleaned test set and keep only the columns that `flights` also has,
# so the two tables can be stacked.
shared_columns = [
    'Flight Year',
    'Flight Month',
    'Flight Day',
    'Flight Weekday',
    'Origin Airport (IATA Code)',
    'Destination Airport (IATA Code)',
]

flights_test = pd.read_csv('../../data/raw/Cleaned-flights_test.csv')
flights_test = flights_test.loc[:, shared_columns]

Let's merge both:

In [5]:
# Stack the test-set rows under the historical rows (columns are matched by name).
# NOTE: this overwrites `flights`, so re-running the cell appends the test rows again.
flights = pd.concat(objs=(flights, flights_test), axis=0, ignore_index=True)

# Get a count of flights per airport and year, month, day

In [6]:
# Calendar-day keys shared by both per-airport tallies
day_keys = ['Flight Year', 'Flight Month', 'Flight Day']

# Departures: one row per (day, origin airport).
# count() fills every remaining column with the number of non-null entries in
# the group, so the destination column ends up holding the departures tally.
# NOTE(review): a flight with a missing destination would not be counted - confirm the data has none.
flights_count_orig = (
    flights
    .groupby(day_keys + ['Origin Airport (IATA Code)'])
    .count()
    .reset_index()
)

# Arrivals: same idea, one row per (day, destination airport); here the origin
# column ends up holding the arrivals tally.
flights_count_dest = (
    flights
    .groupby(day_keys + ['Destination Airport (IATA Code)'])
    .count()
    .reset_index()
)


In [7]:
# Give both tallies the same schema so they can be stacked:
# the grouped airport becomes 'Airport (IATA Code)' and the counted
# counterpart column becomes 'Flights Count'.
orig_renames = {
    'Origin Airport (IATA Code)': 'Airport (IATA Code)',
    'Destination Airport (IATA Code)': 'Flights Count',
}
dest_renames = {
    'Destination Airport (IATA Code)': 'Airport (IATA Code)',
    'Origin Airport (IATA Code)': 'Flights Count',
}

flights_count_orig = flights_count_orig.rename(columns=orig_renames)
flights_count_dest = flights_count_dest.rename(columns=dest_renames)

In [8]:
# Stack departures and arrivals, then collapse them into one row per airport and day.
# concat only appends rows, so an airport can appear twice for the same day
# (once as origin, once as destination); summing within each
# (year, month, day, airport) group adds the two tallies together.
# 'Flight Weekday' holds a leftover count from the earlier count() step, so it
# is dropped before summing.
flights_count = (
    pd.concat([flights_count_orig, flights_count_dest], ignore_index=True)
    .drop(columns='Flight Weekday')
    .groupby(['Flight Year', 'Flight Month', 'Flight Day', 'Airport (IATA Code)'])
    .sum()
    .reset_index()
)

# Create the busyness score

To get this score, we'll compare the number of flights an airport has on a given day with its daily average.

## Get the daily average

In [9]:
# Count how many distinct calendar days the dataset covers.

# Rebuild a 'year-month-day' text key from the three numeric columns
# (not elegant, but it gives one comparable value per day).
date_parts = [
    flights_count[part].astype(str)
    for part in ('Flight Year', 'Flight Month', 'Flight Day')
]
flights_count['date'] = date_parts[0] + '-' + date_parts[1] + '-' + date_parts[2]

# Number of unique dates present
flights_count['date'].nunique()


761

In [11]:
# Average number of flights per day for every airport.

# Number of distinct calendar days in the dataset, derived from the data
# instead of hard-coding the value printed by an earlier cell.
n_days = len(flights_count[['Flight Year', 'Flight Month', 'Flight Day']].drop_duplicates())

# Total flights per airport. Only 'Flights Count' is summed: summing the whole
# frame would also try to aggregate the year/month/day columns and the text
# 'date' column, which newer pandas versions no longer drop silently.
total_flights = flights_count.groupby('Airport (IATA Code)')['Flights Count'].sum()

# The divisor is the day count of the whole dataset, not the number of days the
# airport actually had flights, so rarely-served airports get a very low average.
flights_per_airport = (
    (total_flights / n_days)
    .rename('Average Flights Per Day')
    .reset_index()
)

## Add daily average to flights_count and calculate busyness score

In [12]:
# Attach each airport's daily average to its per-day rows (left join keeps every row).
flights_count = flights_count.merge(
    flights_per_airport,
    how='left',
    on='Airport (IATA Code)',
)

# Busyness = flights on that day relative to the airport's average day
daily_flights = flights_count['Flights Count']
average_flights = flights_count['Average Flights Per Day']
flights_count['Busyness Score'] = daily_flights / average_flights

In [14]:
# Summary statistics of the numeric columns, to sanity-check the busyness score
flights_count.describe()

Unnamed: 0,Flight Year,Flight Month,Flight Day,Flights Count,Average Flights Per Day,Busyness Score
count,269819.0,269819.0,269819.0,269819.0,269819.0,269819.0
mean,2018.567036,6.324847,15.737009,122.956804,122.819091,1.066115
std,0.572136,3.537505,8.801901,288.34622,286.414244,1.228215
min,2018.0,1.0,1.0,1.0,0.005256,0.022427
25%,2018.0,3.0,8.0,6.0,6.023653,0.91148
50%,2019.0,6.0,16.0,22.0,22.415243,1.018623
75%,2019.0,9.0,23.0,80.0,81.617608,1.113643
max,2020.0,12.0,31.0,2533.0,2159.176084,380.5


What a beautiful busyness score! And our mean is close to 1 (about 1.07), which makes sense

# Create an ID column to easily add data as enrichment

In [15]:
# Reuse the 'date' text column as the row identifier: append the airport code
# to it and rename the column to 'ID'.
airport_day_id = flights_count['date'] + '-' + flights_count['Airport (IATA Code)']

flights_count = (
    flights_count
    .assign(date=airport_day_id)
    .rename(columns={'date': 'ID'})
)

# Export to CSV

In [16]:
# Write the enrichment table to the processed-data folder, without the row index.
output_path = '../../data/processed/flights_enrichment_busyness_score.csv'
flights_count.to_csv(output_path, index=False)