In [1]:
import pandas as pd
import numpy as np

# Import the tables

This csv is a pull of all the flights (date with arrival/departure airport)

In [2]:
flights = pd.read_csv('../../data/raw/date_origin_dest.csv')

# Create a column with the day/month/year of the flight
flights['Flight Weekday'] = pd.DatetimeIndex(flights['fl_date']).weekday   #0: Monday, 1:Tuesday, etc.
flights['Flight Day'] = pd.DatetimeIndex(flights['fl_date']).day
flights['Flight Month'] = pd.DatetimeIndex(flights['fl_date']).month
flights['Flight Year'] = pd.DatetimeIndex(flights['fl_date']).year
flights = flights.drop(labels = 'fl_date', axis = 1)

# Rename the origin and arrival columns of flights
flights = flights.rename(columns = {'origin': 'Origin Airport (IATA Code)', 'dest': 'Destination Airport (IATA Code)'})

What do we do with this? We count how many times an airport is an arrival/departure per day.

This will allow us to assess the busyness of every airport

Let's bring the provided 2020 data from the flights_test as well. We have it, let's use it

In [3]:
flights_test = pd.read_csv('../../data/raw/Cleaned-flights_test.csv')

flights_test = flights_test[['Flight Year', 'Flight Month', 'Flight Day', 'Flight Weekday', 'Origin Airport (IATA Code)','Destination Airport (IATA Code)']]

Let's merge both:

In [4]:
flights = pd.concat([flights, flights_test], ignore_index=True)

# Get a count of flights per airport and year, month, day

In [5]:
# Let's create a new dataframe with the counts
flights_count_orig = flights.groupby(['Flight Year', 'Flight Month', 'Flight Day', 'Flight Weekday', 'Origin Airport (IATA Code)']).count().reset_index()

# same for the destination
flights_count_dest = flights.groupby(['Flight Year', 'Flight Month', 'Flight Day', 'Flight Weekday', 'Destination Airport (IATA Code)']).count().reset_index()


In [6]:
# Let's rename the columns to make it easier to merge
flights_count_orig = flights_count_orig.rename(columns = {'Origin Airport (IATA Code)': 'Airport (IATA Code)', 'Destination Airport (IATA Code)': 'Flights Count'})
flights_count_dest = flights_count_dest.rename(columns = {'Destination Airport (IATA Code)': 'Airport (IATA Code)', 'Origin Airport (IATA Code)': 'Flights Count'})

In [7]:
# And now we can merge
flights_count = pd.concat([flights_count_orig, flights_count_dest], ignore_index=True)

In [8]:
flights_count

Unnamed: 0,Flight Year,Flight Month,Flight Day,Flight Weekday,Airport (IATA Code),Flights Count
0,2018,1,1,0,ABE,10
1,2018,1,1,0,ABI,6
2,2018,1,1,0,ABQ,80
3,2018,1,1,0,ABR,2
4,2018,1,1,0,ABY,2
...,...,...,...,...,...,...
539263,2020,1,31,4,XNA,48
539264,2020,1,31,4,XWA,5
539265,2020,1,31,4,YAK,2
539266,2020,1,31,4,YKM,4


# Create the busyness score

To get this score, we'll compare the amount of flights the airport has that day compared to its daily average

## Get the daily average

In [9]:
# Let's count how many days we have in the dataset

# VERY unproductive, but let's re-create a date column to with the year, month and day
flights_count['date'] = flights_count['Flight Year'].astype(str) + '-' + flights_count['Flight Month'].astype(str) + '-' + flights_count['Flight Day'].astype(str)

# now let's count how many unique dates we have
flights_count['date'].nunique()


761

In [10]:
#Counting total flights per airport
flights_per_airport = flights_count.groupby(['Airport (IATA Code)']).sum().reset_index()

# Removing as not needed:
flights_per_airport = flights_per_airport.drop(labels = ['Flight Year', 'Flight Month', 'Flight Day', 'Flight Weekday'], axis = 1)

# Let's create a new column with the average flights per day, dividing flights count by the number of days we have: 761
flights_per_airport['Average Flights Per Day'] = flights_per_airport['Flights Count'] / 761
flights_per_airport = flights_per_airport.drop(labels = 'Flights Count', axis = 1)

## Add daily average to flights_count and calculate busyness score

In [11]:
flights_count = pd.merge(flights_count, flights_per_airport, on = 'Airport (IATA Code)', how = 'left')
flights_count['Busyness Score'] = flights_count['Flights Count'] / flights_count['Average Flights Per Day']

In [12]:
flights_count

Unnamed: 0,Flight Year,Flight Month,Flight Day,Flight Weekday,Airport (IATA Code),Flights Count,date,Average Flights Per Day,Busyness Score
0,2018,1,1,0,ABE,10,2018-1-1,32.929041,0.303683
1,2018,1,1,0,ABI,6,2018-1-1,11.718791,0.511998
2,2018,1,1,0,ABQ,80,2018-1-1,148.130092,0.540066
3,2018,1,1,0,ABR,2,2018-1-1,4.086728,0.489389
4,2018,1,1,0,ABY,2,2018-1-1,5.529566,0.361692
...,...,...,...,...,...,...,...,...,...
539263,2020,1,31,4,XNA,48,2020-1-31,81.617608,0.588108
539264,2020,1,31,4,XWA,5,2020-1-31,1.503285,3.326049
539265,2020,1,31,4,YAK,2,2020-1-31,3.948752,0.506489
539266,2020,1,31,4,YKM,4,2020-1-31,6.925099,0.577609


What a beautiful busyness score!

# Create an ID column to easily add data as enrichment

In [13]:
# let's take the date field since we no longer need it.

flights_count['date'] = flights_count['date'] + '-' + flights_count['Airport (IATA Code)']
flights_count = flights_count.rename(columns = {'date': 'ID'})

# Export to CSV

In [14]:
flights_count.to_csv('../../data/processed/flights_enrichment_busyness_score.csv', index = False)