In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio   
# NOTE(review): presumably turns off MathJax in the Kaleido export scope so the
# PDFs written by fig.write_image() below are not affected by it — confirm
pio.kaleido.scope.mathjax = None

# Importing Dataset

Dataset Link - https://www.kaggle.com/datasets/yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018

In [2]:
# Load the processed flight records used by every cell below
example = pd.read_csv('data/processed/2010-17_all_labels.csv')

# drop the unnamed column (presumably the row index saved with the CSV);
# errors='ignore' keeps the cell re-runnable when the file has no such column
example = example.drop(columns=['Unnamed: 0'], errors='ignore')

# converting FL_DATE to datetime object
example['FL_DATE'] = pd.to_datetime(example['FL_DATE'])

example

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2010-12-31,OO,6439,CWA,EAU,2305.0,1939.0,-206.0,15.0,1954.0,...,0.0,38.0,52.0,32.0,90.0,,,,,
1,2010-06-16,AS,66,YAK,JNU,1845.0,1822.0,-23.0,4.0,1826.0,...,0.0,48.0,38.0,30.0,199.0,,,,,
2,2010-02-10,OO,4551,RAP,SLC,1645.0,1624.0,-21.0,16.0,1640.0,...,0.0,106.0,102.0,80.0,508.0,,,,,
3,2010-01-01,OH,6593,DCA,IND,2000.0,1940.0,-20.0,7.0,1947.0,...,0.0,120.0,92.0,77.0,499.0,,,,,
4,2010-07-01,UA,828,JAC,DEN,1506.0,1447.0,-19.0,14.0,1501.0,...,0.0,90.0,79.0,60.0,406.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47913,2017-12-08,WN,1606,SJC,SAN,1720.0,,,,,...,0.0,75.0,,,417.0,,,,,
47914,2017-12-08,OO,4851,ATL,MOB,2334.0,,,,,...,0.0,80.0,,,302.0,,,,,
47915,2017-12-14,OO,4533,BZN,SLC,1307.0,,,,,...,0.0,88.0,,,347.0,,,,,
47916,2017-12-17,DL,2184,SDF,ATL,1659.0,,,,,...,0.0,94.0,,,321.0,,,,,


## Columns

In [14]:
# show the column labels of the loaded flights frame
example.columns

Index(['FL_DATE', 'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'],
      dtype='object')

## Average time of delay grouped by airline

In [4]:
# Mean departure delay for each airline, ordered from the largest mean to the smallest
grouped = (
    example
    .groupby('OP_CARRIER')['DEP_DELAY']
    .mean()
    .sort_values(ascending=False)
)

# One bar per airline; the bar colour is driven by the same DEP_DELAY means
fig = px.bar(
    grouped,
    x=grouped.index,
    y=grouped.values,
    color='DEP_DELAY',
    color_continuous_scale=["#04364A", "#64CCC5"],
    labels={'DEP_DELAY': 'Delay time'},
)

# Hide the colour bar
fig.update_coloraxes(showscale=False)

fig.update_layout(
    font_family="Serif",
    font_size=12,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    xaxis_tickangle=90,
    xaxis_title_text='Airline',
    yaxis_title_text='Average delay time (minutes)',
)

# Render inline, then export the same figure as a PDF
fig.show()
fig.write_image("images/average_delay_time_per_airline.pdf")


Spirit Airlines (NK) has the highest average departure delay, 13.17 minutes, while Hawaiian Airlines (HA) has the lowest, -1.55 minutes (a negative value means that, on average, its flights depart earlier than scheduled).

## Pie Chart of Number of flights by airline

In [5]:
# put airlines with less than 1% of the total flights into the "other" category

temp = example.copy()

# fraction of all flights operated by each airline
airline_counts = example['OP_CARRIER'].value_counts(normalize=True)

other_airlines = airline_counts[airline_counts < 0.01].index
temp.loc[temp['OP_CARRIER'].isin(other_airlines), 'OP_CARRIER'] = 'Other'

# group by airline and get number of flights, sorted by the number of flights
grouped = temp['OP_CARRIER'].value_counts()
grouped = grouped.sort_values(ascending=False)

# move the "Other" bucket to the end of the series; skipped when no airline fell
# below the 1% threshold, in which case there is no 'Other' label to move
if 'Other' in grouped.index:
    grouped = pd.concat([grouped.drop('Other'), pd.Series([grouped['Other']], index=['Other'])])

print(grouped.values)
# create a pie chart
fig = px.pie(grouped, 
             values=grouped.values, 
             names=grouped.index, 
             labels={'index':'Airline', 'values':'Number of flights'},
             color_discrete_sequence=px.colors.sequential.Tealgrn_r,
             )
# sort=False keeps the sectors in the order built above (largest first, "Other"
# last); by default the pie trace re-sorts sectors by value
fig.update_traces(textposition='inside', textinfo='percent+label', sort=False)
fig.update_layout(
    font_family="Serif",
    font_size=12,
    margin=dict(l=0, r=0, t=0, b=0),
)
fig.show()
fig.write_image("images/number_of_flights_per_airline.pdf")

[9540 6502 5156 4953 4170 3903 2449 2246 1942 1273 1005  767  686  652
  633  581  512  948]


Southwest Airlines Co. (WN) has the largest number of flights in the dataset.

## Heatmap of origin and destination airports

In [6]:
# Choropleth ("heatmap") of flight counts per state for origin airports

# Airport reference table, keeping a single row per IATA code
airport_state = pd.read_csv('data/iata-icao.csv').drop_duplicates(subset=['iata_code'])

# Lookup from IATA airport code to its iso_region value
region_by_iata = airport_state.set_index('iata_code')['iso_region']

# Replace airport codes with state codes: map to iso_region, then strip its first
# three characters (presumably a "US-" style country prefix — TODO confirm).
# Rows whose airport could not be mapped on either side are discarded.
temp = example.assign(
    ORIGIN=example['ORIGIN'].map(region_by_iata).str[3:],
    DEST=example['DEST'].map(region_by_iata).str[3:],
).dropna(subset=['ORIGIN', 'DEST'])

# Number of flights per origin state and per destination state
origins = temp.groupby('ORIGIN').size().reset_index(name='counts')
destinations = temp.groupby('DEST').size().reset_index(name='counts')

# Choropleth map of origin airports, shaded by number of flights
fig = px.choropleth(
    origins,
    locations='ORIGIN',
    locationmode='USA-states',
    color='counts',
    color_continuous_scale=["#64CCC5", "#04364A"],
    labels={'counts': 'Number of flights'},
)
fig.update_layout(
    font_family="Serif",
    font_size=12,
    geo_scope='usa',
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
)

# Render inline, then export the same figure as a PDF
fig.show()
fig.write_image("images/origin_airports.pdf")

Most of the flights in the dataset are domestic, so the choropleth maps for origin and destination airports are similar. The top 5 origin airports are ATL, ORD, DFW, DEN, and LAX, and the top 5 destination airports are the same: ATL, ORD, DFW, DEN, and LAX.

### Scatter plot of delay time vs distance

In [7]:
# Departure delay against flight distance, one marker per row of `example`;
# marker colour follows the same DEP_DELAY value as the y position
fig = px.scatter(example, 
                 x='DISTANCE', 
                 y='DEP_DELAY', 
                 # both axis labels carry their unit; DEP_DELAY is in minutes
                 labels={'DISTANCE': 'Distance (miles)', 'DEP_DELAY': 'Delay time (minutes)'}, 
                 color='DEP_DELAY',
                 color_continuous_scale=["#64CCC5", "#04364A"]
)

fig.update_layout(
    font_family="Serif",
    font_size=12,
    margin=dict(l=0, r=0, t=0, b=0),
)

# Render inline, then export the same figure as a PDF
fig.show()
fig.write_image("images/distance_vs_delay.pdf")
