In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio   
# Turn MathJax off in the Kaleido export scope used by fig.write_image().
# NOTE(review): presumably this keeps a "Loading [MathJax]" artefact out of the
# exported PDFs — confirm against the installed plotly/kaleido versions.
pio.kaleido.scope.mathjax = None

# Importing Dataset

Dataset Link - https://www.kaggle.com/datasets/yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018

In [3]:
# Read the pre-processed flight records into a DataFrame. The bare `dataset`
# expression on the last line makes the notebook render a preview of the frame.
dataset = pd.read_csv('data/processed/M1_final.csv')
dataset

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,DEST,DEP_DELAY,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,...,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Condition,sch_dep,sch_arr,TAXI_OUT
0,11,1,5,B6,N828JB,CHS,-1,124,636,324,...,34,58,W,25,38,29.86,Fair / Windy,9,17,14
1,11,1,5,B6,N992JB,LAX,-7,371,2475,340,...,34,58,W,25,38,29.86,Fair / Windy,9,17,15
2,11,1,5,B6,N959JB,FLL,40,181,1069,301,...,34,58,W,25,38,29.86,Fair / Windy,9,17,22
3,11,1,5,B6,N999JQ,MCO,-2,168,944,345,...,34,58,W,25,38,29.86,Fair / Windy,9,17,12
4,11,1,5,DL,N880DN,ATL,-4,139,760,360,...,32,58,W,24,35,29.91,Fair / Windy,9,17,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28815,1,31,5,B6,N216JB,ORH,2,57,150,1370,...,38,96,N,6,0,30.18,Cloudy,20,32,19
28816,1,31,5,AA,N104NN,BOS,2,75,187,1390,...,38,96,N,6,0,30.18,Cloudy,19,23,22
28817,1,31,5,AS,N581AS,SEA,283,392,2422,1125,...,38,96,N,6,0,30.18,Cloudy,19,23,21
28818,1,31,5,B6,N957JB,SJU,5,224,1598,1417,...,38,96,N,6,0,30.18,Cloudy,19,23,13


## Columns

In [4]:
# Show the column labels of the loaded frame.
dataset.columns

Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM',
       'DEST', 'DEP_DELAY', 'CRS_ELAPSED_TIME', 'DISTANCE', 'CRS_DEP_M',
       'DEP_TIME_M', 'CRS_ARR_M', 'Temperature', 'Dew Point', 'Humidity',
       'Wind', 'Wind Speed', 'Wind Gust', 'Pressure', 'Condition', 'sch_dep',
       'sch_arr', 'TAXI_OUT'],
      dtype='object')

## Average time of delay grouped by airline

In [5]:
# Mean departure delay for each carrier, ordered from the most to the least delayed.
grouped = (
    dataset
    .groupby('OP_UNIQUE_CARRIER')['DEP_DELAY']
    .mean()
    .sort_values(ascending=False)
)

# One bar per carrier; the bar colour is driven by the same mean-delay values
# that set the bar height.
fig = px.bar(
    grouped,
    x=grouped.index,
    y=grouped.values,
    color='DEP_DELAY',
    color_continuous_scale=["#04364A", "#64CCC5"],
    labels={'DEP_DELAY': 'Delay time'},
)
fig.update_coloraxes(showscale=False)  # hide the colour bar
fig.update_layout(
    xaxis=dict(title_text='Airline', tickangle=90),
    yaxis=dict(title_text='Average delay time (minutes)'),
    font=dict(family="Serif", size=12),
    margin=dict(l=0, r=0, t=0, b=0),
)
fig.show()
# Also save a static copy of the figure.
fig.write_image("images/average_delay_time_per_airline.pdf")


SkyWest Airlines Inc. (OO) has the highest average delay, at more than 16 minutes, while Hawaiian Airlines Inc. (HA) has the lowest, at less than 1 minute.

## Pie Chart of Number of flights by airline

In [6]:
# Collapse every airline that operates less than 1% of all flights into a single
# "Other" slice so the pie chart stays readable.
airline_counts = dataset['OP_UNIQUE_CARRIER'].value_counts(normalize=True)
other_airlines = airline_counts[airline_counts < 0.01].index

# Relabel the minor carriers on a new frame; `dataset` itself is left untouched
# because assign() returns a modified copy.
is_minor = dataset['OP_UNIQUE_CARRIER'].isin(other_airlines)
temp = dataset.assign(
    OP_UNIQUE_CARRIER=dataset['OP_UNIQUE_CARRIER'].mask(is_minor, 'Other')
)

# Number of flights per (relabelled) airline; value_counts() already returns
# the counts in descending order, so no extra sort is needed.
grouped = temp['OP_UNIQUE_CARRIER'].value_counts()

# Move the "Other" bucket to the end of the series. The guard matters: when no
# airline falls below the threshold there is no "Other" label, and drop() or
# the label lookup would raise KeyError.
if 'Other' in grouped.index:
    grouped = pd.concat([
        grouped.drop('Other'),
        pd.Series([grouped['Other']], index=['Other']),
    ])

# Tidy two-column frame so the chart refers to named columns; the column names
# double as the labels shown in the figure.
flights_per_airline = (
    grouped
    .rename_axis('Airline')
    .reset_index(name='Number of flights')
)

# create a pie chart
fig = px.pie(
    flights_per_airline,
    values='Number of flights',
    names='Airline',
    color_discrete_sequence=px.colors.sequential.Tealgrn_r,
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(
    font_family="Serif",
    font_size=12,
    margin=dict(l=0, r=0, t=0, b=0),
)
fig.show()
fig.write_image("images/number_of_flights_per_airline.pdf")

JetBlue Airways (B6) accounts for the largest share of flights in the dataset, followed by Delta Air Lines Inc. (DL).

## Heatmap of destination airports by state

In [7]:
# Choropleth of the number of flights per destination state.

# Lookup table from IATA airport code to ISO region code. Duplicate IATA codes
# are dropped first because Series.map needs a unique index.
airport_state = pd.read_csv('data/iata-icao.csv')
airport_state = airport_state.drop_duplicates(subset=['iata_code'])
iata_to_region = airport_state.set_index('iata_code')['iso_region']

# Replace each destination airport code with its two-letter US state code.
# Airports missing from the lookup map to NaN and are dropped by the filter.
# Only regions starting with "US-" are kept: stripping the first three
# characters of any other region would yield a code that is either not a state
# or collides with a real state abbreviation.
# NOTE(review): assumes US regions are encoded as "US-XX" — confirm against
# data/iata-icao.csv.
temp = (
    dataset
    .assign(DEST=dataset['DEST'].map(iata_to_region))
    .loc[lambda frame: frame['DEST'].str.startswith('US-', na=False)]
    .assign(DEST=lambda frame: frame['DEST'].str[3:])
)

# Number of flights per destination state.
destinations = temp.groupby(['DEST']).size().reset_index(name='counts')

# create choropleth map of destination states
fig = px.choropleth(destinations,
                    locations='DEST',
                    locationmode='USA-states',
                    color='counts',
                    labels={'counts':'Number of flights'},
                    color_continuous_scale=["#64CCC5", "#04364A"]
                    )
fig.update_layout(
    geo_scope='usa',
    font_family="Serif",
    font_size=12,
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.show()
# The output file name is left unchanged so that existing references to the
# exported figure keep working, even though the map shows destinations.
fig.write_image("images/origin_airports.pdf")

We can see that most of the flights go to California and Florida.

## Scatter plot of delay time vs distance

In [8]:
# Departure delay against flight distance, one marker per row of `dataset`;
# marker colour encodes the same delay value as the y-axis.
fig = px.scatter(dataset,
                 x='DISTANCE',
                 y='DEP_DELAY',
                 # Delay label carries its unit, matching the other delay charts
                 # in this notebook (the previous label ended in a stray space).
                 labels={'DISTANCE': 'Distance (miles)', 'DEP_DELAY': 'Delay time (minutes)'},
                 color='DEP_DELAY',
                 color_continuous_scale=["#64CCC5", "#04364A"]
)

fig.update_layout(
    font_family="Serif",
    font_size=12,
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.show()
fig.write_image("images/distance_vs_delay.pdf")


## Histogram of Temperature vs Average Delay Time

In [9]:
# Histogram of temperature vs average delay time.

# Mean departure delay for every distinct temperature value.
grouped = dataset.groupby('Temperature')['DEP_DELAY'].mean().reset_index(name='avg_delay')

# Number of distinct temperature values. This has to be computed on the
# DataFrame column: calling .unique() on the list literal ['Temperature']
# raises AttributeError.
unique_temps = dataset['Temperature'].nunique()

# Roughly four distinct temperature values per bin, but never fewer than one bin.
n_bins = max(1, unique_temps // 4)

fig = px.histogram(grouped,
                    nbins=n_bins,
                    x='Temperature',
                    y='avg_delay',
                    # Average the per-temperature means inside each bin. The
                    # default aggregation for a supplied y is a sum, which would
                    # make bar heights depend on how many distinct temperatures
                    # fall into the bin rather than on the delay itself.
                    histfunc='avg',
                    labels={'Temperature': 'Temperature (F)', 'avg_delay': 'Delay time (minutes)'},
                    color_discrete_sequence=['#64CCC5'],
    )

fig.update_layout(
    font_family="Serif",
    font_size=12,
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.show()
fig.write_image("images/temperature_vs_delay.pdf")

We can conclude from this graph that the average delay time increases towards the edges of the temperature range, i.e. in either extremely cold or extremely hot weather.

## Correlation matrix between different types of weather conditions

In [10]:
# Pairwise correlation between the weather measurements, drawn as a heatmap.
weather_columns = [
    'Wind Gust',
    'Wind Speed',
    'Temperature',
    'Pressure',
    'Dew Point',
    'Humidity',
]
corr_matrix = dataset[weather_columns].corr()

fig = px.imshow(
    corr_matrix,
    color_continuous_scale=px.colors.sequential.Viridis_r,
    labels=dict(x='Weather', y='Weather', color='Correlation'),
)

fig.update_layout(
    font=dict(family="Serif", size=12),
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.show()
# Also save a static copy of the figure.
fig.write_image("images/weather_correlation.pdf")


We can see that most of the weather variables are not correlated with each other. The only three correlated pairs are Dew Point and Temperature, Dew Point and Pressure, and Wind Speed and Wind Gust.

## Average Delay by Destination Airport

In [11]:
# Mean departure delay for each destination airport, largest first.
grouped = (
    dataset
    .groupby('DEST')['DEP_DELAY']
    .mean()
    .reset_index(name='avg_delay')
    .sort_values(by='avg_delay', ascending=False)
)

# One bar per destination; height and colour both encode the mean delay.
fig = px.bar(
    grouped,
    x='DEST',
    y='avg_delay',
    color='avg_delay',
    color_continuous_scale=["#04364A", "#64CCC5"],
    labels={'DEST':'Destination airport', 'avg_delay':'Delay time (minutes)'},
)

fig.update_layout(
    font=dict(family="Serif", size=12),
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.show()