In [2]:
# Import libraries
import pandas as pd

# Read each year's delay records from its csv file into a separate dataframe.
df_2018 = pd.read_csv(filepath_or_buffer='delays_2018.csv')
df_2019 = pd.read_csv(filepath_or_buffer='delays_2019.csv')

# Note: if a csv is in a different folder than the notebook, pass its full path.
# On Windows, use a raw string (r'...') so backslashes are not treated as escapes.
# Example: df_2018 = pd.read_csv(r'C:\Users\User\Downloads\delays_2018.csv')

In [3]:
# Preview the top 3 rows of the 2018 data; head() returns 5 rows unless told otherwise.
df_2018.head(3)

Unnamed: 0,date,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2018-1,MQ,Envoy Air,BIS,"Bismarck/Mandan, ND: Bismarck Municipal",5.0,3.0,1.0,0.06,1.94,0.0,0.0,0.0,0.0,104.0,54.0,1.0,49.0,0.0,0.0
1,2018-1,MQ,Envoy Air,BNA,"Nashville, TN: Nashville International",110.0,21.0,7.17,1.16,6.76,0.0,5.92,3.0,0.0,897.0,344.0,37.0,226.0,0.0,290.0
2,2018-1,MQ,Envoy Air,BOI,"Boise, ID: Boise Air Terminal",32.0,8.0,0.22,0.35,5.61,0.0,1.82,0.0,0.0,353.0,9.0,18.0,233.0,0.0,93.0


In [4]:
# Preview the first 5 rows (the default count) of the 2019 data.
df_2019.head(n=5)

Unnamed: 0,date,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2019-1,MQ,Envoy Air,SAV,"Savannah, GA: Savannah/Hilton Head International",65.0,15.0,3.41,0.71,4.33,0.0,6.56,1.0,1.0,601.0,180.0,29.0,129.0,0.0,263.0
1,2019-1,MQ,Envoy Air,SDF,"Louisville, KY: Louisville Muhammad Ali Intern...",61.0,18.0,2.7,1.01,8.93,0.0,5.37,1.0,0.0,890.0,180.0,36.0,383.0,0.0,291.0
2,2019-1,MQ,Envoy Air,SGF,"Springfield, MO: Springfield-Branson National",428.0,80.0,13.31,5.18,27.42,0.0,34.09,15.0,0.0,3954.0,705.0,213.0,982.0,0.0,2054.0
3,2019-1,MQ,Envoy Air,SHV,"Shreveport, LA: Shreveport Regional",174.0,28.0,5.97,1.17,11.15,0.0,9.72,0.0,0.0,1655.0,360.0,55.0,268.0,0.0,972.0
4,2019-1,MQ,Envoy Air,SJT,"San Angelo, TX: San Angelo Regional/Mathis Field",135.0,23.0,10.78,0.35,6.54,0.0,5.33,2.0,0.0,835.0,320.0,27.0,192.0,0.0,296.0


In [24]:
# Stack the 2018 rows and the 2019 rows into a single dataframe.
# objs: the list of dataframes to combine, in order.
# ignore_index=True: discard each frame's own row labels and renumber from 0.
df = pd.concat(
    objs=[df_2018, df_2019],
    ignore_index=True,
)

In [6]:
# Sanity check: the combined row count should equal the sum of the yearly counts.
# print() converts the integers to text itself, so no explicit str() is needed.
print("Rows in 2018 df:", len(df_2018))
print("Rows in 2019 df:", len(df_2019))
print("Rows in combined df:", len(df))

Rows in 2018 df: 20231
Rows in 2019 df: 20946
Rows in combined df: 41177


# Inspect and Clean Dataset

Examples of invalid data:
 - Data outside of the 2018-2019 time period
 - Empty cells in any of the columns (arr_flights, airport, carrier, etc.)

In [28]:
# Count the rows that are missing a value in each of the key columns.
# Each filtered frame is kept in its own variable so the rows can be inspected later.
nanAirportValues = df.loc[df['airport'].isna()]
print("Number of rows with nan airport values:", nanAirportValues.shape[0])

nanCarrierValues = df.loc[df['carrier'].isna()]
print("Number of rows with nan carrier values:", nanCarrierValues.shape[0])

nanArrivalValues = df.loc[df['arr_flights'].isna()]
print("Number of rows with nan arr_flights values:", nanArrivalValues.shape[0])


Number of rows with nan airport values: 19
Number of rows with nan carrier values: 30
Number of rows with nan arr_flights values: 31


In [32]:
# Remove rows with missing values in the airport, carrier or arr_flights columns.
# subset parameter: list of the column names to check for missing values.
# The result is assigned back to df instead of using inplace=True, so the cell
# reads as "new df from old df" and avoids in-place mutation.
required_columns = ["airport", "carrier", "arr_flights"]
df = df.dropna(subset=required_columns)

# Alternate command (one column at a time):
# df = df[df['airport'].notna()]

# Confirm that every cleaned column is now free of missing values.
for column in required_columns:
    print(f"Number of rows with nan {column} values:", df[column].isna().sum())

Number of rows with nan airport values: 0
Number of rows with nan carrier values: 0
Number of rows with nan arr_flights values: 0


In [44]:
# Collect every row that still has a missing value in any column and count them.
# (To look at the rows themselves, end a cell with nan_rows.head(15); only the
# last expression of a cell is displayed.)
nan_rows = df[df.isna().any(axis=1)]
print(len(nan_rows))

# Different ways around this:
#   1. Replace with 0
#   2. If only few empty cells, can outright remove rows
#   3. Replace nan with mean/median/mode

# Option 1: fill missing arr_del15 values with 0.
# The filled column is assigned back to df. Calling fillna(..., inplace=True) on
# df['arr_del15'] is chained assignment: it is deprecated in recent pandas and
# does not update df when Copy-on-Write is enabled.
df['arr_del15'] = df['arr_del15'].fillna(0)

# Re-count the rows that still contain a missing value after the fill.
nan_rows = df[df.isna().any(axis=1)]
print(len(nan_rows))



0
0


In [8]:

# Rewrite the date column as zero-padded 'YYYY-MM' text (e.g. 2018-1 -> 2018-01):
# parse each value as a datetime, then format it back to a string.
df['date'] = (
    pd.to_datetime(df['date'], format='%Y-%m')
    .dt.strftime('%Y-%m')
)
df.head()


Unnamed: 0,date,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2018-01,MQ,Envoy Air,BIS,"Bismarck/Mandan, ND: Bismarck Municipal",5.0,3.0,1.0,0.06,1.94,0.0,0.0,0.0,0.0,104.0,54.0,1.0,49.0,0.0,0.0
1,2018-01,MQ,Envoy Air,BNA,"Nashville, TN: Nashville International",110.0,21.0,7.17,1.16,6.76,0.0,5.92,3.0,0.0,897.0,344.0,37.0,226.0,0.0,290.0
2,2018-01,MQ,Envoy Air,BOI,"Boise, ID: Boise Air Terminal",32.0,8.0,0.22,0.35,5.61,0.0,1.82,0.0,0.0,353.0,9.0,18.0,233.0,0.0,93.0
3,2018-01,MQ,Envoy Air,BPT,"Beaumont/Port Arthur, TX: Jack Brooks Regional",63.0,11.0,1.75,1.08,2.98,0.0,5.19,3.0,0.0,657.0,83.0,34.0,130.0,0.0,410.0
4,2018-01,MQ,Envoy Air,BUF,"Buffalo, NY: Buffalo Niagara International",31.0,12.0,0.82,3.0,6.62,0.0,1.55,0.0,0.0,484.0,27.0,136.0,207.0,0.0,114.0
