In [None]:
# Import dependencies
import pandas as pd
import zipfile
import numpy as np
from pathlib import Path


Extract the zip file in `Resources` and write its contents to the `Resources` directory

In [None]:
# Unpack every member of the downloaded archive into the Resources folder
archive_path = 'Resources/dot-airline-on-time-performance-statistics.zip'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path='Resources')


# Airports Data

In [None]:
# Load the extracted airports file into a DataFrame and preview the first rows
airports_path = 'Resources/dot-airline-on-time-performance-statistics/Airports'
airports_df = pd.read_csv(airports_path)
airports_df.head()


In [None]:
# function to split Description column on the colon
def splits1(x):
    """Split a string on its first colon and return the trimmed pieces.

    Parameters
    ----------
    x : str
        Text to split, e.g. ``"Afognak Lake, AK: Afognak Lake Airport"``.

    Returns
    -------
    pandas.Series
        Two elements (text before and after the first colon) when a colon is
        present, otherwise a single element holding the whole string.
    """
    # Strip each piece: splitting on ":" alone would leave the space that
    # follows the colon at the start of the second piece.
    return pd.Series([part.strip() for part in x.split(":", 1)])

# Run splits1 over every Description and store the two resulting pieces
# as the new Location and Airport_Name columns
description_parts = airports_df['Description'].apply(splits1)
airports_df[['Location', 'Airport_Name']] = description_parts
airports_df.head()

In [None]:
# function to split Location column on the comma
def splits2(x):
    """Split a string on its first comma and return the trimmed pieces.

    Parameters
    ----------
    x : str
        Text to split, e.g. ``"Afognak Lake, AK"``.

    Returns
    -------
    pandas.Series
        Two elements (text before and after the first comma) when a comma is
        present, otherwise a single element holding the whole string.
    """
    # Strip each piece: splitting on "," alone would leave the space that
    # follows the comma at the start of the second piece.
    return pd.Series([part.strip() for part in x.split(",", 1)])

# Run splits2 over every Location and store the two resulting pieces
# as the new City and State columns
location_parts = airports_df['Location'].apply(splits2)
airports_df[['City', 'State']] = location_parts
airports_df.head()

In [None]:
# Remove the Description and Location columns now that they have been split
airports_df = airports_df.drop(columns=["Description", "Location"])
airports_df.head()

In [None]:
# Discard every row that contains a missing value in any column
airports_df = airports_df.dropna(axis=0, how='any')

In [None]:
# Preview the last rows to confirm that rows with missing values were dropped
airports_df.tail()

In [None]:
# Switch the column names to lowercase snake_case; the State column is
# renamed to location
airports_columns = {
    "Code": "code",
    "Airport_Name": "airport_name",
    "City": "city",
    "State": "location",
}
airports_df = airports_df.rename(columns=airports_columns)
airports_df.tail()

In [None]:
# Show each column's dtype so it can be checked before loading into the database
airports_df.dtypes

In [None]:
# Write the airports table (without the index) to a CSV for the database
# upload, creating the output folder first if it does not exist yet
filepath = Path('./Updated_CSVs/airports.csv')
airports_out_dir = filepath.parent
airports_out_dir.mkdir(exist_ok=True, parents=True)
airports_df.to_csv(path_or_buf=filepath, index=False)

# Air Carriers Data

In [None]:
# Load the extracted Air Carriers file into a DataFrame and preview the first rows
aircarriers_path = 'Resources/dot-airline-on-time-performance-statistics/Air Carriers'
aircarriers_df = pd.read_csv(aircarriers_path)
aircarriers_df.head()

In [None]:
# function to split Description column on the colon
def splits3(x):
    """Split a string on its first ``": "`` (colon followed by a space).

    Parameters
    ----------
    x : str
        Text to split.

    Returns
    -------
    pandas.Series
        Two elements when the separator is present, otherwise a single
        element holding the whole string.
    """
    pieces = x.split(": ", maxsplit=1)
    return pd.Series(pieces)

# Run splits3 over every Description and store the two resulting pieces
# as the new Company and Prefix columns
carrier_parts = aircarriers_df['Description'].apply(splits3)
aircarriers_df[['Company', 'Prefix']] = carrier_parts
aircarriers_df.head()

In [None]:
# Remove the Description column now that it has been split
aircarriers_df = aircarriers_df.drop(columns=["Description"])
aircarriers_df.head()

In [None]:
# Switch the column names to lowercase
aircarriers_columns = {
    "Code": "code",
    "Company": "company",
    "Prefix": "prefix",
}
aircarriers_df = aircarriers_df.rename(columns=aircarriers_columns)
aircarriers_df.head()

In [None]:
# Show each column's dtype so it can be checked before loading into the database
aircarriers_df.dtypes

In [None]:
# Write the air carriers table (without the index) to a CSV for the database
# upload, creating the output folder first if it does not exist yet
filepath2 = Path('./Updated_CSVs/aircarriers.csv')
aircarriers_out_dir = filepath2.parent
aircarriers_out_dir.mkdir(exist_ok=True, parents=True)
aircarriers_df.to_csv(path_or_buf=filepath2, index=False)

# August 2018 Nationwide Data

In [None]:
# Load the August 2018 nationwide performance CSV into a DataFrame and preview it
aug2018_path = 'Resources/dot-airline-on-time-performance-statistics/August 2018 Nationwide.csv'
aug2018_df = pd.read_csv(aug2018_path)
aug2018_df.head()


In [None]:
# List every column name in the raw frame
aug2018_df.columns

In [None]:
# Drop
# TAIL_NUM, ORIGIN_AIRPORT_ID, ORIGIN_AIRPORT_SEQ_ID, ORIGIN_CITY_MARKET_ID, DEST_AIRPORT_ID, DEST_AIRPORT_SEQ_ID, DEST_CITY_MARKET_ID, DEP_DELAY_NEW, ARR_DELAY_NEW, CANCELLATION_CODE, CRS_ELAPSED_TIME, ACTUAL_ELAPSED_TIME, Unnamed: 28

# Keep
# 'FL_DATE', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'ARR_TIME', 'ARR_DELAY', 'CANCELLED', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'

# NAS - National Airspace System
# CRS - Computer Reservation System

# TODO: convert FL_DATE to a datetime type

In [None]:
# Build a new, independent DataFrame (via .copy()) holding only the needed columns
needed_columns = [
    'FL_DATE', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM',
    'ORIGIN', 'DEST',
    'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY',
    'ARR_TIME', 'ARR_DELAY',
    'CANCELLED',
    'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
]
newaug2018_df = aug2018_df[needed_columns].copy()
newaug2018_df.head()

In [None]:
# update columns to all lowercase to match the other dataframes
newaug2018_df.columns = newaug2018_df.columns.map(str.lower)
newaug2018_df.head()

In [None]:
# Number the rows starting at 1 and label the index 'id'.
# The name must be set AFTER the new index is assigned: assigning a plain
# array to .index builds a fresh, unnamed Index, so a name set beforehand is
# discarded and the index column would be written to CSV without a header.
newaug2018_df.index = np.arange(1, len(newaug2018_df) + 1)
newaug2018_df.index.name = 'id'
newaug2018_df.head()

In [None]:
# Show each column's dtype before any type conversions are applied
newaug2018_df.dtypes

In [None]:
# Convert the fl_date column to pandas datetimes, then re-check the dtypes
newaug2018_df['fl_date'] = newaug2018_df['fl_date'].pipe(pd.to_datetime)
newaug2018_df.dtypes

In [None]:
# Replace missing values with 0 in the departure/arrival time and delay columns
for fill_column in ('dep_time', 'dep_delay', 'arr_time', 'arr_delay'):
    newaug2018_df[fill_column] = newaug2018_df[fill_column].fillna(0)

In [None]:
# Downcast the time and delay columns to the smallest signed integer dtype
# pandas can use for their values, then re-check the dtypes
for int_column in ('dep_time', 'dep_delay', 'arr_time', 'arr_delay'):
    newaug2018_df[int_column] = pd.to_numeric(newaug2018_df[int_column], downcast='signed')
newaug2018_df.dtypes

In [None]:
# Write the August 2018 table to a CSV for the database upload, creating the
# output folder first if needed; the index is written too (no index=False)
filepath3 = Path('./Updated_CSVs/aug2018.csv')
aug2018_out_dir = filepath3.parent
aug2018_out_dir.mkdir(exist_ok=True, parents=True)
newaug2018_df.to_csv(path_or_buf=filepath3)