#### Import dependencies

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

### Import Airline data

In [2]:
# Use this cell for sampling

# messy_flight_df = pd.read_csv('Resources/Flights_2022_1.csv')

In [3]:
# Use this cell for production

messy_flight_df = pd.DataFrame()

# loop through csv files 
for x in range(1,8):
    file = Path(f'Resources/Flights_2022_{x}.csv')
    import_df = pd.read_csv(file,low_memory=False,engine='c')
    messy_flight_df = pd.concat([messy_flight_df,import_df],ignore_index=True)

### Format the master csv

In [4]:
# Function to clean the values in certain columns to a readable military datetime format

def mil_time_converter (series):
    # converts the string into a military time format
    series = series.apply(lambda x: f'{(int(x) // 100):02d}:{(int(x) % 100):02d}' if pd.notnull(x) else x)
    # Convert string to correct midnight format
    series = series.apply(lambda x: '00:00' if x == '24:00' else x)
    # Converts values into datetime object
    series = series.apply(lambda x: pd.to_datetime(x,format='%H:%M').strftime('%H:%M') if pd.notnull(x) else x)
    return series

In [5]:
# Convert Arrival and Departure times to military format

messy_flight_df['ArrTime'] =mil_time_converter(messy_flight_df['ArrTime'])
messy_flight_df['DepTime'] =mil_time_converter(messy_flight_df['DepTime'])


In [6]:
# Convert FlightDate to datetime object

messy_flight_df['FlightDate'] = messy_flight_df['FlightDate'].apply(lambda x: pd.to_datetime(x) if pd.notnull(x) else x)

In [7]:
# Making new binary column that tracks delay for both arr and dep

arr_dep_delayed = (messy_flight_df['ArrDelayMinutes']>0) | (messy_flight_df['DepDelayMinutes']>0)

messy_flight_df['Delayed'] = np.where(arr_dep_delayed, 1, 0)

In [8]:
# Creates an integer coded target column for the Machine learning model 

on_time = (messy_flight_df['Delayed']==0) & (messy_flight_df['Cancelled']==0)
delayed = messy_flight_df['Delayed']==1
cancelled = messy_flight_df['Cancelled']==1

messy_flight_df['Target'] = np.where(on_time, '0', np.where(delayed,'1', np.where(cancelled,'2','')))

#### Export/Import combined data csv

In [9]:
# USE IF NECESSARY
# Export combined Dataframe

# export_path = Path('Resources/Flights_2022_Master.csv')
# messy_flight_df.to_csv(export_path,index=False)

In [10]:
# USE IF NECESSARY
# Import combined dataframe

# messy_import_path = Path('Resources/Flights_2022_Master.csv')
# messy_flight_df = pd.read_csv(messy_import_path,low_memory=False)

### Create, Format, and Export Table DataFrames

#### Airlines Table

In [11]:
# import airlines csv from resources

airlines_df_path = Path('Resources/Airlines.csv')
airlines_cleanup_df = pd.read_csv(airlines_df_path)

In [12]:
# Rename columns

airlines_cleanup_df.rename(columns={
    'Code':'Airline_Code',
    'Description':'Airline_Name'},inplace=True)

In [13]:
# Confirm DB changes

airlines_cleanup_df

Unnamed: 0,Airline_Code,Airline_Name
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,05Q,"Comlux Aviation, AG"
3,06Q,Master Top Linhas Aereas Ltd.
4,07Q,Flair Airlines Ltd.
...,...,...
1566,ZW,Air Wisconsin Airlines Corp
1567,ZX,Air Georgian
1568,ZX (1),Airbc Ltd.
1569,ZY,Atlantic Gulf Airlines


In [14]:
# Export cleaned Dataframe to csv to use as Table

cleaned_airlines_path = Path('Resources/DB_Airline_Table.csv')
airlines_cleanup_df.to_csv(cleaned_airlines_path)

#### Flights Table

In [15]:
# create a list of columns from messy_flight_df to make Flight DF

flights_columns_list = [
    'Flight_Number_Operating_Airline',
    'Operating_Airline ',
    'Origin',
    'Dest',
    'FlightDate',
    'CRSDepTime', 
    'CRSArrTime',
    'Delayed',
    'Cancelled',
    'Target'
    ]

In [16]:
# Create a Flights DF

Flights_df = messy_flight_df[flights_columns_list]

In [17]:
# Use for checkpoint creation

# # create a path for the Flight Dataframe 
# flight_df_path = Path('Resources/Flights_Table.csv')
# # Export the dataframe to a csv
# Flights_df.to_csv(flight_df_path)

##### DB Prep

In [18]:
# Either imports if starting from new session or makes a copy of DF for cleaning
try:
    if Flights_df is not None:
        flight_cleanup_df = Flights_df
except NameError:
    # will only run if checkpoint is created
    flight_df_path = Path('Resources/Flights_Table.csv')
    flight_cleanup_df = pd.read_csv(flight_df_path,index_col=0)

In [19]:
# Rename Columns
flight_cleanup_df.rename(columns={
    'Flight_Number_Operating_Airline':'Airline_Flight_Number',
    'Operating_Airline ':'Operating_Airline',
    'Origin':'Origin_Airport',
    'Dest':'Destination_Airport',
    'FlightDate':'Flight_Date',
    'CRSDepTime':'Scheduled_Departure_Time',
    'CRSArrTime':'Scheduled_Arrival_Time'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [20]:
# Confirm changes for DB

flight_cleanup_df

Unnamed: 0,Airline_Flight_Number,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Departure_Time,Arrival_Time,Delayed,Cancelled,Target
0,1581,DL,FLL,LGA,2022-01-06,,,0,1.0,2
1,1582,DL,ATL,FLL,2022-01-06,16:27,18:20,0,0.0,0
2,1582,DL,FLL,ATL,2022-01-06,19:29,21:15,0,0.0,0
3,1583,DL,FLL,RDU,2022-01-06,10:19,12:12,0,0.0,0
4,1584,DL,ATL,JAN,2022-01-06,11:13,11:31,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...
4078313,3406,YX,ALB,EWR,2022-07-01,15:16,16:30,1,0.0,1
4078314,3405,YX,AVL,EWR,2022-07-01,12:36,14:28,0,0.0,0
4078315,3403,YX,ALB,EWR,2022-07-01,11:53,13:33,1,0.0,1
4078316,3401,YX,BNA,EWR,2022-07-01,14:58,18:43,1,0.0,1


In [21]:
# Export cleaned Dataframe to csv to use as Table

cleaned_flight_path = Path('Resources/DB_Flights_Table.csv')
flight_cleanup_df.to_csv(cleaned_flight_path)

#### Delayed Flights Table

In [22]:
# create a list of columns from messy_flight_df to make Delayed Flight DF


delayed_columns_list = [
    'Flight_Number_Operating_Airline',
    'FlightDate',
    'CRSArrTime',
    'ArrDelayMinutes',
    'CRSDepTime',
    'DepDelayMinutes',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay'
]

In [23]:
# Create the Delayed Flights DF

delayed = messy_flight_df['Delayed']==1
Delayed_Flights_df = messy_flight_df.loc[delayed][delayed_columns_list]

In [24]:
# Use for checkpoint creation

# # Create Delayed Flights DF path
# delayed_df_path = Path('Resources/Delayed_Flights_Table.csv')
# # Export the DF to a csv
# Delayed_Flights_df.to_csv(delayed_df_path)

##### DB Prep

In [25]:
# Either imports if starting from new session or makes a copy of DF for cleaning
try:
    if Delayed_Flights_df is not None:
        delay_cleanup_df = Delayed_Flights_df
except NameError:
    # will only run if checkpoint is created
    delayed_df_path = Path('Resources/Delayed_Flights_Table.csv')
    delay_cleanup_df = pd.read_csv(delayed_df_path,index_col=0)

In [26]:
# Create a list of columns that need converting to a datetime format

time_interval_columns = [
    'ArrDelayMinutes',
    'DepDelayMinutes',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay'
    ]

In [27]:
# Format those columns as a time delta

for col in time_interval_columns:
    delay_cleanup_df[col]=pd.to_timedelta(delay_cleanup_df[col],unit='m')
    # Some times convert to NaT - fill values to 0 days
    delay_cleanup_df[col] = delay_cleanup_df[col].fillna(pd.Timedelta(seconds=0))

In [28]:
# Rename Columns
delay_cleanup_df.rename(columns={
    'Flight_Number_Operating_Airline':'Airline_Flight_Number',
    'FlightDate':'Flight_Date',
    'CRSArrTime':'Scheduled_Arrival_Time',
    'ArrDelayMinutes':'Arrival_Delayed_Minutes',
    'CRSDepTime':'Scheduled_Departure_Time',
    'DepDelayMinutes':'Departure_Delayed_Minutes',
    'CarrierDelay':'Carrier_Delay_Minutes',
    'WeatherDelay':'Weather_Delay_Minutes',
    'NASDelay':'NAS_Delay_Minutes',
    'SecurityDelay':'Security_Delay_Minutes',
    'LateAircraftDelay':'Late_Aircraft_Delay_Minutes'},
    inplace=True)

In [29]:
# Confirm DB changes

delay_cleanup_df

Unnamed: 0,Airline_Flight_Number,Flight_Date,Arrival_Time,Arrival_Delayed_Minutes,Departure_Time,Departure_Delayed_Minutes,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes
7,1586,2022-01-06,14:35,0 days 00:02:00,10:18,0 days 00:18:00,0 days 00:00:00,0 days,0 days 00:00:00,0 days,0 days 00:00:00
8,1587,2022-01-06,15:31,0 days 00:00:00,14:22,0 days 00:08:00,0 days 00:00:00,0 days,0 days 00:00:00,0 days,0 days 00:00:00
17,1593,2022-01-06,10:22,0 days 00:08:00,06:56,0 days 00:00:00,0 days 00:00:00,0 days,0 days 00:00:00,0 days,0 days 00:00:00
19,1595,2022-01-06,12:14,0 days 00:40:00,10:46,0 days 00:51:00,0 days 00:02:00,0 days,0 days 00:00:00,0 days,0 days 00:38:00
21,1597,2022-01-06,22:30,0 days 00:07:00,20:58,0 days 00:03:00,0 days 00:00:00,0 days,0 days 00:00:00,0 days,0 days 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...
4078312,3408,2022-07-01,18:18,0 days 01:16:00,17:25,0 days 00:55:00,0 days 00:00:00,0 days,0 days 01:16:00,0 days,0 days 00:00:00
4078313,3406,2022-07-01,16:30,0 days 00:18:00,15:16,0 days 00:16:00,0 days 00:00:00,0 days,0 days 00:18:00,0 days,0 days 00:00:00
4078315,3403,2022-07-01,13:33,0 days 00:20:00,11:53,0 days 00:00:00,0 days 00:00:00,0 days,0 days 00:20:00,0 days,0 days 00:00:00
4078316,3401,2022-07-01,18:43,0 days 00:43:00,14:58,0 days 00:28:00,0 days 00:00:00,0 days,0 days 00:43:00,0 days,0 days 00:00:00


In [30]:
# Export cleaned Dataframe to csv to use as Table

cleaned_delay_path = Path('Resources/DB_Delayed_Flights_Table.csv')
delay_cleanup_df.to_csv(cleaned_delay_path)

#### Airport Codes Table

In [31]:
# create a list of columns from messy_flight_df to make Airport Codes DF

airport_codes_list = [
    'Origin',
    'OriginCityName',
]

In [32]:
# Create the Airport Codes DF

Airport_Codes_df = messy_flight_df[airport_codes_list]

In [33]:
# Use for checkpoint creation

# # Create a path for the Airport Codes DF
# airport_codes_df_path = Path('Resources/Airport_Codes_Table.csv')
# # Export the dataframe to a csv
# Airport_Codes_df.to_csv(airport_codes_df_path)

##### DB Prep

In [34]:
# Either imports if starting from new session or makes a copy of DF for cleaning
try:
    if Airport_Codes_df is not None:
        airport_codes_cleanup_df = Airport_Codes_df
except NameError:
    # will only run if checkpoint is created
    airport_codes_df_path = Path('Resources/Airport_Codes_Table.csv')
    airport_codes_cleanup_df = pd.read_csv(airport_codes_df_path,index_col=0)

In [35]:
# get only the Unique Origin Airport Codes
airport_codes_cleanup_df.drop_duplicates(subset=['Origin'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [36]:
# Split City and State to create columns for Table

airport_codes_cleanup_df[['Airport_City','Airport_State']] = airport_codes_cleanup_df['OriginCityName'].str.split(', ',expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [37]:
# Remove unnecesary Column

airport_codes_cleanup_df.drop(columns=['OriginCityName'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [38]:
# Clean the index

airport_codes_cleanup_df.reset_index(drop=True,inplace=True)

In [39]:
# Rename Columns

airport_codes_cleanup_df.rename(columns={'Origin':'Airport_Code'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [40]:
# Confirm DB changes

airport_codes_cleanup_df

Unnamed: 0,Airport_Code,Airport_City,Airport_State
0,FLL,Fort Lauderdale,FL
1,ATL,Atlanta,GA
2,JAN,Jackson/Vicksburg,MS
3,RIC,Richmond,VA
4,MSP,Minneapolis,MN
...,...,...,...
370,ACK,Nantucket,MA
371,GST,Gustavus,AK
372,HYA,Hyannis,MA
373,MVY,Martha's Vineyard,MA


In [41]:
# Export cleaned Dataframe to csv to use as Table

cleaned_airport_codes_path = Path('Resources/DB_Airport_Codes.csv')
airport_codes_cleanup_df.to_csv(cleaned_airport_codes_path)

#### Cancelled Flights Table

In [42]:
# create a list of columns from messy_flight_df to make Cancelled Flights DF

cancelled_flights_list = [
    'Flight_Number_Operating_Airline',
    'Operating_Airline ',
    'Origin',
    'Dest',
    'FlightDate',
    'CancellationCode'
]

In [43]:
# Create the Cancelled Flights DF

cancelled = messy_flight_df['Cancelled']==1

Cancelled_Flights_df = messy_flight_df.loc[cancelled][cancelled_flights_list]

In [44]:
# Use me to create checkpoint

# # Create a path for the Cancelled Flights DF
# cancelled_df_path = Path('Resources/Cancelled_Flights_Table.csv')
# # Export DF to csv
# Cancelled_Flights_df.to_csv(cancelled_df_path)

##### DB Prep

In [45]:
# Either imports if starting from new session or makes a copy of DF for cleaning

try:
    if Cancelled_Flights_df is not None:
        cancelled_cleanup_df = Cancelled_Flights_df
except NameError:
    # will only run if checkpoint is created 
    cancelled_df_path = Path('Resources/Cancelled_Flights_Table.csv')
    cancelled_cleanup_df = pd.read_csv(cancelled_df_path,index_col=0)

In [46]:
# Rename Columns

cancelled_cleanup_df.rename(columns={'Flight_Number_Operating_Airline':'Airline_Flight_Number',
                                     'Operating_Airline ':'Operating_Airline',
                                     'Origin':'Origin_Airport',
                                     'Dest':'Destination_Airport',
                                     'CancellationCode':'Cancellation_Code',
                                     'FlightDate':'Flight_Date'},
                            inplace=True)

In [47]:
# Confirm DB changes

cancelled_cleanup_df

Unnamed: 0,Airline_Flight_Number,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Cancellation_Code
0,1581,DL,FLL,LGA,2022-01-06,A
30,1605,DL,DTW,BNA,2022-01-06,B
75,1650,DL,CVG,BOS,2022-01-06,B
148,1722,DL,MIA,LGA,2022-01-06,B
157,1729,DL,ATL,BNA,2022-01-06,B
...,...,...,...,...,...,...
4078270,3463,YX,CVG,EWR,2022-07-01,C
4078285,3443,YX,EWR,CMH,2022-07-01,C
4078290,3434,YX,BUF,EWR,2022-07-01,C
4078301,3421,YX,BGR,EWR,2022-07-01,C


In [48]:
# Export cleaned Dataframe to csv to use as Table

cancelled_flight_path = Path('Resources/DB_Cancelled_Flights_Table.csv')
cancelled_cleanup_df.to_csv(cancelled_flight_path)