### Import dependencies

In [1]:
import pandas as pd
from path import Path

### Import Airline data

In [2]:
raw_2022_flights = pd.DataFrame()

# loop through csv files 
for x in range(1,8):
    file = Path(f'Resources/Flights_2022_{x}.csv')
    import_df = pd.read_csv(file,low_memory=False)
    raw_2022_flights = pd.concat([raw_2022_flights,import_df],ignore_index=True)

#### Export combined data into new csv

In [3]:
export_path = Path('Resources/Flights_2022_Master.csv')
raw_2022_flights.to_csv(export_path,index=False)

#### Re-Import master csv

In [4]:
messy_import_path = Path('Resources/Flights_2022_Master.csv')
messy_flight_df = pd.read_csv(messy_import_path,low_memory=False)

### Separate master dataframe into each table

#### Airlines DF

In [46]:
airlines_df_path = Path('Resources/Airlines.csv')
Airlines_df = pd.read_csv(airlines_df_path)

#### Flights DF

In [47]:
flights_columns_list = [
    'Flight_Number_Operating_Airline',
    'Operating_Airline ',
    'Origin',
    'Dest',
    'FlightDate',
    'DepTime', # change from military to datetime
    'ArrTime', # change from military to datetime 
    'ArrDel15',
    'Cancelled'
    ]

In [51]:
flight_df_path = Path('Resources/Flights_Table.csv')
Flights_df = messy_flight_df[flights_columns_list]
Flights_df.to_csv(flight_df_path)

#### Delayed Flights DF

In [48]:
delayed_columns_list = [
    'Flight_Number_Operating_Airline',
    'ArrTime',
    'ArrDel15',
    'DepTime', # change from military to datetime
    'DepDel15',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay'
]

In [52]:
delayed_df_path = Path('Resources/Delayed_Flights_Table.csv')
Delayed_Flights_df = messy_flight_df[delayed_columns_list]
Delayed_Flights_df.to_csv(delayed_df_path)

#### Airport Codes DF

In [49]:
airport_codes_list = [
    'Origin',
    'OriginCityName',
    'OriginState'
]

In [53]:
airport_codes_df_path = Path('Resources/Airport_Codes_Table.csv')
Airport_Codes_df = messy_flight_df[airport_codes_list]
Airport_Codes_df.to_csv(airport_codes_df_path)

#### Cancelled Flights DF

In [50]:
cancelled_flights_list = [
    'Flight_Number_Operating_Airline',
    'CancellationCode'
]

In [54]:
cancelled_df_path = Path('Resources/Cancelled_Flights_Table.csv')
Cancelled_Flights_df = messy_flight_df.loc[messy_flight_df['Cancelled']==1][cancelled_flights_list]
Cancelled_Flights_df.to_csv(cancelled_df_path)

## Format Data

#### Flight Table Cleaning

In [295]:
# Import Table Data
flight_cleanup_df = pd.read_csv(flight_df_path,index_col=0)


In [297]:
flight_cleanup_df

Unnamed: 0,Flight_Number_Operating_Airline,Operating_Airline,Origin,Dest,FlightDate,DepTime,ArrTime,ArrDel15,Cancelled
1,1582,DL,ATL,FLL,2022-01-06,1627.0,1820.0,0.0,0.0
2,1582,DL,FLL,ATL,2022-01-06,1929.0,2115.0,0.0,0.0
3,1583,DL,FLL,RDU,2022-01-06,1019.0,1212.0,0.0,0.0
4,1584,DL,ATL,JAN,2022-01-06,1113.0,1131.0,0.0,0.0
5,1584,DL,JAN,ATL,2022-01-06,1230.0,1435.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
4078313,3406,YX,ALB,EWR,2022-07-01,1516.0,1630.0,1.0,0.0
4078314,3405,YX,AVL,EWR,2022-07-01,1236.0,1428.0,0.0,0.0
4078315,3403,YX,ALB,EWR,2022-07-01,1153.0,1333.0,1.0,0.0
4078316,3401,YX,BNA,EWR,2022-07-01,1458.0,1843.0,1.0,0.0


In [296]:
#Drop rows with NaN values in Arrival time from the  Flights Table
flight_cleanup_df.dropna(subset=['ArrTime'],inplace=True)


In [298]:
# Format the ArrTime Column

# converts the string into a military time format
flight_cleanup_df['ArrTime'] = flight_cleanup_df['ArrTime'].apply(lambda x: '{:02d}:{:02d}'.format(int(x) // 100, int(x) % 100))
# Convert string to correct midnight format
flight_cleanup_df['ArrTime']=flight_cleanup_df['ArrTime'].apply(lambda x: '00:00' if x == '24:00' else x)
# Converts values into datetime object
flight_cleanup_df['ArrTime'] =flight_cleanup_df['ArrTime'].apply(lambda x: pd.to_datetime(x,format='%H:%M').strftime('%H:%M'))

In [300]:
# Format the DepTime Column

# converts the string into a military time format
flight_cleanup_df['DepTime'] = flight_cleanup_df['DepTime'].apply(lambda x: '{:02d}:{:02d}'.format(int(x) // 100, int(x) % 100))
# Convert string to correct midnight format
flight_cleanup_df['DepTime']=flight_cleanup_df['DepTime'].apply(lambda x: '00:00' if x == '24:00' else x)
# Converts values into datetime object
flight_cleanup_df['DepTime'] =flight_cleanup_df['DepTime'].apply(lambda x: pd.to_datetime(x,format='%H:%M').strftime('%H:%M'))

In [382]:
flight_cleanup_df.rename(columns={
    'Flight_Number_Operating_Airline':'Airline_Flight_Number',
    'Operating_Airline ':'Operating_Airline',
    'Origin':'Origin_Airport',
    'Dest':'Destination_Airport',
    'FlightDate':'Flight_Date',
    'DepTime':'Departure_Time',
    'ArrTime':'Arrival_Time',
    'ArrDel15':'Arrival_Delayed',
    'Cancelled':'Cancelled'},inplace=True)

In [383]:
flight_cleanup_df

Unnamed: 0,Airline_Flight_Number,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Departure_Time,Arrival_Time,Arrival_Delayed,Cancelled
1,1582,DL,ATL,FLL,2022-01-06,16:27,18:20,0.0,0.0
2,1582,DL,FLL,ATL,2022-01-06,19:29,21:15,0.0,0.0
3,1583,DL,FLL,RDU,2022-01-06,10:19,12:12,0.0,0.0
4,1584,DL,ATL,JAN,2022-01-06,11:13,11:31,0.0,0.0
5,1584,DL,JAN,ATL,2022-01-06,12:30,14:35,0.0,0.0
...,...,...,...,...,...,...,...,...,...
4078313,3406,YX,ALB,EWR,2022-07-01,15:16,16:30,1.0,0.0
4078314,3405,YX,AVL,EWR,2022-07-01,12:36,14:28,0.0,0.0
4078315,3403,YX,ALB,EWR,2022-07-01,11:53,13:33,1.0,0.0
4078316,3401,YX,BNA,EWR,2022-07-01,14:58,18:43,1.0,0.0


#### Delayed Flight Table Cleaning

In [318]:
# Import Table Data
delay_cleanup_df = pd.read_csv(delayed_df_path,index_col=0)


In [312]:
# Filter for delayed flights
delay_cleanup_df = delay_cleanup_df.loc[delay_cleanup_df['ArrDel15']>0]
delay_cleanup_df = delay_cleanup_df.loc[delay_cleanup_df['DepDel15']>0]

In [None]:
# Format the ArrTime Column

# converts the string into a military time format
delay_cleanup_df['ArrTime'] = delay_cleanup_df['ArrTime'].apply(lambda x: '{:02d}:{:02d}'.format(int(x) // 100, int(x) % 100))
# Convert string to correct midnight format
delay_cleanup_df['ArrTime']=delay_cleanup_df['ArrTime'].apply(lambda x: '00:00' if x == '24:00' else x)
# Converts values into datetime object
delay_cleanup_df['ArrTime'] =delay_cleanup_df['ArrTime'].apply(lambda x: pd.to_datetime(x,format='%H:%M').strftime('%H:%M'))

In [None]:
# Format the DepTime Column

# converts the string into a military time format
delay_cleanup_df['DepTime'] = delay_cleanup_df['DepTime'].apply(lambda x: '{:02d}:{:02d}'.format(int(x) // 100, int(x) % 100))
# Convert string to correct midnight format
delay_cleanup_df['DepTime']=delay_cleanup_df['DepTime'].apply(lambda x: '00:00' if x == '24:00' else x)
# Converts values into datetime object
delay_cleanup_df['DepTime'] =delay_cleanup_df['DepTime'].apply(lambda x: pd.to_datetime(x,format='%H:%M').strftime('%H:%M'))

In [317]:
delay_cleanup_df.drop(columns=['DepDel15','ArrDel15'])

Unnamed: 0,Flight_Number_Operating_Airline,ArrTime,ArrDel15,DepTime,DepDel15,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
19,1595,1214.0,1.0,1046.0,1.0,2.0,0.0,0.0,0.0,38.0
22,1598,1858.0,1.0,1530.0,1.0,91.0,0.0,12.0,0.0,0.0
60,1637,2304.0,1.0,2141.0,1.0,0.0,0.0,0.0,0.0,44.0
61,1637,2104.0,1.0,1944.0,1.0,0.0,0.0,98.0,0.0,0.0
92,1665,1618.0,1.0,1355.0,1.0,61.0,0.0,55.0,0.0,33.0
...,...,...,...,...,...,...,...,...,...,...
4078302,3420,1.0,1.0,2134.0,1.0,77.0,0.0,0.0,0.0,0.0
4078312,3408,1818.0,1.0,1725.0,1.0,0.0,0.0,76.0,0.0,0.0
4078313,3406,1630.0,1.0,1516.0,1.0,0.0,0.0,18.0,0.0,0.0
4078316,3401,1843.0,1.0,1458.0,1.0,0.0,0.0,43.0,0.0,0.0


In [386]:
# Rename Columns
delay_cleanup_df.rename(columns={
    'Flight_Number_Operating_Airline':'Airline_Flight_Number',
    'ArrTime':'Arrival_Time',
    'ArrDel15':'Arrival_Delayed',
    'DepTime':'Departure_Time',
    'DepDel15':'Departure_Delayed',
    'CarrierDelay':'Carrier_Delay',
    'WeatherDelay':'Weather_Delay',
    'NASDelay':'NAS_Delay',
    'SecurityDelay':'Security_Delay',
    'LateAircraftDelay':'Late_Aircraft_Delay'},
    inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [387]:
delay_cleanup_df

Unnamed: 0,Airline_Flight_Number,Arrival_Time,Arrival_Delayed,Departure_Time,Departure_Delayed,Carrier_Delay,Weather_Delay,NAS_Delay,Security_Delay,Late_Aircraft_Delay
7,1586,1435.0,0.0,1018.0,1.0,,,,,
19,1595,1214.0,1.0,1046.0,1.0,2.0,0.0,0.0,0.0,38.0
22,1598,1858.0,1.0,1530.0,1.0,91.0,0.0,12.0,0.0,0.0
27,1602,610.0,0.0,2324.0,1.0,,,,,
45,1620,1144.0,0.0,1024.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
4078302,3420,1.0,1.0,2134.0,1.0,77.0,0.0,0.0,0.0,0.0
4078312,3408,1818.0,1.0,1725.0,1.0,0.0,0.0,76.0,0.0,0.0
4078313,3406,1630.0,1.0,1516.0,1.0,0.0,0.0,18.0,0.0,0.0
4078316,3401,1843.0,1.0,1458.0,1.0,0.0,0.0,43.0,0.0,0.0


#### Airport Codes Table Cleaning

In [368]:
# Import Table Data
airport_codes_cleanup_df = pd.read_csv(airport_codes_df_path,index_col=0)


In [370]:
# get only the Unique Origin Airport Codes
airport_codes_cleanup_df.drop_duplicates(subset=['Origin'],inplace=True)

In [371]:
# Split to create columns for Table
airport_codes_cleanup_df[['Airport_City','Airport_State']] = airport_codes_cleanup_df['OriginCityName'].str.split(', ',expand=True)

In [373]:
# Remove unnecesary Columns
airport_codes_cleanup_df.drop(columns=['OriginState','OriginCityName'],inplace=True)

In [377]:
# Clean the index
airport_codes_cleanup_df.reset_index(drop=True,inplace=True)

In [379]:
# Rename Columns
airport_codes_cleanup_df.rename(columns={'Origin':'Airport_Code'},inplace=True)

In [380]:
airport_codes_cleanup_df

Unnamed: 0,Airport_Code,Airport_City,Airport_State
0,FLL,Fort Lauderdale,FL
1,ATL,Atlanta,GA
2,JAN,Jackson/Vicksburg,MS
3,RIC,Richmond,VA
4,MSP,Minneapolis,MN
...,...,...,...
370,ACK,Nantucket,MA
371,GST,Gustavus,AK
372,HYA,Hyannis,MA
373,MVY,Martha's Vineyard,MA


#### Airline Table Cleaning

In [324]:
# Import Table Data
airlines_cleanup_df = pd.read_csv(airlines_df_path,index_col=0)


In [325]:
airlines_cleanup_df

Unnamed: 0_level_0,Description
Code,Unnamed: 1_level_1
02Q,Titan Airways
04Q,Tradewind Aviation
05Q,"Comlux Aviation, AG"
06Q,Master Top Linhas Aereas Ltd.
07Q,Flair Airlines Ltd.
...,...
ZW,Air Wisconsin Airlines Corp
ZX,Air Georgian
ZX (1),Airbc Ltd.
ZY,Atlantic Gulf Airlines


#### Cancelled Flights Table Cleaning

In [326]:
# Import Table Data
cancelled_cleanup_df = pd.read_csv(cancelled_df_path,index_col=0)

In [388]:
cancelled_cleanup_df.rename(columns={'Flight_Number_Operating_Airline':'Airline_Flight_Number','CancellationCode':'Cancellation_Code'},inplace=True)

In [389]:
cancelled_cleanup_df

Unnamed: 0,Airline_Flight_Number,Cancellation_Code
0,1581,A
30,1605,B
75,1650,B
148,1722,B
157,1729,B
...,...,...
4078270,3463,C
4078285,3443,C
4078290,3434,C
4078301,3421,C
