### Import dependencies

In [13]:
import pandas as pd
from pathlib import Path

### Import Airline data

In [14]:
# Read the seven numbered extracts (Flights_2022_1.csv ... Flights_2022_7.csv)
# and stack them into one frame with a fresh 0..n-1 index.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration.
flight_frames = [
    pd.read_csv(Path(f'Resources/Flights_2022_{x}.csv'), low_memory=False, engine='c')
    for x in range(1, 8)
]
messy_flight_df = pd.concat(flight_frames, ignore_index=True)

#### Export combined data into new csv

In [15]:
# USE IF NECESSARY

# export_path = Path('Resources/Flights_2022_Master.csv')
# messy_flight_df.to_csv(export_path,index=False)

#### Re-Import master csv

In [16]:
# USE IF NECESSARY

# messy_import_path = Path('Resources/Flights_2022_Master.csv')
# messy_flight_df = pd.read_csv(messy_import_path,low_memory=False)

### Separate master dataframe into each table

#### Airlines DF

In [None]:
# Path to the airline lookup CSV; the same path is read again in the
# "Airline Table Cleaning" section.
airlines_df_path = Path('Resources/Airlines.csv')
# Load the lookup file unchanged.
Airlines_df = pd.read_csv(airlines_df_path)

#### Flights DF

In [None]:
# Columns copied from the combined frame into the Flights table.
flights_columns_list = [
    'Flight_Number_Operating_Airline',
    'Operating_Airline ', # the trailing space is part of the key as written; renamed to 'Operating_Airline' during cleanup
    'Origin',
    'Dest',
    'FlightDate', # parsed with pd.to_datetime during cleanup
    'DepTime', # HHMM clock value; reformatted to an 'HH:MM' string by converter() during cleanup
    'ArrTime', # HHMM clock value; reformatted to an 'HH:MM' string by converter() during cleanup
    'ArrDel15',
    'Cancelled'
    ]

In [None]:
# Slice the Flights table columns out of the combined frame, then write the
# slice (row index included) to the intermediate CSV.
Flights_df = messy_flight_df.loc[:, flights_columns_list]
flight_df_path = Path('Resources/Flights_Table.csv')
Flights_df.to_csv(flight_df_path)

#### Delayed Flights DF

In [35]:
# Columns copied from the combined frame into the Delayed Flights table.
# During cleanup the two clock columns go through converter() and the
# delay-minute columns are converted with pd.to_timedelta(unit='m').
delayed_columns_list = [
    'Flight_Number_Operating_Airline',
    'ArrTime', # HHMM clock value; reformatted to an 'HH:MM' string during cleanup
    'ArrDelayMinutes',
    'DepTime', # HHMM clock value; reformatted to an 'HH:MM' string during cleanup
    'DepDelayMinutes',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay'
]

In [36]:
# Slice the Delayed Flights columns out of the combined frame, then write the
# slice (row index included) to the intermediate CSV.
Delayed_Flights_df = messy_flight_df.loc[:, delayed_columns_list]
delayed_df_path = Path('Resources/Delayed_Flights_Table.csv')
Delayed_Flights_df.to_csv(delayed_df_path)

#### Airport Codes DF

In [None]:
# Columns copied from the combined frame into the Airport Codes table.
airport_codes_list = [
    'Origin', # de-duplicated and renamed to 'Airport_Code' during cleanup
    'OriginCityName', # split on ', ' into Airport_City / Airport_State during cleanup
    'OriginState' # dropped during cleanup
]

In [None]:
# Slice the airport columns out of the combined frame, then write the slice
# (row index included) to the intermediate CSV.
Airport_Codes_df = messy_flight_df.loc[:, airport_codes_list]
airport_codes_df_path = Path('Resources/Airport_Codes_Table.csv')
Airport_Codes_df.to_csv(airport_codes_df_path)

#### Cancelled Flights DF

In [None]:
# Columns kept for the Cancelled Flights table (applied to rows where Cancelled == 1).
cancelled_flights_list = [
    'Flight_Number_Operating_Airline',
    'CancellationCode'
]

In [None]:
# Keep only cancelled flights (Cancelled == 1) and only the columns the
# Cancelled Flights table needs. Rows and columns are picked in one .loc
# call instead of chained indexing, so no intermediate frame holding every
# column of the cancelled rows is materialised first.
cancelled_df_path = Path('Resources/Cancelled_Flights_Table.csv')
is_cancelled = messy_flight_df['Cancelled'] == 1
Cancelled_Flights_df = messy_flight_df.loc[is_cancelled, cancelled_flights_list]
Cancelled_Flights_df.to_csv(cancelled_df_path)

### Format Data

In [None]:

def converter (series):
    """Reformat clock times stored as HHMM numbers into 'HH:MM' strings.

    Parameters
    ----------
    series : pandas.Series
        Values accepted by ``int()`` (e.g. 5, 930.0, 2400); nulls are allowed.

    Returns
    -------
    pandas.Series
        Same index as ``series``. Every non-null value becomes a zero-padded
        'HH:MM' string, with 2400 normalised to '00:00'. Null values are
        passed through unchanged.

    Raises
    ------
    ValueError
        If a non-null value is not a valid clock time (hour outside 0-23 or
        minute outside 0-59, other than exactly 2400).
    """
    def _to_clock(value):
        # Leave missing times untouched.
        if pd.isnull(value):
            return value
        hours, minutes = divmod(int(value), 100)
        # 2400 is accepted as midnight and written in 00:00 form.
        if hours == 24 and minutes == 0:
            return '00:00'
        # Explicit range check replaces the per-element
        # pd.to_datetime(...).strftime(...) round trip, which only validated
        # the string and was very slow on large frames.
        if not (0 <= hours <= 23 and 0 <= minutes <= 59):
            raise ValueError(
                'time value {!r} is not a valid HHMM clock time'.format(value)
            )
        return '{:02d}:{:02d}'.format(hours, minutes)

    # Single pass over the data instead of three chained applies.
    return series.apply(_to_clock)

#### Flight Table Cleaning

In [None]:
# Reload the Flights table from the intermediate CSV; index_col=0 uses the
# file's first column (the row index written by to_csv) as the index again.
flight_cleanup_df = pd.read_csv(flight_df_path,index_col=0)


In [None]:

# Reformat both clock-time columns to 'HH:MM' via converter(), arrival first.
for time_column in ('ArrTime', 'DepTime'):
    flight_cleanup_df[time_column] = converter(flight_cleanup_df[time_column])

In [None]:
# Convert FlightDate to datetime in one vectorised call rather than calling
# pd.to_datetime once per row through apply; missing dates come back as NaT.
flight_cleanup_df['FlightDate'] = pd.to_datetime(flight_cleanup_df['FlightDate'])

In [None]:
# Map the source column names onto the names used in the exported Flights table.
flights_column_names = {
    'Flight_Number_Operating_Airline':'Airline_Flight_Number',
    'Operating_Airline ':'Operating_Airline',
    'Origin':'Origin_Airport',
    'Dest':'Destination_Airport',
    'FlightDate':'Flight_Date',
    'DepTime':'Departure_Time',
    'ArrTime':'Arrival_Time',
    'ArrDel15':'Arrival_Delayed',
    'Cancelled':'Cancelled',
}
flight_cleanup_df = flight_cleanup_df.rename(columns=flights_column_names)

In [None]:
flight_cleanup_df

In [None]:
# Reference only (not executable): pd.to_timedelta(arg, unit=None, errors='raise')
# converts numbers to Timedelta values. It is applied with unit='m' to the
# delay-minute columns in the "Delayed Flight Table Cleaning" section.
# The signature used to sit here as live code and raised NameError, because
# the module is imported as `pd` and `arg` is not defined.

In [None]:
# Export cleaned Dataframe to csv to use as Table
# (to_csv is called with its defaults, so the row index is written as the first column)

cleaned_flight_path = Path('Resources/DB_Flights_Table.csv')
flight_cleanup_df.to_csv(cleaned_flight_path)

#### Delayed Flight Table Cleaning

In [41]:
# Reload the Delayed Flights table from the intermediate CSV; index_col=0 uses
# the file's first column (the row index written by to_csv) as the index again.
delay_cleanup_df = pd.read_csv(delayed_df_path,index_col=0)


In [42]:
# Select only flights that have been delayed (departure or arrival delay > 0).
# .copy() detaches the result from the frame it was filtered from, so the
# column assignments in the following cells write to an independent frame
# and do not trigger SettingWithCopyWarning.
was_delayed = (delay_cleanup_df['DepDelayMinutes']>0) | (delay_cleanup_df['ArrDelayMinutes']>0)
delay_cleanup_df = delay_cleanup_df.loc[was_delayed].copy()

In [None]:
# Reformat both clock-time columns to 'HH:MM' via converter(), arrival first.
for time_column in ('ArrTime', 'DepTime'):
    delay_cleanup_df[time_column] = converter(delay_cleanup_df[time_column])


In [None]:
# Map the source column names onto the names used in the exported Delayed Flights table.
delay_column_names = {
    'Flight_Number_Operating_Airline':'Airline_Flight_Number',
    'ArrTime':'Arrival_Time',
    'ArrDelayMinutes':'Arrival_Delayed_Minutes',
    'DepTime':'Departure_Time',
    'DepDelayMinutes':'Departure_Delayed_Minutes',
    'CarrierDelay':'Carrier_Delay_Minutes',
    'WeatherDelay':'Weather_Delay_Minutes',
    'NASDelay':'NAS_Delay_Minutes',
    'SecurityDelay':'Security_Delay_Minutes',
    'LateAircraftDelay':'Late_Aircraft_Delay_Minutes',
}
delay_cleanup_df = delay_cleanup_df.rename(columns=delay_column_names)

In [None]:
delay_cleanup_df

In [None]:
delay_cleanup_df.columns

In [None]:
# Convert the delay columns from minute counts to Timedelta values.
# The order matches the sequence in which the columns were converted one by one.
minute_columns = [
    'Carrier_Delay_Minutes',
    'Weather_Delay_Minutes',
    'Departure_Delayed_Minutes',
    'Arrival_Delayed_Minutes',
    'NAS_Delay_Minutes',
    'Security_Delay_Minutes',
    'Late_Aircraft_Delay_Minutes',
]

for minute_column in minute_columns:
    delay_cleanup_df[minute_column] = pd.to_timedelta(delay_cleanup_df[minute_column],unit='m')

delay_cleanup_df

In [None]:
# Export cleaned Dataframe to csv to use as Table
# (to_csv is called with its defaults, so the row index is written as the first column)
# NOTE(review): the delay columns hold Timedelta values at this point, so they
# are presumably written in pandas' timedelta text form rather than as minute
# counts — confirm that is what the database load expects.

cleaned_delay_path = Path('Resources/DB_Delayed_Flights_Table.csv')
delay_cleanup_df.to_csv(cleaned_delay_path)

#### Airport Codes Table Cleaning

In [368]:
# Reload the Airport Codes table from the intermediate CSV; index_col=0 uses
# the file's first column (the row index written by to_csv) as the index again.
airport_codes_cleanup_df = pd.read_csv(airport_codes_df_path,index_col=0)


In [370]:
# Get only the unique Origin airport codes: one row per code, modified in place
# (drop_duplicates' default keep setting retains the first row seen for each code).
airport_codes_cleanup_df.drop_duplicates(subset=['Origin'],inplace=True)

In [371]:
# Split 'City, ST' into the two table columns. Splitting once from the right
# yields at most two parts per value, so a city name that itself contains
# ', ' cannot produce a third column and break the two-column assignment.
airport_codes_cleanup_df[['Airport_City','Airport_State']] = airport_codes_cleanup_df['OriginCityName'].str.rsplit(', ',n=1,expand=True)

In [373]:
# Remove the source columns that the exported table does not carry
airport_codes_cleanup_df = airport_codes_cleanup_df.drop(columns=['OriginState','OriginCityName'])

In [377]:
# Renumber the rows 0..n-1, discarding the old index labels
airport_codes_cleanup_df = airport_codes_cleanup_df.reset_index(drop=True)

In [379]:
# Give the airport code column its exported name
airport_column_names = {'Origin':'Airport_Code'}
airport_codes_cleanup_df = airport_codes_cleanup_df.rename(columns=airport_column_names)

In [380]:
airport_codes_cleanup_df

Unnamed: 0,Airport_Code,Airport_City,Airport_State
0,FLL,Fort Lauderdale,FL
1,ATL,Atlanta,GA
2,JAN,Jackson/Vicksburg,MS
3,RIC,Richmond,VA
4,MSP,Minneapolis,MN
...,...,...,...
370,ACK,Nantucket,MA
371,GST,Gustavus,AK
372,HYA,Hyannis,MA
373,MVY,Martha's Vineyard,MA


In [None]:
# Export cleaned Dataframe to csv to use as Table
# (to_csv is called with its defaults, so the row index is written as the first column)

cleaned_airport_codes_path = Path('Resources/DB_Airport_Codes.csv')
airport_codes_cleanup_df.to_csv(cleaned_airport_codes_path)

#### Airline Table Cleaning

In [415]:
# Import Table Data
# (read directly from the airline lookup file at airlines_df_path; unlike the
# other reloads, no index_col is passed, so pandas assigns a fresh default index)
airlines_cleanup_df = pd.read_csv(airlines_df_path)


In [417]:
# Give the lookup columns their exported names
airline_column_names = {
    'Code':'Airline_Code',
    'Description':'Airline_Name',
}
airlines_cleanup_df = airlines_cleanup_df.rename(columns=airline_column_names)

In [418]:
airlines_cleanup_df

Unnamed: 0,Airline_Code,Airline_Name
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,05Q,"Comlux Aviation, AG"
3,06Q,Master Top Linhas Aereas Ltd.
4,07Q,Flair Airlines Ltd.
...,...,...
1566,ZW,Air Wisconsin Airlines Corp
1567,ZX,Air Georgian
1568,ZX (1),Airbc Ltd.
1569,ZY,Atlantic Gulf Airlines


In [None]:
# Export cleaned Dataframe to csv to use as Table
# (to_csv is called with its defaults, so the row index is written as the first column)

cleaned_airlines_path = Path('Resources/DB_Airline_Table.csv')
airlines_cleanup_df.to_csv(cleaned_airlines_path)

#### Cancelled Flights Table Cleaning

In [326]:
# Reload the Cancelled Flights table from the intermediate CSV; index_col=0
# uses the file's first column (the row index written by to_csv) as the index again.
cancelled_cleanup_df = pd.read_csv(cancelled_df_path,index_col=0)

In [388]:
# Give the cancelled-flight columns their exported names
cancelled_column_names = {
    'Flight_Number_Operating_Airline':'Airline_Flight_Number',
    'CancellationCode':'Cancellation_Code',
}
cancelled_cleanup_df = cancelled_cleanup_df.rename(columns=cancelled_column_names)

In [389]:
cancelled_cleanup_df

Unnamed: 0,Airline_Flight_Number,Cancellation_Code
0,1581,A
30,1605,B
75,1650,B
148,1722,B
157,1729,B
...,...,...
4078270,3463,C
4078285,3443,C
4078290,3434,C
4078301,3421,C


In [None]:
# Export cleaned Dataframe to csv to use as Table
# (to_csv is called with its defaults, so the row index is written as the first
# column; as the displayed frame shows, that index is non-contiguous — 0, 30, 75, ...)

cancelled_flight_path = Path('Resources/DB_Cancelled_Flights_Table.csv')
cancelled_cleanup_df.to_csv(cancelled_flight_path)