# STEP 1 - Import Dataset & Librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#import data
flights_sample = pd.read_csv("../../data/raw/200K_random_flights(26Nov).csv")
#flights_sample = pd.read_csv("../../data/raw/FLIGHTS_TEST_RAW.csv")

trainingData = True #Switch if that's not the case

# STEP 2 - Explore & Clean the Data

Exploration is done in other notebooks, below is the cleaning process

## Common to all

In [4]:
### CLEANING CODE:
df = flights_sample #Change to your df's name

#### Drop columns:
df.drop('branded_code_share', axis=1, inplace=True) # Using Unique Carrier Code for analysis
df.drop('mkt_carrier', axis=1, inplace=True) # Using Unique Carrier Code for analysis
df.drop('mkt_carrier_fl_num', axis=1, inplace=True) #using op_carrier_fl_num instead
df.drop('origin_airport_id', axis=1, inplace=True) #working with IATA codes instead
df.drop('dest_airport_id', axis=1, inplace=True) #working with IATA codes instead
df.drop('dup', axis=1, inplace=True) # All the same value
df.drop('flights', axis=1, inplace=True) # All the same value

#### Change column type:
df['fl_date'] = df['fl_date'].astype('datetime64')  

#### Change column value:
df['crs_dep_time'] = pd.to_datetime(df['crs_dep_time'], unit='m', errors='coerce').dt.strftime("%H:%M")
df['crs_arr_time'] = pd.to_datetime(df['crs_arr_time'], unit='m', errors='coerce').dt.strftime("%H:%M")

#### Change column name:
df.rename(columns={'fl_date': 'Flight Date'}, inplace=True) 
df.rename(columns={'mkt_unique_carrier': 'Marketer - Unique Carrier Code'}, inplace=True)
df.rename(columns={'op_unique_carrier': 'Operator - Unique Carrier Code'}, inplace=True)
df.rename(columns={'op_carrier_fl_num': 'Flight Number'}, inplace=True)
df.rename(columns={'tail_num': 'Tail Number'}, inplace=True) 
#df.rename(columns={'origin_airport_id': 'Origin Airport (ID)'}, inplace=True) 
df.rename(columns={'origin': 'Origin Airport (IATA Code)'}, inplace=True) 
df.rename(columns={'origin_city_name': 'Origin Airport (City, State)'}, inplace=True) 
#df.rename(columns={'dest_airport_id': 'Destination Airport (ID)'}, inplace=True) 
df.rename(columns={'dest': 'Destination Airport (IATA Code)'}, inplace=True) 
df.rename(columns={'dest_city_name': 'Destination Airport (City, State)'}, inplace=True) 
df.rename(columns={'crs_dep_time': 'Scheduled Departure Time (local time)'}, inplace=True) 
df.rename(columns={'crs_arr_time': 'Scheduled Arrival Time (local time)'}, inplace=True)
df.rename(columns={'crs_elapsed_time': 'Scheduled Elapsed Time'}, inplace=True)
df.rename(columns={'distance': 'Distance (miles)'}, inplace=True)

# Is op_unique_carrier a duplicate of mkt_unique_carrier? No, we'll keep both and create an add'l column to highlight when they are not the same
df['Different Marketer & Operator Carrier Code'] = np.where(df['Marketer - Unique Carrier Code'] != df['Operator - Unique Carrier Code'], 1, 0)

# Create a column with the day/month/year of the flight
df['Flight Weekday'] = pd.DatetimeIndex(df['Flight Date']).weekday   #0: Monday, 1:Tuesday, etc.
df['Flight Day'] = pd.DatetimeIndex(df['Flight Date']).day
df['Flight Month'] = pd.DatetimeIndex(df['Flight Date']).month
df['Flight Year'] = pd.DatetimeIndex(df['Flight Date']).year
df.drop('Flight Date', axis=1, inplace=True) # Empty column

#df.head(10) #Final Review
flights_sample = df 

## Training Dataset only

In [5]:
if trainingData == True:
    ### CLEANING CODE (Trg only):
    df = flights_sample #

    # Remove cancelled flights
    df = df[df['cancelled'] == 0]

    # Remove diverted flights
    df = df[df['diverted'] == 0]

    # Removing dep_delays outliers
    low = df['dep_delay'].mean() - (3 * df['dep_delay'].std())
    high = df['dep_delay'].mean() + (3 * df['dep_delay'].std())
    df = df[(df['dep_delay'] > low) & (df['dep_delay'] < high)]

    # Removing arr_delays outliers
    low = df['arr_delay'].mean() - (3 * df['arr_delay'].std())
    high = df['arr_delay'].mean() + (3 * df['arr_delay'].std())
    df = df[(df['arr_delay'] > low) & (df['arr_delay'] < high)]


    #### Drop columns:
    df.drop('dep_time', axis=1, inplace=True)
    df.drop('taxi_out', axis=1, inplace=True)
    df.drop('taxi_in', axis=1, inplace=True)
    df.drop('wheels_off', axis=1, inplace=True)
    df.drop('wheels_on', axis=1, inplace=True)
    df.drop('arr_time', axis=1, inplace=True)
    df.drop('cancelled', axis=1, inplace=True)
    df.drop('cancellation_code', axis=1, inplace=True)
    df.drop('diverted', axis=1, inplace=True)
    df.drop('actual_elapsed_time', axis=1, inplace=True)
    df.drop('air_time', axis=1, inplace=True)
    df.drop('first_dep_time', axis=1, inplace=True) #99125  missing values
    df.drop('total_add_gtime', axis=1, inplace=True) #99125  missing values
    df.drop('longest_add_gtime', axis=1, inplace=True) #99125  missing values
    df.drop('no_name', axis=1, inplace=True) # Empty column

    ### Formatting the additional delay columns:
    # Departure Delay
    df['dep_delay'] = df['dep_delay'].fillna(0)    #Didn't really have an issue with this one, but just in case
    df['dep_delay'] = df['dep_delay'].astype('int64')
    df.rename(columns={'dep_delay': 'Arrival Delay (minutes)'}, inplace=True)
    
    # Arrival Delay
    df['arr_delay'] = df['arr_delay'].fillna(0)
    df['arr_delay'] = df['arr_delay'].astype('int64')
    df.rename(columns={'arr_delay': 'Departure Delay (minutes)'}, inplace=True)

    # carrier_delay
    df['carrier_delay'] = df['carrier_delay'].fillna(0)
    df['carrier_delay'] = df['carrier_delay'].astype('int64')
    df.rename(columns={'carrier_delay': 'Carrier Delay (minutes)'}, inplace=True)

    # weather_delay
    df['weather_delay'] = df['weather_delay'].fillna(0)
    df['weather_delay'] = df['weather_delay'].astype('int64')
    df.rename(columns={'weather_delay': 'Weather Delay (minutes)'}, inplace=True)

    # nas_delay
    df['nas_delay'] = df['nas_delay'].fillna(0)
    df['nas_delay'] = df['nas_delay'].astype('int64')
    df.rename(columns={'nas_delay': 'National Air System Delay (minutes)'}, inplace=True)

    # security_delay
    df['security_delay'] = df['security_delay'].fillna(0)
    df['security_delay'] = df['security_delay'].astype('int64')
    df.rename(columns={'security_delay': 'Security Delay (minutes)'}, inplace=True)

    # late_aircraft_delay
    df['late_aircraft_delay'] = df['late_aircraft_delay'].fillna(0)
    df['late_aircraft_delay'] = df['late_aircraft_delay'].astype('int64')
    df.rename(columns={'late_aircraft_delay': 'Late Aircraft Delay (minutes)'}, inplace=True)

    #df.head(10) #Final Review
    flights_sample = df 

# STEP 3 - Feature Engineering

## 3.1 - Importing enrichment data

### Add weather info

In [6]:
enr_weather = pd.read_csv("../../data/processed/flights_enrichment_weather.csv")

# Append the weather data to the flights_sample table on the year, month, day, and departure airport iata code columns with the year, month, day, and iata_code columns from the weather table
flights_sample = flights_sample.merge(enr_weather, left_on=["Flight Year", "Flight Month", "Flight Day", "Origin Airport (IATA Code)"], right_on=["Year", "Month", "Day", "iata_code"], how="left")

# Append the weather data to the flights_sample table on the year, month, day, and arrival airport iata code columns with the year, month, day, and iata_code columns from the weather table
flights_sample = flights_sample.merge(enr_weather, left_on=["Flight Year", "Flight Month", "Flight Day", "Destination Airport (IATA Code)"], right_on=["Year", "Month", "Day", "iata_code"], how="left", suffixes=("_dep", "_arr"))

flights_sample = flights_sample.drop(columns=["Year_dep", "Month_dep", "Day_dep", "iata_code_dep", "Year_arr", "Month_arr", "Day_arr", "iata_code_arr"])

### Add additional airport details

In [7]:
enr_airport = pd.read_csv("../../data/processed/flights_enrichment_airportLocation.csv")

# Append the airport data to the flights_sample table on the departure airport iata code column with the iata_code column from the airport table
flights_sample = flights_sample.merge(enr_airport, left_on="Origin Airport (IATA Code)", right_on="iata_code", how="left")

# Append the airport data to the flights_sample table on the arrival airport iata code column with the iata_code column from the airport table
flights_sample = flights_sample.merge(enr_airport, left_on="Destination Airport (IATA Code)", right_on="iata_code", how="left", suffixes=("_dep", "_arr"))

flights_sample = flights_sample.drop(columns=["iata_code_dep", "local_code_dep", "name_dep", "iata_code_arr", "local_code_arr", "name_arr"])

### Add Airport Busyness Score

In [8]:
departure_busyness_scores = pd.read_csv('../../data/processed/departure_busyness_scores.csv')
arrival_busyness_scores = pd.read_csv('../../data/processed/arrival_busyness_scores.csv')

#Create id for arrivals busyness for future merge
flights_sample['arrivals_busyness_id'] = 'A-' + flights_sample['Destination Airport (IATA Code)'] + '-' + flights_sample['Flight Month'].astype(str)+ '-' + flights_sample['Flight Weekday'].astype(str)

#Create id for departure busyness for future merge
flights_sample['departure_busyness_id'] = 'D-' + flights_sample['Origin Airport (IATA Code)'] + '-' + flights_sample['Flight Month'].astype(str)+ '-' + flights_sample['Flight Weekday'].astype(str)

# Append the busyness score to the flights_sample table
flights_sample = flights_sample.merge(departure_busyness_scores, left_on="departure_busyness_id", right_on="Departure Busyness ID", how="left")
flights_sample = flights_sample.merge(arrival_busyness_scores, left_on="arrivals_busyness_id", right_on="Arrivals Busyness ID", how="left")


### Add additional loading and passenger details

In [9]:
passengers = pd.read_csv("../../data/raw/passengers_w_departuresPerformed_groupedbyMonth(29Nov).csv", index_col=False)

# Create routeid column (used to add enrichment)
flights_sample['routeid'] = flights_sample['Flight Month'].astype(str) + '-' + flights_sample['Operator - Unique Carrier Code'] + '-' + flights_sample['Origin Airport (IATA Code)'] + '-' + flights_sample['Destination Airport (IATA Code)']

# Adding the enrichment
flights_sample = flights_sample.merge(passengers, on="routeid", how="left", suffixes=("_1", "_2"))

### Removing columns that were added along the way

In [10]:
flights_sample = flights_sample.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y', 'Arrivals Busyness ID', 'Departure Busyness ID', 'arrivals_busyness_id', 'departure_busyness_id', 'routeid', 'Unnamed: 0'])

## 3.2 - Variable Transformation

In [11]:
pd.set_option("display.max_columns", 120)
flights_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187581 entries, 0 to 187580
Data columns (total 80 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Marketer - Unique Carrier Code              187581 non-null  object 
 1   Operator - Unique Carrier Code              187581 non-null  object 
 2   Tail Number                                 187581 non-null  object 
 3   Flight Number                               187581 non-null  int64  
 4   Origin Airport (IATA Code)                  187581 non-null  object 
 5   Origin Airport (City, State)                187581 non-null  object 
 6   Destination Airport (IATA Code)             187581 non-null  object 
 7   Destination Airport (City, State)           187581 non-null  object 
 8   Scheduled Departure Time (local time)       187581 non-null  object 
 9   Arrival Delay (minutes)                     187581 non-null  int64  
 

In [12]:
#scale, bin, labelencoding, etc

# STEP 4 - Dimension Reduction & Variable Selection

In [13]:
#PCA, OLS

# Export to CSV

In [15]:
flights_sample.to_csv('../../data/processed/flights.csv',index=False) #Training

#flights_sample.to_csv('../../data/processed/flights_test_enriched.csv',index=False) #Testing