# Data Cleaning Code Generation

In [1]:
def dataCleaning(df, code=True, tips=False, orientation=True, formatIssues=True, missingValues=True, duplicateValues=True, outliers=True):
    """
    ------------------
    Consolidation of the usual data cleaning steps for a df, November 2022
    Made by Sebastien Garneau, sebastien.garneau@gmail.com
    ------------------
    df: your dataframe

    code: A text template to note your observations as you go. Use the code snippets included in the output. copy-paste into vscode/notepad

    tips: Provides snippets of code to help you clean potential issues in your df. If you prefer this to code
    
    orientation: Provides information about the shape/objects of your data
    
    formatIssues: Provides detailed information on each column to help identify format issues
    
    missingValues: Provides information on missing values
    
    duplicateValues: Provides information on duplicate values
    
    outliers: Provides information on outliers
    """
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

    if code==True:
        print("### CLEANING CODE:")
        print("df = dfX #Change to your df's name")
        print()
        print("#### Change column value:")
        print()
        print()
        print("#### Drop entire column:")
        print()
        print()
        print("#### Change column type:")
        print()
        print()
        print("#### Change column name:")
        print()
        print()
        print("#### Handle missing values:")
        print()
        print()
        print("#### Handle duplicate values:")
        print("# df.drop_duplicates(inplace=True) # drop ALL duplicate rows")
        print()
        print("#### Drop outliers:")
        print()
        print()
        print("#### Other observations / further investigations:")
        print("#")
        print("#")
        print("#")
        print()
        print("df.head() #Final Review")
        print("# dfX = df #Change to your df's name")
        print()
        print("=========================================")
    
    if orientation==True:
        print("ORIENTATION")
        print(df.info())
        print("=========================================")
        print()
        
    
    if formatIssues==True:
        print("FORMAT ISSUES")
        print()
        for col in df.columns:
            if df[col].dtype == 'object' or df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'datetime64':
                print("df.rename(columns={'" + col + "': ''}, inplace=True)", "#rename column")
                print("df['" + col + "'] = df['" + col + "'].replace('old_value', 'new_value')")
                print("df['" + col + "'] = df['" + col + "'].astype('new_type') # new_type can be int64, float64, object, category, datetime64")
                print("df.drop('" + col + "', axis=1, inplace=True)")                
                pd.set_option('display.max_rows', None)
                print(df.groupby(col, sort=True).size())
                pd.reset_option('display.max_rows')
                #display the dtypes of the column
                print("Current Column DType: ", df[col].dtype, "     Do not compare with above. This one will always return int64 as it's the dtype of the count")                
                print("df['" + col + "'] = df['" + col + "'].astype('new_type') # new_type can be int64, float64, object, category, datetime64")
                print()
            #else:
            #    print(col)
            #    print(df[col].describe())
            #    print()

        if tips==True:
            print("TIPS")
            print("To make a correction to a column, use the following syntax:")
            print("df['A'] = df['A'].apply(lambda x: x.replace('old_value', 'new_value'))")
            print()
            print("To change a column data type, use the following syntax:")
            print("df['A'] = pd.to_datetime(df['A']) # for datetime")
            print("df['A'] = df['A'].astype('int64') # for integers")
            print("df['A'] = df['A'].astype('float64') # for floats")
            print("df['A'] = df['A'].astype('category') # for categorical")
            print("df['A'] = df['A'].astype('object') # for object")
            print()
        print("=========================================")
        print()

    if missingValues==True:
        print("MISSING VALUES")
        print()
        for col in df.columns:
            if df[col].isnull().sum() > 0:
                print(col, ":", df[col].isnull().sum(), " missing values")
                print("df.dropna(subset=['" + col + "'], inplace=True)")
                print("df['" + col + "'].fillna(df['" + col + "'].mean(), inplace=True) #fill NA entries with the mean")
                print("df['" + col + "'].fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
                print()
                print(df.loc[df[col].isnull()].head())
                print()
            else:
                print(col, ": No missing values")
                print()
                                    
        if tips==True:
            print()
            print("TIPS")
            print("You can drop rows with missing values using one of the following code:")
            print("df.dropna(subset=['col'], inplace=True) #For a single column")
            print("df.dropna(inplace=True) #For all columns")
            print()
            print("You can fill rows with missing values using one of the following code:")
            print("df['col'].fillna(df['col'].mean(), inplace=True) #fill NA entries with the mean")
            print("df['col'].fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
            print("df['col'].fillna(method='ffill') # forward-fill to propagate the previous value forward")
            print("df['col'].fillna(method='bfill' # back-fill to propagate the next values backward)")
            print()
            print("To view them:")
            print("df.loc[df[col].isnull()].head()")
            print()
        print("=========================================")
        print()

    if duplicateValues==True:
        print("DUPLICATE VALUES")
        print()
        print(df[df.duplicated()].head())
        print()

        if tips==True:
            print("TIPS")
            print("You can drop duplicate rows using the following code:")
            print("df.drop_duplicates(inplace=True)")
            print("df.drop_duplicates(subset=['col'], inplace=True) #For a single column")
            print()
            print("To view them:")
            print("df[df.duplicated()].head()")
            print()
    
        print("=========================================")
        print()

    if outliers==True:
        print("OUTLIERS")
        print()
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                print(col)
                print("-----")
                print("Outlier(s):")
                print("Below ", df[col].mean() - 3*df[col].std(), " -> ", df[df[col] < df[col].mean() - 3*df[col].std()].shape[0], " low outlier(s)")
                print("Above ", df[col].mean() + 3*df[col].std(), " -> ", df[df[col] > df[col].mean() + 3*df[col].std()].shape[0], " high outlier(s)")
                low = df[col].mean() - 3*df[col].std()
                high = df[col].mean() + 3*df[col].std()
                print("df = df[(df['" + col + "'] > " + str(low) + ") & (df['" + col + "'] < " + str(high) + ")]")
                print()
                print(df[col].describe())
                print()
                print("Boxplot")
                sns.boxplot(df[col])
                plt.show()
                print()
                print("Histogram")
                sns.histplot(df[col])
                plt.show()
                print("=========================================")
                print()

        if tips==True:
            print("TIPS")
            print("You can drop outliers using the following code:")
            print("df = df[(df['column'] > lower_bound) & (df['column'] < upper_bound)]")
            print()

# Import Dataset & Librairies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#import training data
flights_sample = pd.read_csv("../../data/raw/Cleaned-flights_sample_training.csv")
trainingData = True #Switch if that's not the case

In [4]:
flights_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150186 entries, 0 to 150185
Data columns (total 25 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Marketer - Unique Carrier Code              150186 non-null  object 
 1   Operator - Unique Carrier Code              150186 non-null  object 
 2   Tail Number                                 150186 non-null  object 
 3   Flight Number                               150186 non-null  int64  
 4   Origin Airport (IATA Code)                  150186 non-null  object 
 5   Destination Airport (IATA Code)             150186 non-null  object 
 6   Scheduled Departure Time (local time)       150186 non-null  object 
 7   Departure Delay (minutes)                   150186 non-null  int64  
 8   Scheduled Arrival Time (local time)         150186 non-null  object 
 9   Arrival Delay (minutes)                     150186 non-null  int64  
 

In [5]:
# #import test data
# flights_test = pd.read_csv("../../data/raw/flights_test.csv")
# trainingData = False #Switch if that's not the case

# Apply Data Cleaning (See Step 2 Notebooks for further details)

Exploration is done in other notebooks, below is the cleaning process

## 2a: Common to all

In [6]:
### CLEANING CODE:
df = flights_sample #Change to your df's name

# #### Drop columns:
# df.drop('branded_code_share', axis=1, inplace=True) # Using Unique Carrier Code for analysis
# df.drop('mkt_carrier', axis=1, inplace=True) # Using Unique Carrier Code for analysis
# df.drop('mkt_carrier_fl_num', axis=1, inplace=True) #using op_carrier_fl_num instead
# df.drop('origin_airport_id', axis=1, inplace=True) #working with IATA codes instead
# df.drop('dest_airport_id', axis=1, inplace=True) #working with IATA codes instead
# df.drop('dup', axis=1, inplace=True) # All the same value
# df.drop('flights', axis=1, inplace=True) # All the same value

# #### Change column type:
# df['fl_date'] = df['fl_date'].astype('datetime64')  

# #### Change column value:
# df['crs_dep_time'] = pd.to_datetime(df['crs_dep_time'], unit='m', errors='coerce').dt.strftime("%H:%M")
# df['crs_arr_time'] = pd.to_datetime(df['crs_arr_time'], unit='m', errors='coerce').dt.strftime("%H:%M")

# #### Change column name:
# df.rename(columns={'fl_date': 'Flight Date'}, inplace=True) 
# df.rename(columns={'mkt_unique_carrier': 'Marketer - Unique Carrier Code'}, inplace=True)
# df.rename(columns={'op_unique_carrier': 'Operator - Unique Carrier Code'}, inplace=True)
# df.rename(columns={'op_carrier_fl_num': 'Flight Number'}, inplace=True)
# df.rename(columns={'tail_num': 'Tail Number'}, inplace=True) 
# #df.rename(columns={'origin_airport_id': 'Origin Airport (ID)'}, inplace=True) 
# df.rename(columns={'origin': 'Origin Airport (IATA Code)'}, inplace=True) 
# df.rename(columns={'origin_city_name': 'Origin Airport (City, State)'}, inplace=True) 
# #df.rename(columns={'dest_airport_id': 'Destination Airport (ID)'}, inplace=True) 
# df.rename(columns={'dest': 'Destination Airport (IATA Code)'}, inplace=True) 
# df.rename(columns={'dest_city_name': 'Destination Airport (City, State)'}, inplace=True) 
# df.rename(columns={'crs_dep_time': 'Scheduled Departure Time (local time)'}, inplace=True) 
# df.rename(columns={'crs_arr_time': 'Scheduled Arrival Time (local time)'}, inplace=True)
# df.rename(columns={'crs_elapsed_time': 'Scheduled Elapsed Time'}, inplace=True)
# df.rename(columns={'distance': 'Distance (miles)'}, inplace=True)

# # Is op_unique_carrier a duplicate of mkt_unique_carrier? No, we'll keep both and create an add'l column to highlight when they are not the same
# df['Different Marketer & Operator Carrier Code'] = np.where(df['Marketer - Unique Carrier Code'] != df['Operator - Unique Carrier Code'], 1, 0)

# # Create a column with the day/month/year of the flight
# df['Flight Weekday'] = pd.DatetimeIndex(df['Flight Date']).weekday   #0: Monday, 1:Tuesday, etc.
# df['Flight Day'] = pd.DatetimeIndex(df['Flight Date']).day
# df['Flight Month'] = pd.DatetimeIndex(df['Flight Date']).month
# df['Flight Year'] = pd.DatetimeIndex(df['Flight Date']).year
# df.drop('Flight Date', axis=1, inplace=True) # Empty column

#df.head(10) #Final Review
flights_sample = df


In [7]:
# flights_sample.head(2)

## 2b: Training Dataset only

In [8]:
if trainingData == True:
    ### CLEANING CODE (Trg only):
    df = flights_sample #

    # Remove cancelled flights
    df = df[df['cancelled'] == 0]

    # Remove diverted flights
    df = df[df['diverted'] == 0]

#     # Removing dep_delays outliers
#     low = df['dep_delay'].mean() - (3 * df['dep_delay'].std())
#     high = df['dep_delay'].mean() + (3 * df['dep_delay'].std())
#     df = df[(df['dep_delay'] > low) & (df['dep_delay'] < high)]

#     # Removing arr_delays outliers
#     low = df['arr_delay'].mean() - (3 * df['arr_delay'].std())
#     high = df['arr_delay'].mean() + (3 * df['arr_delay'].std())
#     df = df[(df['arr_delay'] > low) & (df['arr_delay'] < high)]


#     #### Drop columns:
#     df.drop('dep_time', axis=1, inplace=True)
#     df.drop('taxi_out', axis=1, inplace=True)
#     df.drop('taxi_in', axis=1, inplace=True)
#     df.drop('wheels_off', axis=1, inplace=True)
#     df.drop('wheels_on', axis=1, inplace=True)
#     df.drop('arr_time', axis=1, inplace=True)
    df.drop('cancelled', axis=1, inplace=True)
    df.drop('cancellation_code', axis=1, inplace=True)
    df.drop('diverted', axis=1, inplace=True)
#     df.drop('actual_elapsed_time', axis=1, inplace=True)
#     df.drop('air_time', axis=1, inplace=True)
#     df.drop('first_dep_time', axis=1, inplace=True) #99125  missing values
#     df.drop('total_add_gtime', axis=1, inplace=True) #99125  missing values
#     df.drop('longest_add_gtime', axis=1, inplace=True) #99125  missing values
#     df.drop('no_name', axis=1, inplace=True) # Empty column

#     ### Formatting the additional delay columns:
#     # Departure Delay
#     df['dep_delay'] = df['dep_delay'].fillna(0)    #Didn't really have an issue with this one, but just in case
#     df['dep_delay'] = df['dep_delay'].astype('int64')
#     df.rename(columns={'dep_delay': 'Arrival Delay (minutes)'}, inplace=True)
    
#     # Arrival Delay
#     df['arr_delay'] = df['arr_delay'].fillna(0)
#     df['arr_delay'] = df['arr_delay'].astype('int64')
#     df.rename(columns={'arr_delay': 'Departure Delay (minutes)'}, inplace=True)

#     # carrier_delay
#     df['carrier_delay'] = df['carrier_delay'].fillna(0)
#     df['carrier_delay'] = df['carrier_delay'].astype('int64')
#     df.rename(columns={'carrier_delay': 'Carrier Delay (minutes)'}, inplace=True)

#     # weather_delay
#     df['weather_delay'] = df['weather_delay'].fillna(0)
#     df['weather_delay'] = df['weather_delay'].astype('int64')
#     df.rename(columns={'weather_delay': 'Weather Delay (minutes)'}, inplace=True)

#     # nas_delay
#     df['nas_delay'] = df['nas_delay'].fillna(0)
#     df['nas_delay'] = df['nas_delay'].astype('int64')
#     df.rename(columns={'nas_delay': 'National Air System Delay (minutes)'}, inplace=True)

#     # security_delay
#     df['security_delay'] = df['security_delay'].fillna(0)
#     df['security_delay'] = df['security_delay'].astype('int64')
#     df.rename(columns={'security_delay': 'Security Delay (minutes)'}, inplace=True)

#     # late_aircraft_delay
#     df['late_aircraft_delay'] = df['late_aircraft_delay'].fillna(0)
#     df['late_aircraft_delay'] = df['late_aircraft_delay'].astype('int64')
#     df.rename(columns={'late_aircraft_delay': 'Late Aircraft Delay (minutes)'}, inplace=True)

#     #df.head(10) #Final Review
    flights_sample = df 

In [9]:
flights_sample.info(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150186 entries, 0 to 150185
Data columns (total 22 columns):
 #   Column                                      Non-Null Count   Dtype 
---  ------                                      --------------   ----- 
 0   Marketer - Unique Carrier Code              150186 non-null  object
 1   Operator - Unique Carrier Code              150186 non-null  object
 2   Tail Number                                 150186 non-null  object
 3   Flight Number                               150186 non-null  int64 
 4   Origin Airport (IATA Code)                  150186 non-null  object
 5   Destination Airport (IATA Code)             150186 non-null  object
 6   Scheduled Departure Time (local time)       150186 non-null  object
 7   Departure Delay (minutes)                   150186 non-null  int64 
 8   Scheduled Arrival Time (local time)         150186 non-null  object
 9   Arrival Delay (minutes)                     150186 non-null  int64 
 10  Schedule

## 2c: Training and Test Set

In [10]:
# Note: the logic may need to be slightly fixed here. 
#Mel question - shouldn't this happen if training data is true or false? 

# if trainingData == True:
#     ### CLEANING CODE (Trg only):
#     df = flights_sample #

    # #### Drop columns:
    # df.drop('dep_time', axis=1, inplace=True)
    # df.drop('taxi_out', axis=1, inplace=True)
    # df.drop('taxi_in', axis=1, inplace=True)
    # df.drop('wheels_off', axis=1, inplace=True)
    # df.drop('wheels_on', axis=1, inplace=True)
    # df.drop('arr_time', axis=1, inplace=True)
    # df.drop('cancelled', axis=1, inplace=True)
    # df.drop('cancellation_code', axis=1, inplace=True)
    # df.drop('diverted', axis=1, inplace=True)
    # df.drop('actual_elapsed_time', axis=1, inplace=True)
    # df.drop('air_time', axis=1, inplace=True)
    # df.drop('first_dep_time', axis=1, inplace=True) #99125  missing values
    # df.drop('total_add_gtime', axis=1, inplace=True) #99125  missing values
    # df.drop('longest_add_gtime', axis=1, inplace=True) #99125  missing values
    # df.drop('no_name', axis=1, inplace=True) # Empty column
    # df.drop('Origin Airport (City, State)', axis=1, inplace=True) # Empty column
    # df.drop('Destination Airport (City, State)', axis=1, inplace=True) # Empty column

    ### Formatting the additional delay columns:
    # Departure Delay
#     df['dep_delay'] = df['dep_delay'].fillna(0)    #Didn't really have an issue with this one, but just in case
#     df['dep_delay'] = df['dep_delay'].astype('int64')
#     df.rename(columns={'dep_delay': 'Departure Delay (minutes)'}, inplace=True)
    
#     # Arrival Delay
#     df['arr_delay'] = df['arr_delay'].fillna(0)
#     df['arr_delay'] = df['arr_delay'].astype('int64')
#     df.rename(columns={'arr_delay': 'Arrival Delay (minutes)'}, inplace=True)

#     # carrier_delay
#     df['carrier_delay'] = df['carrier_delay'].fillna(0)
#     df['carrier_delay'] = df['carrier_delay'].astype('int64')
#     df.rename(columns={'carrier_delay': 'Carrier Delay (minutes)'}, inplace=True)

#     # weather_delay
#     df['weather_delay'] = df['weather_delay'].fillna(0)
#     df['weather_delay'] = df['weather_delay'].astype('int64')
#     df.rename(columns={'weather_delay': 'Weather Delay (minutes)'}, inplace=True)

#     # nas_delay
#     df['nas_delay'] = df['nas_delay'].fillna(0)
#     df['nas_delay'] = df['nas_delay'].astype('int64')
#     df.rename(columns={'nas_delay': 'National Air System Delay (minutes)'}, inplace=True)

#     # security_delay
#     df['security_delay'] = df['security_delay'].fillna(0)
#     df['security_delay'] = df['security_delay'].astype('int64')
#     df.rename(columns={'security_delay': 'Security Delay (minutes)'}, inplace=True)

#     # late_aircraft_delay
#     df['late_aircraft_delay'] = df['late_aircraft_delay'].fillna(0)
#     df['late_aircraft_delay'] = df['late_aircraft_delay'].astype('int64')
#     df.rename(columns={'late_aircraft_delay': 'Late Aircraft Delay (minutes)'}, inplace=True)

#     #df.head(10) #Final Review
#     flights_sample = df 

In [11]:
# flights_sample.head(2)

# STEP 3 - Feature Engineering

## 3.1 - Importing enrichment data

### 3.1a: Import weather info

In [12]:
enr_weather = pd.read_csv("../../data/processed/flights_enrichment_weather.csv")

# Append the weather data to the flights_sample table on the year, month, day, and departure airport iata code columns with the year, month, day, and iata_code columns from the weather table
flights_sample = flights_sample.merge(enr_weather, left_on=["Flight Year", "Flight Month", "Flight Day", "Origin Airport (IATA Code)"], right_on=["Year", "Month", "Day", "iata_code"], how="left")

# Append the weather data to the flights_sample table on the year, month, day, and arrival airport iata code columns with the year, month, day, and iata_code columns from the weather table
flights_sample = flights_sample.merge(enr_weather, left_on=["Flight Year", "Flight Month", "Flight Day", "Destination Airport (IATA Code)"], right_on=["Year", "Month", "Day", "iata_code"], how="left", suffixes=("_dep", "_arr"))

flights_sample = flights_sample.drop(columns=["Year_dep", "Month_dep", "Day_dep", "iata_code_dep", "Year_arr", "Month_arr", "Day_arr", "iata_code_arr"])

In [13]:
flights_sample.head(2)

Unnamed: 0,Marketer - Unique Carrier Code,Operator - Unique Carrier Code,Tail Number,Flight Number,Origin Airport (IATA Code),Destination Airport (IATA Code),Scheduled Departure Time (local time),Departure Delay (minutes),Scheduled Arrival Time (local time),Arrival Delay (minutes),...,Heavy_Fog_arr,Thunder_arr,Ice_Pellets_arr,Hail_arr,Glaze_or_Rime_arr,Dust_or_Sand_arr,Smoke_or_Haze_arr,Blowing or Drifting Snow_arr,Tornado_or_Funnel_Cloud_arr,High_or_Damaging_Winds_arr
0,DL,9E,N909XJ,5011,BOS,BUF,11:38,20,14:49,-1,...,,,,,1.0,,,,,
1,AA,YX,N120HQ,4416,HVN,CLT,18:56,-8,22:30,-7,...,,,,,,,1.0,,,


In [14]:
# There are lots of NaNs in the weather columns, including for the temperature columns like TAVG. Is this going to cause a problem?

### 3.1b: Import additional airport details

In [15]:
enr_airport = pd.read_csv("../../data/processed/flights_enrichment_airportLocation.csv")

# Append the airport data to the flights_sample table on the departure airport iata code column with the iata_code column from the airport table
flights_sample = flights_sample.merge(enr_airport, left_on="Origin Airport (IATA Code)", right_on="iata_code", how="left")

# Append the airport data to the flights_sample table on the arrival airport iata code column with the iata_code column from the airport table
flights_sample = flights_sample.merge(enr_airport, left_on="Destination Airport (IATA Code)", right_on="iata_code", how="left", suffixes=("_dep", "_arr"))

flights_sample = flights_sample.drop(columns=["iata_code_dep", "local_code_dep", "name_dep", "iata_code_arr", "local_code_arr", "name_arr", "country_name_dep", "country_name_arr"])

In [16]:
flights_sample.head(2)

Unnamed: 0,Marketer - Unique Carrier Code,Operator - Unique Carrier Code,Tail Number,Flight Number,Origin Airport (IATA Code),Destination Airport (IATA Code),Scheduled Departure Time (local time),Departure Delay (minutes),Scheduled Arrival Time (local time),Arrival Delay (minutes),...,latitude_deg_dep,longitude_deg_dep,elevation_ft_dep,type_dep,local_region_dep,latitude_deg_arr,longitude_deg_arr,elevation_ft_arr,type_arr,local_region_arr
0,DL,9E,N909XJ,5011,BOS,BUF,11:38,20,14:49,-1,...,42.3643,-71.005203,20.0,large_airport,MA,42.940498,-78.732201,728.0,large_airport,NY
1,AA,YX,N120HQ,4416,HVN,CLT,18:56,-8,22:30,-7,...,41.263699,-72.886803,12.0,medium_airport,CT,35.214001,-80.9431,748.0,large_airport,NC


In [17]:
# Note that I removed: "country_name_dep", "country_name_arr" - all should be US, would have been introducing duplicate info. 

### 3.1c: Add Airport Busyness Score

In [18]:
departure_busyness_scores = pd.read_csv('../../data/processed/departure_busyness_scores.csv')
arrival_busyness_scores = pd.read_csv('../../data/processed/arrival_busyness_scores.csv')

In [19]:
departure_busyness_scores.head()

Unnamed: 0,Departure Busyness ID,Departure Busyness Score
0,D-ABE-1-0,0.843011
1,D-ABE-1-1,0.716222
2,D-ABE-1-2,0.782988
3,D-ABE-1-3,0.856499
4,D-ABE-1-4,0.857342


In [20]:
departure_busyness_scores = pd.read_csv('../../data/processed/departure_busyness_scores.csv')
arrival_busyness_scores = pd.read_csv('../../data/processed/arrival_busyness_scores.csv')

#Create id for arrivals busyness for future merge
flights_sample['arrivals_busyness_id'] = 'A-' + flights_sample['Destination Airport (IATA Code)'] + '-' + flights_sample['Flight Month'].astype(str)+ '-' + flights_sample['Flight Weekday'].astype(str)

#Create id for departure busyness for future merge
flights_sample['departure_busyness_id'] = 'D-' + flights_sample['Origin Airport (IATA Code)'] + '-' + flights_sample['Flight Month'].astype(str)+ '-' + flights_sample['Flight Weekday'].astype(str)

# Append the busyness score to the flights_sample table
flights_sample = flights_sample.merge(departure_busyness_scores, left_on="departure_busyness_id", right_on="Departure Busyness ID", how="left")
flights_sample = flights_sample.merge(arrival_busyness_scores, left_on="arrivals_busyness_id", right_on="Arrivals Busyness ID", how="left")

# # for training only
# flights_sample['Arrival Delay (minutes)'] = np.where(flights_sample['Arrival Delay (minutes)'] < 0, 0, flights_sample['Arrival Delay (minutes)'])


### Add additional loading and passenger details

In [21]:
passengers = pd.read_csv("../../data/raw/passengers_w_departuresPerformed_groupedbyMonth(29Nov).csv", index_col=False)

# Create routeid column (used to add enrichment)
flights_sample['routeid'] = flights_sample['Flight Month'].astype(str) + '-' + flights_sample['Operator - Unique Carrier Code'] + '-' + flights_sample['Origin Airport (IATA Code)'] + '-' + flights_sample['Destination Airport (IATA Code)']

# Adding the enrichment
flights_sample = flights_sample.merge(passengers, on="routeid", how="left", suffixes=("_1", "_2"))

In [22]:
flights_sample.head(2)

Unnamed: 0,Marketer - Unique Carrier Code,Operator - Unique Carrier Code,Tail Number,Flight Number,Origin Airport (IATA Code),Destination Airport (IATA Code),Scheduled Departure Time (local time),Departure Delay (minutes),Scheduled Arrival Time (local time),Arrival Delay (minutes),...,averagepayload_lbs,averagefreight_lbs,averagemail_lbs,availableseats,averagepassengers,aircraftgroup,aircrafttype,aircraftconfiguration,distanceinterval_x500mi,serviceclass
0,DL,9E,N909XJ,5011,BOS,BUF,11:38,20,14:49,-1,...,12500.0,0.0,0.0,49.983051,29.966102,6.0,638.0,1.0,1.0,F
1,AA,YX,N120HQ,4416,HVN,CLT,18:56,-8,22:30,-7,...,21803.0,0.0,0.0,76.0,60.666667,6.0,673.0,1.0,2.0,F


### Removing columns that were added along the way

In [23]:
flights_sample = flights_sample.drop(columns=['arrivals_busyness_id', 'departure_busyness_id', 'routeid'])

### Reordering the columns

In [24]:
#reorder the columns of flights_sample
flights_sample = flights_sample[['Flight Weekday', 'Flight Day', 'Flight Month', 'Flight Year', 'Marketer - Unique Carrier Code', 'Operator - Unique Carrier Code', 'Tail Number', 'Different Marketer & Operator Carrier Code', 'distanceinterval_x500mi', 'Scheduled Elapsed Time', 'Distance (miles)', 'Flight Number', 'averagepayload_lbs', 'averagefreight_lbs', 'averagemail_lbs', 'availableseats', 'averagepassengers', 'aircraftgroup', 'aircrafttype', 'aircraftconfiguration', 'serviceclass', 'Origin Airport (IATA Code)', 'Scheduled Departure Time (local time)', 'TAVG (*C)_dep', 'TMAX (*C)_dep', 'TMIN (*C)_dep', 'PRCP (mm)_dep', 'SNOW (mm)_dep', 'Fog_dep', 'Heavy_Fog_dep', 'Thunder_dep', 'Ice_Pellets_dep', 'Hail_dep', 'Glaze_or_Rime_dep', 'Dust_or_Sand_dep', 'Smoke_or_Haze_dep', 'Blowing or Drifting Snow_dep', 'Tornado_or_Funnel_Cloud_dep', 'High_or_Damaging_Winds_dep', 'elevation_ft_dep', 'latitude_deg_dep', 'longitude_deg_dep', 'type_dep', 'local_region_dep', 'Departure Busyness Score', 'Destination Airport (IATA Code)', 'Scheduled Arrival Time (local time)', 'TAVG (*C)_arr', 'TMAX (*C)_arr', 'TMIN (*C)_arr', 'PRCP (mm)_arr', 'SNOW (mm)_arr', 'Fog_arr', 'Heavy_Fog_arr', 'Thunder_arr', 'Ice_Pellets_arr', 'Hail_arr', 'Glaze_or_Rime_arr', 'Dust_or_Sand_arr', 'Smoke_or_Haze_arr', 'Blowing or Drifting Snow_arr', 'Tornado_or_Funnel_Cloud_arr', 'High_or_Damaging_Winds_arr', 'latitude_deg_arr', 'longitude_deg_arr', 'elevation_ft_arr', 'type_arr', 'local_region_arr', 'Arrivals Busyness Score', 'Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)']]

In [25]:
flights_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150186 entries, 0 to 150185
Data columns (total 76 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Flight Weekday                              150186 non-null  int64  
 1   Flight Day                                  150186 non-null  int64  
 2   Flight Month                                150186 non-null  int64  
 3   Flight Year                                 150186 non-null  int64  
 4   Marketer - Unique Carrier Code              150186 non-null  object 
 5   Operator - Unique Carrier Code              150186 non-null  object 
 6   Tail Number                                 150186 non-null  object 
 7   Different Marketer & Operator Carrier Code  150186 non-null  int64  
 8   distanceinterval_x500mi                     145468 non-null  float64
 9   Scheduled Elapsed Time                      150186 non-null  int64  
 

## 3.2 - Variable Transformation

### Modify

#### We'll change Freight and Mail to their proportion of the payload

In [26]:
flights_sample['Proportion of freight to the payload'] = flights_sample['averagefreight_lbs'] / flights_sample['averagepayload_lbs']
flights_sample['Proportion of mail to the payload'] = flights_sample['averagemail_lbs'] / flights_sample['averagepayload_lbs']

#### We'll change Seats to the proportion of filled seats

In [27]:
flights_sample['Proportion of filled seats'] = flights_sample['averagepassengers'] / flights_sample['availableseats']

#### We'll keep only the hour of the Schedules Departure Time (to help group)

In [28]:
flights_sample["Hour of departure"] = flights_sample["Scheduled Departure Time (local time)"].str.split(":").str[0].astype(int)
flights_sample["Hour of arrival"] = flights_sample["Scheduled Arrival Time (local time)"].str.split(":").str[0].astype(int)

#### We'll switch the Average Temperature to a binary value

In [29]:
#we'll switch avg temp to a binary value (1 if over 30, 0 for the rest)
flights_sample["Departure Airport - Avg Temp over 30C"] = flights_sample["TAVG (*C)_dep"].apply(lambda x: 1 if x > 30 else 0)
flights_sample["Arrival Airport - Avg Temp over 30C"] = flights_sample["TAVG (*C)_arr"].apply(lambda x: 1 if x > 30 else 0)

### Drop

In [58]:
flights_sample.drop('averagefreight_lbs', axis=1, inplace=True) 
flights_sample.drop('averagemail_lbs', axis=1, inplace=True) 
flights_sample.drop('averagepassengers', axis=1, inplace=True) 
flights_sample.drop('Scheduled Arrival Time (local time)', axis=1, inplace=True) 
flights_sample.drop('Scheduled Departure Time (local time)', axis=1, inplace=True) 

flights_sample.drop('Flight Year', axis=1, inplace=True) 
flights_sample.drop('Marketer - Unique Carrier Code', axis=1, inplace=True) 
#flights_sample.drop('Operator - Unique Carrier Code', axis=1, inplace=True)
flights_sample.drop('TAVG (*C)_dep', axis=1, inplace=True) 
flights_sample.drop('TAVG (*C)_arr', axis=1, inplace=True) 

flights_sample.drop('Tail Number', axis=1, inplace=True) 
flights_sample.drop('Distance (miles)', axis=1, inplace=True) 
flights_sample.drop('Flight Number', axis=1, inplace=True) 
flights_sample.drop('Origin Airport (IATA Code)', axis=1, inplace=True) 
flights_sample.drop('TMAX (*C)_dep', axis=1, inplace=True) 
flights_sample.drop('TMIN (*C)_dep', axis=1, inplace=True) 
flights_sample.drop('latitude_deg_dep', axis=1, inplace=True) 
flights_sample.drop('longitude_deg_dep', axis=1, inplace=True) 
flights_sample.drop('local_region_dep', axis=1, inplace=True) 
flights_sample.drop('Destination Airport (IATA Code)', axis=1, inplace=True) 
flights_sample.drop('TMAX (*C)_arr', axis=1, inplace=True) 
flights_sample.drop('TMIN (*C)_arr', axis=1, inplace=True) 
flights_sample.drop('latitude_deg_arr', axis=1, inplace=True) 
flights_sample.drop('longitude_deg_arr', axis=1, inplace=True) 
flights_sample.drop('local_region_arr', axis=1, inplace=True) 

KeyError: "['averagefreight_lbs'] not found in axis"

In [None]:
flights_sample.info()

### Bin

### Scale

In [31]:
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler

In [32]:
scaler = MinMaxScaler()
scaler.fit(flights_sample[['Scheduled Elapsed Time']])
flights_sample['Scheduled Elapsed Time'] = scaler.transform(flights_sample[['Scheduled Elapsed Time']])

scaler = MinMaxScaler()
scaler.fit(flights_sample[['averagepayload_lbs']])
flights_sample['averagepayload_lbs'] = scaler.transform(flights_sample[['averagepayload_lbs']])

scaler = MinMaxScaler()
scaler.fit(flights_sample[['PRCP (mm)_dep']])
flights_sample['PRCP (mm)_dep'] = scaler.transform(flights_sample[['PRCP (mm)_dep']])

scaler = MinMaxScaler()
scaler.fit(flights_sample[['SNOW (mm)_dep']])
flights_sample['SNOW (mm)_dep'] = scaler.transform(flights_sample[['SNOW (mm)_dep']])

scaler = MinMaxScaler()
scaler.fit(flights_sample[['elevation_ft_dep']])
flights_sample['elevation_ft_dep'] = scaler.transform(flights_sample[['elevation_ft_dep']])

scaler = MinMaxScaler()
scaler.fit(flights_sample[['Departure Busyness Score']])
flights_sample['Departure Busyness Score'] = scaler.transform(flights_sample[['Departure Busyness Score']])

scaler = MinMaxScaler()
scaler.fit(flights_sample[['PRCP (mm)_arr']])
flights_sample['PRCP (mm)_arr'] = scaler.transform(flights_sample[['PRCP (mm)_arr']])

scaler = MinMaxScaler()
scaler.fit(flights_sample[['SNOW (mm)_arr']])
flights_sample['SNOW (mm)_arr'] = scaler.transform(flights_sample[['SNOW (mm)_arr']])

scaler = MinMaxScaler()
scaler.fit(flights_sample[['elevation_ft_arr']])
flights_sample['elevation_ft_arr'] = scaler.transform(flights_sample[['elevation_ft_arr']])

### Dummy Variables

In [33]:
flights_sample = pd.get_dummies(flights_sample, columns=['Operator - Unique Carrier Code'], drop_first=False)
flights_sample = pd.get_dummies(flights_sample, columns=['serviceclass'], drop_first=False)
flights_sample = pd.get_dummies(flights_sample, columns=['Hour of departure'], drop_first=False)
flights_sample = pd.get_dummies(flights_sample, columns=['Hour of arrival'], drop_first=False)
flights_sample = pd.get_dummies(flights_sample, columns=['type_arr'], drop_first=False)
flights_sample = pd.get_dummies(flights_sample, columns=['type_dep'], drop_first=False)

### Rename

In [34]:
flights_sample.rename(columns={'distanceinterval_x500mi': 'Distance Interval (x500 mi)'}, inplace=True)
flights_sample.rename(columns={'Scheduled Elapsed Time': 'Scheduled Flight Time'}, inplace=True)
flights_sample.rename(columns={'averagepayload_lbs': 'Average Payload (lbs)'}, inplace=True)
flights_sample.rename(columns={'averagefreight_lbs': 'Average Freight (lbs)'}, inplace=True)
flights_sample.rename(columns={'averagemail_lbs': 'Average Mail (lbs)'}, inplace=True)
flights_sample.rename(columns={'availableseats': 'Available Seats (avg)'}, inplace=True)
flights_sample.rename(columns={'averagepassengers': 'Filled Seats (avg)'}, inplace=True)
flights_sample.rename(columns={'aircraftgroup': 'Aircraft Group'}, inplace=True)
flights_sample.rename(columns={'aircrafttype': 'Aircraft Type'}, inplace=True)
flights_sample.rename(columns={'aircraftconfiguration': 'Aircraft Configuration'}, inplace=True)
flights_sample.rename(columns={'serviceclass': 'Service Class'}, inplace=True)
flights_sample.rename(columns={'Scheduled Departure Time (local time)': 'Scheduled Departure Hour'}, inplace=True)
flights_sample.rename(columns={'TAVG (*C)_dep': 'Departure Airport - Average Daily Temperature (*C)'}, inplace=True)
flights_sample.rename(columns={'PRCP (mm)_dep': 'Departure Airport - Precipitation (mm)'}, inplace=True)
flights_sample.rename(columns={'SNOW (mm)_dep': 'Departure Airport - Snow Fall (mm)'}, inplace=True)
flights_sample.rename(columns={'Fog_dep': 'Departure Airport - Fog'}, inplace=True)
flights_sample.rename(columns={'Heavy_Fog_dep': 'Departure Airport - Heavy Fog'}, inplace=True)
flights_sample.rename(columns={'Thunder_dep': 'Departure Airport - Thunder'}, inplace=True)
flights_sample.rename(columns={'Ice_Pellets_dep': 'Departure Airport - Ice Pellets'}, inplace=True)
flights_sample.rename(columns={'Hail_dep': 'Departure Airport - Hail'}, inplace=True)
flights_sample.rename(columns={'Glaze_or_Rime_dep': 'Departure Airport - Glaze or Rime'}, inplace=True)
flights_sample.rename(columns={'Dust_or_Sand_dep': 'Departure Airport - Dust/Sand'}, inplace=True)
flights_sample.rename(columns={'Smoke_or_Haze_dep': 'Departure Airport - Snow / Haze'}, inplace=True)
flights_sample.rename(columns={'Blowing or Drifting Snow_dep': 'Departure Airport - Blowing/Dirfting Snow'}, inplace=True)
flights_sample.rename(columns={'Tornado_or_Funnel_Cloud_dep': 'Departure Airport - Tornado / Funnel Cloud'}, inplace=True)
flights_sample.rename(columns={'High_or_Damaging_Winds_dep': 'Departure Airport - High / Damaging Winds'}, inplace=True)
flights_sample.rename(columns={'elevation_ft_dep': 'Departure Airport - Elevation (ft)'}, inplace=True)
flights_sample.rename(columns={'TAVG (*C)_arr': 'Arrival Airport - Average Daily Temperature (*C)'}, inplace=True)
flights_sample.rename(columns={'PRCP (mm)_arr': 'Arrival Airport - Precipitation (mm)'}, inplace=True)
flights_sample.rename(columns={'SNOW (mm)_arr': 'Arrival Airport - Snow Fall (mm)'}, inplace=True)
flights_sample.rename(columns={'Fog_arr': 'Arrival Airport - Fog'}, inplace=True)
flights_sample.rename(columns={'Heavy_Fog_arr': 'Arrival Airport - Heavy Fog'}, inplace=True)
flights_sample.rename(columns={'Thunder_arr': 'Arrival Airport - Thunder'}, inplace=True)
flights_sample.rename(columns={'Ice_Pellets_arr': 'Arrival Airport - Ice Pellets'}, inplace=True)
flights_sample.rename(columns={'Hail_arr': 'Arrival Airport - Hail'}, inplace=True)
flights_sample.rename(columns={'Glaze_or_Rime_arr': 'Arrival Airport - Glaze or Rime'}, inplace=True)
flights_sample.rename(columns={'Dust_or_Sand_arr': 'Arrival Airport - Dust/Sand'}, inplace=True)
flights_sample.rename(columns={'Smoke_or_Haze_arr': 'Arrival Airport - Snow / Haze'}, inplace=True)
flights_sample.rename(columns={'Blowing or Drifting Snow_arr': 'Arrival Airport - Blowing/Dirfting Snow'}, inplace=True)
flights_sample.rename(columns={'Tornado_or_Funnel_Cloud_arr': 'Arrival Airport - Tornado / Funnel Cloud'}, inplace=True)
flights_sample.rename(columns={'High_or_Damaging_Winds_arr': 'Arrival Airport - High / Damaging Winds'}, inplace=True)
flights_sample.rename(columns={'elevation_ft_arr': 'Arrival Airport - Elevation (ft)'}, inplace=True)
#flights_sample.rename(columns={'type_dep': 'Departure Airport - Type'}, inplace=True)
#flights_sample.rename(columns={'type_arr': 'Arrival Airport - Type'}, inplace=True)

### Handling NaNs (To be completed)

In [35]:
#dataCleaning(flights_sample, code=True, tips=False, orientation=False, formatIssues=False, missingValues=True, duplicateValues=False, outliers=False)

In [36]:
#5920 missing values - Fill with zeros (switch to comment to drop)
flights_sample['Distance Interval (x500 mi)'].fillna(0, inplace=True)
flights_sample['Aircraft Group'].fillna(0, inplace=True) 
flights_sample['Aircraft Type'].fillna(0, inplace=True) 
flights_sample['Aircraft Configuration'].fillna(0, inplace=True) 

# 1 missing value from payload which affects freight and mail (switch to comment to drop)
# flights_sample['Average Payload (lbs)'].fillna(0, inplace=True) 
# flights_sample['Proportion of freight to the payload'].fillna(0, inplace=True) 
# flights_sample['Proportion of mail to the payload'].fillna(0, inplace=True) 

# Proportion of filled seats : 223  missing values  (switch to comment to drop)
# flights_sample['Available Seats (avg)'].fillna(0, inplace=True) 

# Missing values for Precipitation (mm) ~ 2300  missing values each. Replace with 0 (switch to comment to drop)
flights_sample['Departure Airport - Precipitation (mm)'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Precipitation (mm)'].fillna(0, inplace=True) 

# Missing values for Snow Fall (mm) : ~ 14000  missing values. Replace with 0 (switch to comment to drop)
flights_sample['Departure Airport - Snow Fall (mm)'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Snow Fall (mm)'].fillna(0, inplace=True) 

# Missing values for Elevation (ft) : ~40  missing values each. 
## Further investigate ##################################################################################################################################################################
## For now, dropping these rows just to test the mode

# Binary columns that have NaN instead of 0
flights_sample['Departure Airport - Fog'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Heavy Fog'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Thunder'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Ice Pellets'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Hail'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Glaze or Rime'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Dust/Sand'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Snow / Haze'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Blowing/Dirfting Snow'].fillna(0, inplace=True) 
flights_sample['Departure Airport - Tornado / Funnel Cloud'].fillna(0, inplace=True) 
flights_sample['Departure Airport - High / Damaging Winds'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Fog'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Heavy Fog'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Thunder'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Ice Pellets'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Hail'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Glaze or Rime'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Dust/Sand'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Snow / Haze'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Blowing/Dirfting Snow'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - Tornado / Funnel Cloud'].fillna(0, inplace=True) 
flights_sample['Arrival Airport - High / Damaging Winds'].fillna(0, inplace=True) 

# Keep at the end. Drop the remaining NANs
flights_sample.dropna(inplace=True) # drop all remaining rows with missing values

In [37]:
correlations = flights_sample.corr()

In [38]:
correlations.shape

(138, 138)

In [39]:
correlations.to_csv('../../data/processed/flights_sample_corr_matrix.csv')

# STEP 4 - Dimension Reduction & Variable Selection

In [40]:
pd.set_option("display.max_columns", 120)
flights_sample.head()

Unnamed: 0,Flight Weekday,Flight Day,Flight Month,Different Marketer & Operator Carrier Code,Distance Interval (x500 mi),Scheduled Flight Time,Average Payload (lbs),Available Seats (avg),Aircraft Group,Aircraft Type,Aircraft Configuration,Departure Airport - Precipitation (mm),Departure Airport - Snow Fall (mm),Departure Airport - Fog,Departure Airport - Heavy Fog,Departure Airport - Thunder,Departure Airport - Ice Pellets,Departure Airport - Hail,Departure Airport - Glaze or Rime,Departure Airport - Dust/Sand,Departure Airport - Snow / Haze,Departure Airport - Blowing/Dirfting Snow,Departure Airport - Tornado / Funnel Cloud,Departure Airport - High / Damaging Winds,Departure Airport - Elevation (ft),Departure Busyness Score,Arrival Airport - Precipitation (mm),Arrival Airport - Snow Fall (mm),Arrival Airport - Fog,Arrival Airport - Heavy Fog,Arrival Airport - Thunder,Arrival Airport - Ice Pellets,Arrival Airport - Hail,Arrival Airport - Glaze or Rime,Arrival Airport - Dust/Sand,Arrival Airport - Snow / Haze,Arrival Airport - Blowing/Dirfting Snow,Arrival Airport - Tornado / Funnel Cloud,Arrival Airport - High / Damaging Winds,Arrival Airport - Elevation (ft),Arrivals Busyness Score,Arrival Delay (minutes),Departure Delay (minutes),Carrier Delay (minutes),Weather Delay (minutes),National Air System Delay (minutes),Security Delay (minutes),Late Aircraft Delay (minutes),Proportion of freight to the payload,Proportion of mail to the payload,Proportion of filled seats,Departure Airport - Avg Temp over 30C,Arrival Airport - Avg Temp over 30C,Operator - Unique Carrier Code_9E,Operator - Unique Carrier Code_9K,Operator - Unique Carrier Code_AA,Operator - Unique Carrier Code_AS,Operator - Unique Carrier Code_AX,Operator - Unique Carrier Code_B6,Operator - Unique Carrier Code_C5,...,Operator - Unique Carrier Code_YV,Operator - Unique Carrier Code_YX,Operator - Unique Carrier Code_ZW,serviceclass_F,serviceclass_G,serviceclass_L,Hour of departure_0,Hour of departure_1,Hour of departure_2,Hour of departure_3,Hour of departure_4,Hour of departure_5,Hour of departure_6,Hour of departure_7,Hour of departure_8,Hour of departure_9,Hour of departure_10,Hour of departure_11,Hour of departure_12,Hour of departure_13,Hour of departure_14,Hour of departure_15,Hour of departure_16,Hour of departure_17,Hour of departure_18,Hour of departure_19,Hour of departure_20,Hour of departure_21,Hour of departure_22,Hour of departure_23,Hour of arrival_0,Hour of arrival_1,Hour of arrival_2,Hour of arrival_3,Hour of arrival_4,Hour of arrival_5,Hour of arrival_6,Hour of arrival_7,Hour of arrival_8,Hour of arrival_9,Hour of arrival_10,Hour of arrival_11,Hour of arrival_12,Hour of arrival_13,Hour of arrival_14,Hour of arrival_15,Hour of arrival_16,Hour of arrival_17,Hour of arrival_18,Hour of arrival_19,Hour of arrival_20,Hour of arrival_21,Hour of arrival_22,Hour of arrival_23,type_arr_large_airport,type_arr_medium_airport,type_arr_small_airport,type_dep_large_airport,type_dep_medium_airport,type_dep_small_airport
0,3,7,2,1,1.0,0.058824,0.051734,49.983051,6.0,638.0,1.0,0.033554,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.002175,0.09368,0.04723,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.092747,0.961907,-1,20,0,0,0,0,0,0.0,0.0,0.599525,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
1,5,28,12,1,2.0,0.073691,0.12999,76.0,6.0,673.0,1.0,0.000623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001151,0.094101,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.095305,0.902345,-7,-8,0,0,0,0,0,0.0,0.0,0.798246,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0
2,4,1,3,0,1.0,0.039431,0.221656,123.936364,6.0,655.0,1.0,0.002461,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005501,0.087927,0.021583,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130869,1.061627,1,1,0,0,0,0,0,0.001125,0.0,0.818969,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
3,3,4,10,0,1.0,0.02521,0.252965,150.0,6.0,698.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055264,0.102166,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039529,1.064403,8,5,0,0,0,0,0,0.000558,0.0,0.8008,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0
4,4,26,4,1,1.0,0.029089,0.094635,67.0,6.0,631.0,1.0,0.002534,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276577,0.086351,0.001418,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095305,1.036427,11,11,0,0,0,0,0,0.0,0.0,0.89475,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [41]:
flights_sample.to_csv('../../data/processed/flights_enriched.csv',index=False) 

## OLS

Let's see our values in comparison with our different delays

In [42]:
def getOLS(x,y):
    """
    This function will provide you with OLS Regression Results for your dataset.

    Recommended nomenclature for x:
    df.drop(['y_column'], axis = 1) 
    
    Start with your entire DF minus your y value. 
    You can then easily add values here as you drop them.

    Recommended nomenclature for y:
    df['y_column']
    """
    import pandas as pd
    import statsmodels.api as sm
    
    x = sm.add_constant(x)
    model = sm.OLS(y, x).fit()
    predictions = model.predict(x)
    print_model = model.summary()

    print(print_model)

In [43]:
flights_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149936 entries, 0 to 150185
Columns: 138 entries, Flight Weekday to type_dep_small_airport
dtypes: float64(40), int64(13), uint8(85)
memory usage: 73.9 MB


In [44]:
### Values for Arrival
y = flights_sample['Arrival Delay (minutes)']

## All the other delays (R-Squared of 79%)
# X = flights_sample[['Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)']]

## Only the delays we're the most likely to be able to somewhat predict (R-Squared 69.9%)
# X = flights_sample[['Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)']]

## What if we take Departure Delay out? (R-Squared 36.7%).. Carrier Delay accounts for a big chunk of our accuracy
# X = flights_sample[['Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)']]

##-----------------------------------------------------------------

## Are there any features that could directly influence Arrival? Like distance for example?
#X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  
## -> Returned 3%
## Removing columns with a high P_Value
# X = flights_sample.drop(['Flight Day', 'Average Payload (lbs)', 'Available Seats (avg)', 'Aircraft Configuration', 'Departure Airport - Average Daily Temperature (*C)', 'Departure Airport - Ice Pellets', 'Departure Airport - Hail', 'Departure Airport - Glaze or Rime', 'Departure Airport - Dust/Sand', 'Departure Airport - Blowing/Dirfting Snow', 'Departure Airport - Tornado / Funnel Cloud', 'Arrival Airport - Ice Pellets', 'Arrival Airport - Hail', 'Arrival Airport - Dust/Sand', 'Arrival Airport - Snow / Haze', 'Arrival Airport - Blowing/Dirfting Snow', 'Proportion of freight to the payload', 'type_arr_medium_airport', 'type_arr_small_airport', 'Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  
## -> Still 3%
## Removing columns with a high P_Value
# X = flights_sample.drop(['Departure Airport - High / Damaging Winds', 'Arrival Airport - Heavy Fog', 'Arrival Airport - Tornado / Funnel Cloud', 'serviceclass_G', 'Flight Day', 'Average Payload (lbs)', 'Available Seats (avg)', 'Aircraft Configuration', 'Departure Airport - Average Daily Temperature (*C)', 'Departure Airport - Ice Pellets', 'Departure Airport - Hail', 'Departure Airport - Glaze or Rime', 'Departure Airport - Dust/Sand', 'Departure Airport - Blowing/Dirfting Snow', 'Departure Airport - Tornado / Funnel Cloud', 'Arrival Airport - Ice Pellets', 'Arrival Airport - Hail', 'Arrival Airport - Dust/Sand', 'Arrival Airport - Snow / Haze', 'Arrival Airport - Blowing/Dirfting Snow', 'Proportion of freight to the payload', 'type_arr_medium_airport', 'type_arr_small_airport', 'Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  
## -> Still 3%.. all vlaues with low P-Value.. this is unsatisfactry

X = flights_sample[['Flight Weekday', 'Flight Day', 'Flight Month', 'Distance Interval (x500 mi)', 'Scheduled Flight Time', 'Average Payload (lbs)', 'Available Seats (avg)', 'Departure Airport - Precipitation (mm)', 'Departure Airport - Snow Fall (mm)', 'Departure Airport - Fog', 'Departure Airport - Heavy Fog', 'Departure Airport - Thunder', 'Departure Airport - Hail', 'Departure Airport - Glaze or Rime', 'Departure Airport - Snow / Haze', 'Departure Airport - High / Damaging Winds', 'Departure Busyness Score', 'Arrival Airport - Precipitation (mm)', 'Arrival Airport - Snow Fall (mm)', 'Arrival Airport - Fog', 'Arrival Airport - Heavy Fog', 'Arrival Airport - Thunder', 'Arrival Airport - Ice Pellets', 'Arrival Airport - Glaze or Rime', 'Arrival Airport - Blowing/Dirfting Snow', 'Arrival Airport - Tornado / Funnel Cloud', 'Arrival Airport - High / Damaging Winds', 'Arrival Airport - Elevation (ft)', 'Arrivals Busyness Score', 'Proportion of filled seats', 'Arrival Airport - Avg Temp over 30C', 'Operator - Unique Carrier Code_9K', 'Operator - Unique Carrier Code_AA', 'Operator - Unique Carrier Code_AS', 'Operator - Unique Carrier Code_AX', 'Operator - Unique Carrier Code_B6', 'Operator - Unique Carrier Code_C5', 'Operator - Unique Carrier Code_CP', 'Operator - Unique Carrier Code_DL', 'Operator - Unique Carrier Code_F9', 'Operator - Unique Carrier Code_G4', 'Operator - Unique Carrier Code_HA', 'Operator - Unique Carrier Code_MQ', 'Operator - Unique Carrier Code_NK', 'Operator - Unique Carrier Code_OH', 'Operator - Unique Carrier Code_UA', 'Operator - Unique Carrier Code_VX', 'Operator - Unique Carrier Code_WN', 'Operator - Unique Carrier Code_YV', 'Operator - Unique Carrier Code_YX', 'Operator - Unique Carrier Code_ZW', 'serviceclass_G', 'serviceclass_L', 'Hour of departure_0', 'Hour of departure_1', 'Hour of departure_2', 'Hour of departure_3', 'Hour of departure_4', 'Hour of departure_5', 'Hour of departure_6', 'Hour of departure_7', 'Hour of departure_10', 'Hour of departure_11', 'Hour of departure_12', 'Hour of departure_13', 'Hour of departure_14', 'Hour of departure_15', 'Hour of departure_16', 'Hour of departure_17', 'Hour of departure_18', 'Hour of departure_19', 'Hour of departure_21', 'Hour of departure_23', 'Hour of arrival_0', 'Hour of arrival_2', 'Hour of arrival_4', 'Hour of arrival_5', 'Hour of arrival_6', 'Hour of arrival_7', 'Hour of arrival_9', 'Hour of arrival_10', 'Hour of arrival_11', 'Hour of arrival_12', 'Hour of arrival_14', 'Hour of arrival_15', 'Hour of arrival_16', 'Hour of arrival_17', 'Hour of arrival_18', 'Hour of arrival_19', 'Hour of arrival_20', 'Hour of arrival_21', 'Hour of arrival_22', 'Hour of arrival_23', 'type_arr_large_airport', 'type_arr_medium_airport', 'type_dep_medium_airport', 'type_dep_small_airport']]

getOLS(X,y)

                               OLS Regression Results                              
Dep. Variable:     Arrival Delay (minutes)   R-squared:                       0.051
Model:                                 OLS   Adj. R-squared:                  0.050
Method:                      Least Squares   F-statistic:                     82.98
Date:                     Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                             13:23:28   Log-Likelihood:            -6.6657e+05
No. Observations:                   149936   AIC:                         1.333e+06
Df Residuals:                       149838   BIC:                         1.334e+06
Df Model:                               97                                         
Covariance Type:                 nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------

In [45]:
### Values for Departure =======================================================================================================
y = flights_sample['Departure Delay (minutes)']
#X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  



##old try
#X = flights_sample[['Flight Weekday', 'Flight Day', 'Different Marketer & Operator Carrier Code', 'Distance Interval (x500 mi)', 'Scheduled Flight Time', 'Average Payload (lbs)', 'Available Seats (avg)', 'Aircraft Group', 'Aircraft Type', 'Aircraft Configuration', 'Departure Airport - Elevation (ft)', 'Departure Busyness Score', 'Proportion of freight to the payload', 'Proportion of mail to the payload', 'Proportion of filled seats', 'Hour of departure', 'serviceclass_F', 'serviceclass_G', 'serviceclass_L', 'type_dep_medium_airport', 'type_dep_small_airport']]
#X = flights_sample[['Flight Weekday', 'Different Marketer & Operator Carrier Code', 'Distance Interval (x500 mi)', 'Scheduled Flight Time', 'Aircraft Group', 'Aircraft Type', 'Departure Busyness Score', 'Proportion of mail to the payload', 'Proportion of filled seats', 'Hour of departure', 'serviceclass_F', 'serviceclass_G', 'serviceclass_L', 'type_dep_medium_airport']]

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:29   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [46]:
### Values for Carrier =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['Carrier Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:31   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [47]:
### Values for Weather =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['Weather Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:33   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [48]:
### Values for National Air System =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['National Air System Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:35   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [49]:
### Values for Security =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['Security Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:36   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [50]:
### Values for Late Aircraft =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['Late Aircraft Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:38   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [51]:
### Values for Arrival (with all the other delays as X) =======================================================================================================
# X = flights_sample['Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)']
# y = flights_sample['Arrival Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:40   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

## PCA

In [52]:
def runPCA (data, componentsNumber):
    # Import required librairies
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import scale

    # Set data to A
    A = data

    # Scale the data
    A = scale(A, with_std=True)  #Std scaler
    ## can also use a min/max scaler

    # Run a PCA over the data
    pca = PCA(n_components=componentsNumber) #change to the number of components you want to keep
    pca.fit(A)
    A_pca = pca.transform(A)

    return A_pca

In [53]:
X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  

X_pca = runPCA(X, 0.95) #adapt to the level we want (depends of OLS?)

X_pca

array([[ 1.93880596, -0.9822855 , -0.43437029, ..., -0.03172289,
         4.04985247, -1.49347936],
       [ 1.64274599, -1.33996216,  2.1624808 , ..., -0.127034  ,
         0.30045762,  0.27729109],
       [ 0.32612314, -0.45819705,  2.3849544 , ...,  1.14755703,
        -0.31814281, -0.14716974],
       ...,
       [-0.28375089,  3.30072782, -0.03711485, ...,  0.30962126,
         0.21757796,  0.60226291],
       [-1.98642651,  0.68128285, -0.1667396 , ...,  0.22551142,
        -0.22374396, -0.32946884],
       [-1.56006844,  0.35950036,  0.71698232, ..., -0.52137584,
         0.39856498,  0.5690358 ]])

# ML Models

In [54]:
# !pip install xgboost

In [55]:
#import packages
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb

### Predict Arrival Delays

In [56]:
# arrival delay - B_A
#Create variables and dfs
B_A_X = [3, 7, 11, 12, 13, 15, 26, 28, 30, 61, 77, 94, 135, 136]
B_A_y = 41

B_A_X_df = np.array(flights_sample.iloc[:,B_A_X])
B_A_y_df = np.array(flights_sample.iloc[:,B_A_y])

#### Arrival - Try Linear and Polynomial Regression Models

In [57]:
print("# of features used:", B_A_X_df.shape[1])

#Try Linear Regression
B_A_model = LinearRegression()
B_A_model.fit(B_A_X_df, B_A_y_df)
B_A_predictions = B_A_model.predict(B_A_X_df)
print("linear regression score: ", B_A_model.score(B_A_X_df, B_A_y_df))

#Try Ridge Regression
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(B_A_X_df, B_A_y_df)
    y_pred = ridge.predict(B_A_X_df)
    scores.append(ridge.score(B_A_X_df, B_A_y_df))
print("ridge regression scores: ", scores)

#Try Lasso Regression
scores=[]
for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(B_A_X_df, B_A_y_df)
    lasso_pred = lasso.predict(B_A_X_df)
    scores.append(lasso.score(B_A_X_df, B_A_y_df))
print("lasso regression scores: ", scores)

#Try Polynomial Regression
degree = 2
B_A_model = PolynomialFeatures(degree=degree, include_bias=False)
B_A_model.fit(B_A_X_df, B_A_y_df)
B_A_predictions = B_A_model.predict(B_A_X_df)
print("polynomial regression score: ", B_A_model.score(B_A_X_df, B_A_y_df), ", degree: ", degree)
B_A_model.score(B_A_X_df, B_A_y_df)

# of features used: 14
linear regression score:  0.03203470811760567
ridge regression scores:  [0.03203468719802316, 0.03203267754770933, 0.031880372197001794, 0.028994421341863474, 0.02473164555290519]
lasso regression scores:  [0.030483922955258258, 3.0300136875105466e-06, 0.0, 0.0, 0.0]


AttributeError: 'PolynomialFeatures' object has no attribute 'predict'

#### Arrival - Try XGBoost Model

In [None]:
B_A_model = xgb.XGBRegressor(objective='reg:linear', m_estimators=10, seed=123)
B_A_model.fit(B_A_X_df, B_A_y_df)
B_A_predictions = B_A_model(B_A_X_df)
B_A_model.score(B_A_X_df, B_A_y_df)

### Predict Departure Delays

In [None]:
#Departure Predictions
  
#B_D - departure -  create variables to create dataframes to use in model. 
B_D_X = [0, 1, 2, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 136, 137]
B_D_y = 42

B_D_X_df = np.array(flights_sample.iloc[:,B_D_X])
B_D_y_df = np.array(flights_sample.iloc[:,B_D_y])    

In [None]:
#Try Linear Regression
B_D_model = LinearRegression()
B_D_model.fit(B_D_X_df, B_D_y_df)
B_D_predictions = B_D_model.predict(B_D_X_df)
print("linear regression score: ", B_D_model.score(B_D_X_df, B_D_y_df))

#Try Ridge Regression
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(B_D_X_df, B_D_y_df)
    y_pred = ridge.predict(B_D_X_df)
    scores.append(ridge.score(B_D_X_df, B_D_y_df))
print("ridge regression scores: ", scores)

#Try Lasso Regression
scores=[]
for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(B_D_X_df, B_D_y_df)
    lasso_pred = lasso.predict(B_D_X_df)
    scores.append(lasso.score(B_D_X_df, B_D_y_df))
print("lasso regression scores: ", scores)

#Try Polynomial Regression
degree = 3
B_D_X_model = PolynomialFeatures(degree=degree, include_bias=False)
B_D_model.fit(B_D_X_df, B_D_y_df)
B_D_predictions = B_D_model.predict(B_D_X_df)
print("polynomial regression score: ", B_D_model.score(B_D_X_df, B_D_y_df), "degree: ", degree)

### Predict Other Delay Types

In [None]:
# carrier delay - B_C
#Create variables and dfs
B_C_X = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137]
B_C_y = 43

B_C_X_df = flights_sample.iloc[:,B_C_X]
B_C_y_df = flights_sample.iloc[:,B_C_y]

In [None]:
print("# of features used:", B_C_X_df.shape[1])

#Try Linear Regression
B_C_model = LinearRegression()
B_C_model.fit(B_C_X_df, B_C_y_df)
B_C_predictions = B_C_model.predict(B_C_X_df)
print("linear regression score: ", B_C_model.score(B_C_X_df, B_C_y_df))

#Try Ridge Regression
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(B_C_X_df, B_C_y_df)
    y_pred = ridge.predict(B_C_X_df)
    scores.append(ridge.score(B_C_X_df, B_C_y_df))
print("ridge regression scores: ", scores)

#Try Lasso Regression
scores=[]
for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(B_C_X_df, B_C_y_df)
    lasso_pred = lasso.predict(B_C_X_df)
    scores.append(lasso.score(B_C_X_df, B_C_y_df))
print("lasso regression scores: ", scores)

#Try Polynomial Regression
degree = 2
B_C_X_model = PolynomialFeatures(degree=degree, include_bias=False)
B_C_model.fit(B_C_X_df, B_C_y_df)
B_C_predictions = B_C_model.predict(B_C_X_df)
print("polynomial regression score: ", B_C_model.score(B_C_X_df, B_C_y_df), ", degree: ", degree)

In [None]:
# weather delay - B_W
B_W_X = [11, 12, 13, 15, 21, 30]
B_W_y = 44

B_W_X_df = flights_sample.iloc[:,B_W_X]
B_W_y_df = flights_sample.iloc[:,B_W_y]

In [None]:
# NAS delay - B_N
B_N_X = [4, 5, 11, 12, 13, 14, 15, 16, 18, 21, 26, 27, 28, 29, 30, 33, 35, 39, 59, 67, 70, 75, 77, 79, 94, 132, 133]
B_N_y = 45

B_N_X_df = flights_sample.iloc[:,B_N_X]
B_N_y_df = flights_sample.iloc[:,B_N_y]

In [None]:
B_L_X = [4, 5, 6, 7, 11, 12, 13, 14, 15, 25, 26, 27, 28, 30, 61, 77, 88, 89, 90, 91, 94, 96, 97, 99, 114, 117, 119, 120, 123, 124, 125, 126]
B_L_y = 47

B_L_X_df = flights_sample.iloc[:,B_L_X]
B_L_y_df = flights_sample.iloc[:,B_L_y]

# Export to CSV

In [None]:
# if trainingData == True:
#     flights_sample.to_csv('../../data/processed/flights_enriched.csv',index=False) #Training
# else:
#     flights_sample.to_csv('../../data/processed/flights_test_enriched.csv',index=False) #Testing