# Data Cleaning Code Generation

In [1]:
def dataCleaning(df, code=True, tips=False, orientation=True, formatIssues=True, missingValues=True, duplicateValues=True, outliers=True):
    """
    ------------------
    Consolidation of the usual data cleaning steps for a df, November 2022
    Made by Sebastien Garneau, sebastien.garneau@gmail.com
    ------------------
    df: your dataframe

    code: A text template to note your observations as you go. Use the code snippets included in the output. copy-paste into vscode/notepad

    tips: Provides snippets of code to help you clean potential issues in your df. If you prefer this to code
    
    orientation: Provides information about the shape/objects of your data
    
    formatIssues: Provides detailed information on each column to help identify format issues
    
    missingValues: Provides information on missing values
    
    duplicateValues: Provides information on duplicate values
    
    outliers: Provides information on outliers
    """
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

    if code==True:
        print("### CLEANING CODE:")
        print("df = dfX #Change to your df's name")
        print()
        print("#### Change column value:")
        print()
        print()
        print("#### Drop entire column:")
        print()
        print()
        print("#### Change column type:")
        print()
        print()
        print("#### Change column name:")
        print()
        print()
        print("#### Handle missing values:")
        print()
        print()
        print("#### Handle duplicate values:")
        print("# df.drop_duplicates(inplace=True) # drop ALL duplicate rows")
        print()
        print("#### Drop outliers:")
        print()
        print()
        print("#### Other observations / further investigations:")
        print("#")
        print("#")
        print("#")
        print()
        print("df.head() #Final Review")
        print("# dfX = df #Change to your df's name")
        print()
        print("=========================================")
    
    if orientation==True:
        print("ORIENTATION")
        print(df.info())
        print("=========================================")
        print()
        
    
    if formatIssues==True:
        print("FORMAT ISSUES")
        print()
        for col in df.columns:
            if df[col].dtype == 'object' or df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'datetime64':
                print("df.rename(columns={'" + col + "': ''}, inplace=True)", "#rename column")
                print("df['" + col + "'] = df['" + col + "'].replace('old_value', 'new_value')")
                print("df['" + col + "'] = df['" + col + "'].astype('new_type') # new_type can be int64, float64, object, category, datetime64")
                print("df.drop('" + col + "', axis=1, inplace=True)")                
                pd.set_option('display.max_rows', None)
                print(df.groupby(col, sort=True).size())
                pd.reset_option('display.max_rows')
                #display the dtypes of the column
                print("Current Column DType: ", df[col].dtype, "     Do not compare with above. This one will always return int64 as it's the dtype of the count")                
                print("df['" + col + "'] = df['" + col + "'].astype('new_type') # new_type can be int64, float64, object, category, datetime64")
                print()
            #else:
            #    print(col)
            #    print(df[col].describe())
            #    print()

        if tips==True:
            print("TIPS")
            print("To make a correction to a column, use the following syntax:")
            print("df['A'] = df['A'].apply(lambda x: x.replace('old_value', 'new_value'))")
            print()
            print("To change a column data type, use the following syntax:")
            print("df['A'] = pd.to_datetime(df['A']) # for datetime")
            print("df['A'] = df['A'].astype('int64') # for integers")
            print("df['A'] = df['A'].astype('float64') # for floats")
            print("df['A'] = df['A'].astype('category') # for categorical")
            print("df['A'] = df['A'].astype('object') # for object")
            print()
        print("=========================================")
        print()

    if missingValues==True:
        print("MISSING VALUES")
        print()
        for col in df.columns:
            if df[col].isnull().sum() > 0:
                print(col, ":", df[col].isnull().sum(), " missing values")
                print("df.dropna(subset=['" + col + "'], inplace=True)")
                print("df['" + col + "'].fillna(df['" + col + "'].mean(), inplace=True) #fill NA entries with the mean")
                print("df['" + col + "'].fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
                print()
                print(df.loc[df[col].isnull()].head())
                print()
            else:
                print(col, ": No missing values")
                print()
                                    
        if tips==True:
            print()
            print("TIPS")
            print("You can drop rows with missing values using one of the following code:")
            print("df.dropna(subset=['col'], inplace=True) #For a single column")
            print("df.dropna(inplace=True) #For all columns")
            print()
            print("You can fill rows with missing values using one of the following code:")
            print("df['col'].fillna(df['col'].mean(), inplace=True) #fill NA entries with the mean")
            print("df['col'].fillna(0, inplace=True) # fill NA entries with a single value, such as zero")
            print("df['col'].fillna(method='ffill') # forward-fill to propagate the previous value forward")
            print("df['col'].fillna(method='bfill' # back-fill to propagate the next values backward)")
            print()
            print("To view them:")
            print("df.loc[df[col].isnull()].head()")
            print()
        print("=========================================")
        print()

    if duplicateValues==True:
        print("DUPLICATE VALUES")
        print()
        print(df[df.duplicated()].head())
        print()

        if tips==True:
            print("TIPS")
            print("You can drop duplicate rows using the following code:")
            print("df.drop_duplicates(inplace=True)")
            print("df.drop_duplicates(subset=['col'], inplace=True) #For a single column")
            print()
            print("To view them:")
            print("df[df.duplicated()].head()")
            print()
    
        print("=========================================")
        print()

    if outliers==True:
        print("OUTLIERS")
        print()
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                print(col)
                print("-----")
                print("Outlier(s):")
                print("Below ", df[col].mean() - 3*df[col].std(), " -> ", df[df[col] < df[col].mean() - 3*df[col].std()].shape[0], " low outlier(s)")
                print("Above ", df[col].mean() + 3*df[col].std(), " -> ", df[df[col] > df[col].mean() + 3*df[col].std()].shape[0], " high outlier(s)")
                low = df[col].mean() - 3*df[col].std()
                high = df[col].mean() + 3*df[col].std()
                print("df = df[(df['" + col + "'] > " + str(low) + ") & (df['" + col + "'] < " + str(high) + ")]")
                print()
                print(df[col].describe())
                print()
                print("Boxplot")
                sns.boxplot(df[col])
                plt.show()
                print()
                print("Histogram")
                sns.histplot(df[col])
                plt.show()
                print("=========================================")
                print()

        if tips==True:
            print("TIPS")
            print("You can drop outliers using the following code:")
            print("df = df[(df['column'] > lower_bound) & (df['column'] < upper_bound)]")
            print()

# Import Dataset & Librairies

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
#import training data
flights_sample = pd.read_csv("../../data/processed/Enriched-flights_sample_train.csv")
trainingData = True #Switch if that's not the case

In [8]:
flights_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600734 entries, 0 to 600733
Data columns (total 63 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Marketer - Unique Carrier Code              600734 non-null  object 
 1   Operator - Unique Carrier Code              600734 non-null  object 
 2   Different Marketer & Operator Carrier Code  600734 non-null  int64  
 3   Tail Number                                 600734 non-null  object 
 4   Flight Number                               600734 non-null  int64  
 5   Flight Year                                 600734 non-null  int64  
 6   Flight Month                                600734 non-null  int64  
 7   Flight Day                                  600734 non-null  int64  
 8   Flight Weekday                              600734 non-null  int64  
 9   Aircraft group                              600734 non-null  object 
 

In [None]:
correlations = flights_sample.corr()

In [None]:
correlations.shape

(138, 138)

In [39]:
correlations.to_csv('../../data/processed/flights_sample_corr_matrix.csv')

# STEP 4 - Dimension Reduction & Variable Selection

In [9]:
pd.set_option("display.max_columns", 120)
flights_sample.head()

Unnamed: 0,Marketer - Unique Carrier Code,Operator - Unique Carrier Code,Different Marketer & Operator Carrier Code,Tail Number,Flight Number,Flight Year,Flight Month,Flight Day,Flight Weekday,Aircraft group,Aircraft type,Aircraft configuration,Service class,Proportion of freight to the payload,Proportion of mail to the payload,Proportion of filled seats,Average payload (lbs),Average number of available seats,Distance interval (x500mi),Distance (miles),Scheduled Elapsed Time,Origin Airport (IATA Code),Airport Type_origin,Flights Count_origin,Average Flights Per Day_origin,Busyness Score_origin,Scheduled Departure Time (local time),Scheduled hour of departure,Precipitation (mm)_origin,Snowfall (mm)_origin,Maximum Temperature (*C)_origin,Avg Pressure for the day (hPa)_origin,Avg Wind Speed (m/s)_origin,Avg Humidity (%)_origin,Fog_origin,Thunder_origin,Smoke_or_Haze_origin,Destination Airport (IATA Code),Airport Type_destination,Flights Count_destination,Average Flights Per Day_destination,Busyness Score_destination,Scheduled Arrival Time (local time),Scheduled hour of arrival,Precipitation (mm)_destination,Snowfall (mm)_destination,Maximum Temperature (*C)_destination,Avg Pressure for the day (hPa)_destination,Avg Wind Speed (m/s)_destination,Avg Humidity (%)_destination,Fog_destination,Thunder_destination,Smoke_or_Haze_destination,Departure Delay (minutes),Arrival Delay (minutes),Carrier Delay (minutes),Weather Delay (minutes),National Air System Delay (minutes),Security Delay (minutes),Late Aircraft Delay (minutes),cancelled,cancellation_code,diverted
0,DL,9E,1,N909XJ,5011,2019,2,7,3,6.0,638.0,1.0,F,,,0.676929,13250.166667,52.157391,1.0,395,111,BOS,large_airport,419,826.720105,0.506822,11:38,11,15.7,0.0,7.2,1020.0,4.3,96.0,1.0,,1.0,BUF,large_airport,76,159.152431,0.47753,14:49,14,23.1,0.0,15.6,987.5,3.9,100.0,1.0,,,20,-1,0,0,0,0,0,0,,0
1,DL,9E,1,N909XJ,5011,2019,2,7,3,6.0,638.0,1.0,F,,,0.676929,13250.166667,52.157391,1.0,395,111,BOS,large_airport,419,826.720105,0.506822,11:38,11,15.7,0.0,7.2,1020.0,4.3,96.0,1.0,,1.0,BUF,large_airport,77,159.152431,0.483813,14:49,14,23.1,0.0,15.6,987.5,3.9,100.0,1.0,,,20,-1,0,0,0,0,0,0,,0
2,DL,9E,1,N909XJ,5011,2019,2,7,3,6.0,638.0,1.0,F,,,0.676929,13250.166667,52.157391,1.0,395,111,BOS,large_airport,420,826.720105,0.508032,11:38,11,15.7,0.0,7.2,1020.0,4.3,96.0,1.0,,1.0,BUF,large_airport,76,159.152431,0.47753,14:49,14,23.1,0.0,15.6,987.5,3.9,100.0,1.0,,,20,-1,0,0,0,0,0,0,,0
3,DL,9E,1,N909XJ,5011,2019,2,7,3,6.0,638.0,1.0,F,,,0.676929,13250.166667,52.157391,1.0,395,111,BOS,large_airport,420,826.720105,0.508032,11:38,11,15.7,0.0,7.2,1020.0,4.3,96.0,1.0,,1.0,BUF,large_airport,77,159.152431,0.483813,14:49,14,23.1,0.0,15.6,987.5,3.9,100.0,1.0,,,20,-1,0,0,0,0,0,0,,0
4,AA,YX,1,N120HQ,4416,2019,12,28,5,6.0,673.0,1.0,F,,,0.702544,21803.0,76.0,2.0,605,134,HVN,medium_airport,3,5.662286,0.529821,18:56,18,0.5,0.0,10.6,,1.4,,0.0,,,CLT,large_airport,703,1375.864652,0.510951,22:30,22,0.0,0.0,20.6,995.3,1.6,93.0,1.0,,1.0,-8,-7,0,0,0,0,0,0,,0


In [41]:
flights_sample.to_csv('../../data/processed/flights_enriched.csv',index=False) 

## OLS

Let's see our values in comparison with our different delays

In [42]:
def getOLS(x,y):
    """
    This function will provide you with OLS Regression Results for your dataset.

    Recommended nomenclature for x:
    df.drop(['y_column'], axis = 1) 
    
    Start with your entire DF minus your y value. 
    You can then easily add values here as you drop them.

    Recommended nomenclature for y:
    df['y_column']
    """
    import pandas as pd
    import statsmodels.api as sm
    
    x = sm.add_constant(x)
    model = sm.OLS(y, x).fit()
    predictions = model.predict(x)
    print_model = model.summary()

    print(print_model)

In [43]:
flights_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149936 entries, 0 to 150185
Columns: 138 entries, Flight Weekday to type_dep_small_airport
dtypes: float64(40), int64(13), uint8(85)
memory usage: 73.9 MB


In [44]:
### Values for Arrival
y = flights_sample['Arrival Delay (minutes)']

## All the other delays (R-Squared of 79%)
# X = flights_sample[['Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)']]

## Only the delays we're the most likely to be able to somewhat predict (R-Squared 69.9%)
# X = flights_sample[['Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)']]

## What if we take Departure Delay out? (R-Squared 36.7%).. Carrier Delay accounts for a big chunk of our accuracy
# X = flights_sample[['Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)']]

##-----------------------------------------------------------------

## Are there any features that could directly influence Arrival? Like distance for example?
#X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  
## -> Returned 3%
## Removing columns with a high P_Value
# X = flights_sample.drop(['Flight Day', 'Average Payload (lbs)', 'Available Seats (avg)', 'Aircraft Configuration', 'Departure Airport - Average Daily Temperature (*C)', 'Departure Airport - Ice Pellets', 'Departure Airport - Hail', 'Departure Airport - Glaze or Rime', 'Departure Airport - Dust/Sand', 'Departure Airport - Blowing/Dirfting Snow', 'Departure Airport - Tornado / Funnel Cloud', 'Arrival Airport - Ice Pellets', 'Arrival Airport - Hail', 'Arrival Airport - Dust/Sand', 'Arrival Airport - Snow / Haze', 'Arrival Airport - Blowing/Dirfting Snow', 'Proportion of freight to the payload', 'type_arr_medium_airport', 'type_arr_small_airport', 'Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  
## -> Still 3%
## Removing columns with a high P_Value
# X = flights_sample.drop(['Departure Airport - High / Damaging Winds', 'Arrival Airport - Heavy Fog', 'Arrival Airport - Tornado / Funnel Cloud', 'serviceclass_G', 'Flight Day', 'Average Payload (lbs)', 'Available Seats (avg)', 'Aircraft Configuration', 'Departure Airport - Average Daily Temperature (*C)', 'Departure Airport - Ice Pellets', 'Departure Airport - Hail', 'Departure Airport - Glaze or Rime', 'Departure Airport - Dust/Sand', 'Departure Airport - Blowing/Dirfting Snow', 'Departure Airport - Tornado / Funnel Cloud', 'Arrival Airport - Ice Pellets', 'Arrival Airport - Hail', 'Arrival Airport - Dust/Sand', 'Arrival Airport - Snow / Haze', 'Arrival Airport - Blowing/Dirfting Snow', 'Proportion of freight to the payload', 'type_arr_medium_airport', 'type_arr_small_airport', 'Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  
## -> Still 3%.. all vlaues with low P-Value.. this is unsatisfactry

X = flights_sample[['Flight Weekday', 'Flight Day', 'Flight Month', 'Distance Interval (x500 mi)', 'Scheduled Flight Time', 'Average Payload (lbs)', 'Available Seats (avg)', 'Departure Airport - Precipitation (mm)', 'Departure Airport - Snow Fall (mm)', 'Departure Airport - Fog', 'Departure Airport - Heavy Fog', 'Departure Airport - Thunder', 'Departure Airport - Hail', 'Departure Airport - Glaze or Rime', 'Departure Airport - Snow / Haze', 'Departure Airport - High / Damaging Winds', 'Departure Busyness Score', 'Arrival Airport - Precipitation (mm)', 'Arrival Airport - Snow Fall (mm)', 'Arrival Airport - Fog', 'Arrival Airport - Heavy Fog', 'Arrival Airport - Thunder', 'Arrival Airport - Ice Pellets', 'Arrival Airport - Glaze or Rime', 'Arrival Airport - Blowing/Dirfting Snow', 'Arrival Airport - Tornado / Funnel Cloud', 'Arrival Airport - High / Damaging Winds', 'Arrival Airport - Elevation (ft)', 'Arrivals Busyness Score', 'Proportion of filled seats', 'Arrival Airport - Avg Temp over 30C', 'Operator - Unique Carrier Code_9K', 'Operator - Unique Carrier Code_AA', 'Operator - Unique Carrier Code_AS', 'Operator - Unique Carrier Code_AX', 'Operator - Unique Carrier Code_B6', 'Operator - Unique Carrier Code_C5', 'Operator - Unique Carrier Code_CP', 'Operator - Unique Carrier Code_DL', 'Operator - Unique Carrier Code_F9', 'Operator - Unique Carrier Code_G4', 'Operator - Unique Carrier Code_HA', 'Operator - Unique Carrier Code_MQ', 'Operator - Unique Carrier Code_NK', 'Operator - Unique Carrier Code_OH', 'Operator - Unique Carrier Code_UA', 'Operator - Unique Carrier Code_VX', 'Operator - Unique Carrier Code_WN', 'Operator - Unique Carrier Code_YV', 'Operator - Unique Carrier Code_YX', 'Operator - Unique Carrier Code_ZW', 'serviceclass_G', 'serviceclass_L', 'Hour of departure_0', 'Hour of departure_1', 'Hour of departure_2', 'Hour of departure_3', 'Hour of departure_4', 'Hour of departure_5', 'Hour of departure_6', 'Hour of departure_7', 'Hour of departure_10', 'Hour of departure_11', 'Hour of departure_12', 'Hour of departure_13', 'Hour of departure_14', 'Hour of departure_15', 'Hour of departure_16', 'Hour of departure_17', 'Hour of departure_18', 'Hour of departure_19', 'Hour of departure_21', 'Hour of departure_23', 'Hour of arrival_0', 'Hour of arrival_2', 'Hour of arrival_4', 'Hour of arrival_5', 'Hour of arrival_6', 'Hour of arrival_7', 'Hour of arrival_9', 'Hour of arrival_10', 'Hour of arrival_11', 'Hour of arrival_12', 'Hour of arrival_14', 'Hour of arrival_15', 'Hour of arrival_16', 'Hour of arrival_17', 'Hour of arrival_18', 'Hour of arrival_19', 'Hour of arrival_20', 'Hour of arrival_21', 'Hour of arrival_22', 'Hour of arrival_23', 'type_arr_large_airport', 'type_arr_medium_airport', 'type_dep_medium_airport', 'type_dep_small_airport']]

getOLS(X,y)

                               OLS Regression Results                              
Dep. Variable:     Arrival Delay (minutes)   R-squared:                       0.051
Model:                                 OLS   Adj. R-squared:                  0.050
Method:                      Least Squares   F-statistic:                     82.98
Date:                     Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                             13:23:28   Log-Likelihood:            -6.6657e+05
No. Observations:                   149936   AIC:                         1.333e+06
Df Residuals:                       149838   BIC:                         1.334e+06
Df Model:                               97                                         
Covariance Type:                 nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------

In [45]:
### Values for Departure =======================================================================================================
y = flights_sample['Departure Delay (minutes)']
#X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  



##old try
#X = flights_sample[['Flight Weekday', 'Flight Day', 'Different Marketer & Operator Carrier Code', 'Distance Interval (x500 mi)', 'Scheduled Flight Time', 'Average Payload (lbs)', 'Available Seats (avg)', 'Aircraft Group', 'Aircraft Type', 'Aircraft Configuration', 'Departure Airport - Elevation (ft)', 'Departure Busyness Score', 'Proportion of freight to the payload', 'Proportion of mail to the payload', 'Proportion of filled seats', 'Hour of departure', 'serviceclass_F', 'serviceclass_G', 'serviceclass_L', 'type_dep_medium_airport', 'type_dep_small_airport']]
#X = flights_sample[['Flight Weekday', 'Different Marketer & Operator Carrier Code', 'Distance Interval (x500 mi)', 'Scheduled Flight Time', 'Aircraft Group', 'Aircraft Type', 'Departure Busyness Score', 'Proportion of mail to the payload', 'Proportion of filled seats', 'Hour of departure', 'serviceclass_F', 'serviceclass_G', 'serviceclass_L', 'type_dep_medium_airport']]

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:29   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [46]:
### Values for Carrier =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['Carrier Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:31   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [47]:
### Values for Weather =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['Weather Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:33   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [48]:
### Values for National Air System =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['National Air System Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:35   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [49]:
### Values for Security =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['Security Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:36   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [50]:
### Values for Late Aircraft =======================================================================================================
# X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)
# y = flights_sample['Late Aircraft Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:38   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [51]:
### Values for Arrival (with all the other delays as X) =======================================================================================================
# X = flights_sample['Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)']
# y = flights_sample['Arrival Delay (minutes)']

getOLS(X,y)

                                OLS Regression Results                               
Dep. Variable:     Departure Delay (minutes)   R-squared:                       0.050
Model:                                   OLS   Adj. R-squared:                  0.049
Method:                        Least Squares   F-statistic:                     80.93
Date:                       Thu, 01 Dec 2022   Prob (F-statistic):               0.00
Time:                               13:23:40   Log-Likelihood:            -6.3967e+05
No. Observations:                     149936   AIC:                         1.280e+06
Df Residuals:                         149838   BIC:                         1.281e+06
Df Model:                                 97                                         
Covariance Type:                   nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

## PCA

In [52]:
def runPCA (data, componentsNumber):
    # Import required librairies
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import scale

    # Set data to A
    A = data

    # Scale the data
    A = scale(A, with_std=True)  #Std scaler
    ## can also use a min/max scaler

    # Run a PCA over the data
    pca = PCA(n_components=componentsNumber) #change to the number of components you want to keep
    pca.fit(A)
    A_pca = pca.transform(A)

    return A_pca

In [53]:
X = flights_sample.drop(['Arrival Delay (minutes)', 'Departure Delay (minutes)', 'Carrier Delay (minutes)', 'Weather Delay (minutes)', 'National Air System Delay (minutes)', 'Security Delay (minutes)', 'Late Aircraft Delay (minutes)'], axis = 1)  

X_pca = runPCA(X, 0.95) #adapt to the level we want (depends of OLS?)

X_pca

array([[ 1.93880596, -0.9822855 , -0.43437029, ..., -0.03172289,
         4.04985247, -1.49347936],
       [ 1.64274599, -1.33996216,  2.1624808 , ..., -0.127034  ,
         0.30045762,  0.27729109],
       [ 0.32612314, -0.45819705,  2.3849544 , ...,  1.14755703,
        -0.31814281, -0.14716974],
       ...,
       [-0.28375089,  3.30072782, -0.03711485, ...,  0.30962126,
         0.21757796,  0.60226291],
       [-1.98642651,  0.68128285, -0.1667396 , ...,  0.22551142,
        -0.22374396, -0.32946884],
       [-1.56006844,  0.35950036,  0.71698232, ..., -0.52137584,
         0.39856498,  0.5690358 ]])

# ML Models

In [54]:
# !pip install xgboost

In [55]:
#import packages
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb

### Predict Arrival Delays

In [56]:
# arrival delay - B_A
#Create variables and dfs
B_A_X = [3, 7, 11, 12, 13, 15, 26, 28, 30, 61, 77, 94, 135, 136]
B_A_y = 41

B_A_X_df = np.array(flights_sample.iloc[:,B_A_X])
B_A_y_df = np.array(flights_sample.iloc[:,B_A_y])

#### Arrival - Try Linear and Polynomial Regression Models

In [57]:
print("# of features used:", B_A_X_df.shape[1])

#Try Linear Regression
B_A_model = LinearRegression()
B_A_model.fit(B_A_X_df, B_A_y_df)
B_A_predictions = B_A_model.predict(B_A_X_df)
print("linear regression score: ", B_A_model.score(B_A_X_df, B_A_y_df))

#Try Ridge Regression
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(B_A_X_df, B_A_y_df)
    y_pred = ridge.predict(B_A_X_df)
    scores.append(ridge.score(B_A_X_df, B_A_y_df))
print("ridge regression scores: ", scores)

#Try Lasso Regression
scores=[]
for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(B_A_X_df, B_A_y_df)
    lasso_pred = lasso.predict(B_A_X_df)
    scores.append(lasso.score(B_A_X_df, B_A_y_df))
print("lasso regression scores: ", scores)

#Try Polynomial Regression
degree = 2
B_A_model = PolynomialFeatures(degree=degree, include_bias=False)
B_A_model.fit(B_A_X_df, B_A_y_df)
B_A_predictions = B_A_model.predict(B_A_X_df)
print("polynomial regression score: ", B_A_model.score(B_A_X_df, B_A_y_df), ", degree: ", degree)
B_A_model.score(B_A_X_df, B_A_y_df)

# of features used: 14
linear regression score:  0.03203470811760567
ridge regression scores:  [0.03203468719802316, 0.03203267754770933, 0.031880372197001794, 0.028994421341863474, 0.02473164555290519]
lasso regression scores:  [0.030483922955258258, 3.0300136875105466e-06, 0.0, 0.0, 0.0]


AttributeError: 'PolynomialFeatures' object has no attribute 'predict'

#### Arrival - Try XGBoost Model

In [None]:
B_A_model = xgb.XGBRegressor(objective='reg:linear', m_estimators=10, seed=123)
B_A_model.fit(B_A_X_df, B_A_y_df)
B_A_predictions = B_A_model(B_A_X_df)
B_A_model.score(B_A_X_df, B_A_y_df)

### Predict Departure Delays

In [None]:
#Departure Predictions
  
#B_D - departure -  create variables to create dataframes to use in model. 
B_D_X = [0, 1, 2, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 136, 137]
B_D_y = 42

B_D_X_df = np.array(flights_sample.iloc[:,B_D_X])
B_D_y_df = np.array(flights_sample.iloc[:,B_D_y])    

In [None]:
#Try Linear Regression
B_D_model = LinearRegression()
B_D_model.fit(B_D_X_df, B_D_y_df)
B_D_predictions = B_D_model.predict(B_D_X_df)
print("linear regression score: ", B_D_model.score(B_D_X_df, B_D_y_df))

#Try Ridge Regression
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(B_D_X_df, B_D_y_df)
    y_pred = ridge.predict(B_D_X_df)
    scores.append(ridge.score(B_D_X_df, B_D_y_df))
print("ridge regression scores: ", scores)

#Try Lasso Regression
scores=[]
for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(B_D_X_df, B_D_y_df)
    lasso_pred = lasso.predict(B_D_X_df)
    scores.append(lasso.score(B_D_X_df, B_D_y_df))
print("lasso regression scores: ", scores)

#Try Polynomial Regression
degree = 3
B_D_X_model = PolynomialFeatures(degree=degree, include_bias=False)
B_D_model.fit(B_D_X_df, B_D_y_df)
B_D_predictions = B_D_model.predict(B_D_X_df)
print("polynomial regression score: ", B_D_model.score(B_D_X_df, B_D_y_df), "degree: ", degree)

### Predict Other Delay Types

In [None]:
# carrier delay - B_C
#Create variables and dfs
B_C_X = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137]
B_C_y = 43

B_C_X_df = flights_sample.iloc[:,B_C_X]
B_C_y_df = flights_sample.iloc[:,B_C_y]

In [None]:
print("# of features used:", B_C_X_df.shape[1])

#Try Linear Regression
B_C_model = LinearRegression()
B_C_model.fit(B_C_X_df, B_C_y_df)
B_C_predictions = B_C_model.predict(B_C_X_df)
print("linear regression score: ", B_C_model.score(B_C_X_df, B_C_y_df))

#Try Ridge Regression
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(B_C_X_df, B_C_y_df)
    y_pred = ridge.predict(B_C_X_df)
    scores.append(ridge.score(B_C_X_df, B_C_y_df))
print("ridge regression scores: ", scores)

#Try Lasso Regression
scores=[]
for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(B_C_X_df, B_C_y_df)
    lasso_pred = lasso.predict(B_C_X_df)
    scores.append(lasso.score(B_C_X_df, B_C_y_df))
print("lasso regression scores: ", scores)

#Try Polynomial Regression
degree = 2
B_C_X_model = PolynomialFeatures(degree=degree, include_bias=False)
B_C_model.fit(B_C_X_df, B_C_y_df)
B_C_predictions = B_C_model.predict(B_C_X_df)
print("polynomial regression score: ", B_C_model.score(B_C_X_df, B_C_y_df), ", degree: ", degree)

In [None]:
# weather delay - B_W
B_W_X = [11, 12, 13, 15, 21, 30]
B_W_y = 44

B_W_X_df = flights_sample.iloc[:,B_W_X]
B_W_y_df = flights_sample.iloc[:,B_W_y]

In [None]:
# NAS delay - B_N
B_N_X = [4, 5, 11, 12, 13, 14, 15, 16, 18, 21, 26, 27, 28, 29, 30, 33, 35, 39, 59, 67, 70, 75, 77, 79, 94, 132, 133]
B_N_y = 45

B_N_X_df = flights_sample.iloc[:,B_N_X]
B_N_y_df = flights_sample.iloc[:,B_N_y]

In [None]:
B_L_X = [4, 5, 6, 7, 11, 12, 13, 14, 15, 25, 26, 27, 28, 30, 61, 77, 88, 89, 90, 91, 94, 96, 97, 99, 114, 117, 119, 120, 123, 124, 125, 126]
B_L_y = 47

B_L_X_df = flights_sample.iloc[:,B_L_X]
B_L_y_df = flights_sample.iloc[:,B_L_y]

# Export to CSV

In [None]:
# if trainingData == True:
#     flights_sample.to_csv('../../data/processed/flights_enriched.csv',index=False) #Training
# else:
#     flights_sample.to_csv('../../data/processed/flights_test_enriched.csv',index=False) #Testing