In [1]:
# for data handling
import pandas as pd
import numpy as np

# train-test split
from sklearn.model_selection import train_test_split

# loss functions for today
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# stuff for evaluating classifiers
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt # for displaying a pretty confusion matrix

# dummy models for comparison
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier

# regression models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# classification models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

Due to an error in the dataset the airport identifiers from the entirety of October are compromised of intergers instead of the airport ID. This is quite troublesome to fix which is why we've decided to remove the rows from October even though this will defininetly decrease the accuracy of our model.

In [2]:
flight_df = pd.read_csv('flights.csv', dtype={'DESTINATION_AIRPORT': str, 'ORIGIN_AIRPORT': str})

# Exploratory

In [10]:
flight_df.shape

(5819079, 31)

In [12]:
flight_df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [13]:
flight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5819079 entries, 0 to 5819078
Data columns (total 31 columns):
 #   Column               Dtype  
---  ------               -----  
 0   YEAR                 int64  
 1   MONTH                int64  
 2   DAY                  int64  
 3   DAY_OF_WEEK          int64  
 4   AIRLINE              object 
 5   FLIGHT_NUMBER        int64  
 6   TAIL_NUMBER          object 
 7   ORIGIN_AIRPORT       object 
 8   DESTINATION_AIRPORT  object 
 9   SCHEDULED_DEPARTURE  int64  
 10  DEPARTURE_TIME       float64
 11  DEPARTURE_DELAY      float64
 12  TAXI_OUT             float64
 13  WHEELS_OFF           float64
 14  SCHEDULED_TIME       float64
 15  ELAPSED_TIME         float64
 16  AIR_TIME             float64
 17  DISTANCE             int64  
 18  WHEELS_ON            float64
 19  TAXI_IN              float64
 20  SCHEDULED_ARRIVAL    int64  
 21  ARRIVAL_TIME         float64
 22  ARRIVAL_DELAY        float64
 23  DIVERTED             int64  
 24

In [15]:
flight_df['ARRIVAL_DELAY'].max()

1971.0

Potential outlier - remove?

# Preprocessing

According to the FAA a flight is considered delayed it is delayed by 15 minutes or more.

In [3]:
df_fd = flight_df.copy()
df_fd = df_fd[df_fd.ARRIVAL_DELAY > 15]

We'll clear out some irrelevant columns and retain those we need for our prediction analysis.

In [4]:
# We will not consider cancelled or diverted flights as we are only concerned with delays
df_fd = df_fd[df_fd.CANCELLED != 1]
df_fd = df_fd[df_fd.DIVERTED != 1]

# Drop all columns that do not add interpretability to the model
df_fd = df_fd.drop(['YEAR', 'DEPARTURE_DELAY', 'DEPARTURE_TIME', 'TAXI_OUT','WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME',
                  'WHEELS_ON','TAXI_IN','ARRIVAL_TIME','DIVERTED','CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY',
                   'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'], axis=1)

# Drop TAIL_NUMBER for now due to value error
df_fd = df_fd.drop(['TAIL_NUMBER'], axis=1)
    
# Fill NaNs - just to make sure
df_fd = df_fd.fillna(0)

df_fd.head(30)

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY
27,1,1,4,NK,597,MSP,FLL,115,207.0,1487,542,25.0
30,1,1,4,NK,168,PHX,ORD,125,204.0,1440,549,43.0
50,1,1,4,B6,1030,BQN,MCO,307,173.0,1129,500,20.0
52,1,1,4,B6,2134,SJU,MCO,400,185.0,1189,605,85.0
55,1,1,4,B6,2276,SJU,BDL,438,241.0,1666,739,89.0
70,1,1,4,AA,1057,DFW,MIA,515,161.0,1121,856,102.0
73,1,1,4,US,425,PDX,PHX,520,150.0,1009,850,60.0
74,1,1,4,AA,89,IAH,MIA,520,141.0,964,841,54.0
86,1,1,4,AA,328,DEN,DFW,530,125.0,641,835,66.0
92,1,1,4,UA,1532,SFO,DEN,531,146.0,967,857,26.0


Encoding

In [5]:
def encode_and_bind_all(original_dataframe, features_to_encode):
    dummies = pd.get_dummies(original_dataframe[features_to_encode])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop(features_to_encode, axis=1)
    return res

features_to_encode = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
df_fd_enc = encode_and_bind_all(df_fd, features_to_encode)
df_fd_enc.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,SCHEDULED_DEPARTURE,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,AIRLINE_AA,...,DESTINATION_AIRPORT_TYS,DESTINATION_AIRPORT_UST,DESTINATION_AIRPORT_VEL,DESTINATION_AIRPORT_VLD,DESTINATION_AIRPORT_VPS,DESTINATION_AIRPORT_WRG,DESTINATION_AIRPORT_WYS,DESTINATION_AIRPORT_XNA,DESTINATION_AIRPORT_YAK,DESTINATION_AIRPORT_YUM
27,1,1,4,597,115,207.0,1487,542,25.0,0,...,0,0,0,0,0,0,0,0,0,0
30,1,1,4,168,125,204.0,1440,549,43.0,0,...,0,0,0,0,0,0,0,0,0,0
50,1,1,4,1030,307,173.0,1129,500,20.0,0,...,0,0,0,0,0,0,0,0,0,0
52,1,1,4,2134,400,185.0,1189,605,85.0,0,...,0,0,0,0,0,0,0,0,0,0
55,1,1,4,2276,438,241.0,1666,739,89.0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df_fd_enc.shape

(1023498, 1271)

Train-Test split

In [9]:
y = df_fd_enc['ARRIVAL_DELAY']
X = df_fd_enc.drop('ARRIVAL_DELAY', axis=1)

#df_fd_enc[['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
#          'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME', 'DISTANCE', 'SCHEDULED_ARRIVAL']]

Creating training and test size

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Regression

Code should be working, however requires a lot of memory unless we downsize the amount of rows.

In [12]:
fd_ln = LinearRegression()

fd_ln.fit(X_train, y_train)

MemoryError: Unable to allocate 6.78 GiB for an array with shape (1270, 716448) and data type float64