# Flight Arrival Delay

In [3]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
from pathlib import Path

In [4]:
# Load the data
file_path = Path("Resources/Combined_Flights_2021.csv")
data = pd.read_csv(file_path)

In [5]:
# View data
data.head(10)

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2021-03-03,SkyWest Airlines Inc.,SGU,PHX,False,False,724,714.0,0.0,-10.0,...,724.0,813.0,5.0,843,-25.0,0.0,-2.0,0800-0859,2,0.0
1,2021-03-03,SkyWest Airlines Inc.,PHX,SGU,False,False,922,917.0,0.0,-5.0,...,940.0,1028.0,3.0,1040,-9.0,0.0,-1.0,1000-1059,2,0.0
2,2021-03-03,SkyWest Airlines Inc.,MHT,ORD,False,False,1330,1321.0,0.0,-9.0,...,1336.0,1445.0,16.0,1530,-29.0,0.0,-2.0,1500-1559,4,0.0
3,2021-03-03,SkyWest Airlines Inc.,DFW,TRI,False,False,1645,1636.0,0.0,-9.0,...,1703.0,1955.0,7.0,2010,-8.0,0.0,-1.0,2000-2059,4,0.0
4,2021-03-03,SkyWest Airlines Inc.,PHX,BFL,False,False,1844,1838.0,0.0,-6.0,...,1851.0,1900.0,3.0,1925,-22.0,0.0,-2.0,1900-1959,2,0.0
5,2021-03-03,SkyWest Airlines Inc.,ORD,BNA,False,False,1650,1648.0,0.0,-2.0,...,1707.0,1804.0,4.0,1834,-26.0,0.0,-2.0,1800-1859,2,0.0
6,2021-03-03,SkyWest Airlines Inc.,PSP,PHX,False,False,1652,1651.0,0.0,-1.0,...,1739.0,1924.0,5.0,1902,27.0,1.0,1.0,1900-1959,2,0.0
7,2021-03-03,SkyWest Airlines Inc.,DFW,YUM,False,False,1245,1242.0,0.0,-3.0,...,1314.0,1447.0,5.0,1456,-4.0,0.0,-1.0,1400-1459,5,0.0
8,2021-03-03,SkyWest Airlines Inc.,LBB,PHX,False,False,726,717.0,0.0,-9.0,...,729.0,813.0,8.0,836,-15.0,0.0,-1.0,0800-0859,3,0.0
9,2021-03-03,SkyWest Airlines Inc.,DFW,DRO,False,False,2045,2040.0,0.0,-5.0,...,2105.0,2142.0,2.0,2215,-31.0,0.0,-2.0,2200-2259,3,0.0


In [6]:
data.shape

(6311871, 61)

In [7]:
data = data.dropna()

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6185869 entries, 0 to 6311870
Data columns (total 61 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   FlightDate                               object 
 1   Airline                                  object 
 2   Origin                                   object 
 3   Dest                                     object 
 4   Cancelled                                bool   
 5   Diverted                                 bool   
 6   CRSDepTime                               int64  
 7   DepTime                                  float64
 8   DepDelayMinutes                          float64
 9   DepDelay                                 float64
 10  ArrTime                                  float64
 11  ArrDelayMinutes                          float64
 12  AirTime                                  float64
 13  CRSElapsedTime                           float64
 14  ActualElapsedTime 

In [8]:
# Find total NaNs in dimensions
data.isnull().sum()

FlightDate            0
Airline               0
Origin                0
Dest                  0
Cancelled             0
                     ..
ArrDel15              0
ArrivalDelayGroups    0
ArrTimeBlk            0
DistanceGroup         0
DivAirportLandings    0
Length: 61, dtype: int64

In [9]:
data.isna().sum()

FlightDate            0
Airline               0
Origin                0
Dest                  0
Cancelled             0
                     ..
ArrDel15              0
ArrivalDelayGroups    0
ArrTimeBlk            0
DistanceGroup         0
DivAirportLandings    0
Length: 61, dtype: int64

In [10]:
ml_data = data[["Flight_Number_Marketing_Airline",
    "Cancelled",
    "Diverted",
    "CRSDepTime",
    "DepTime",
    "DepDelayMinutes",
    "OriginAirportID",
    "DestAirportID",
    "TaxiOut",
    "TaxiIn",
    "CRSArrTime",
    "ArrDelay", "ArrDel15"]]

In [11]:
ml_data.dtypes

Flight_Number_Marketing_Airline      int64
Cancelled                             bool
Diverted                              bool
CRSDepTime                           int64
DepTime                            float64
DepDelayMinutes                    float64
OriginAirportID                      int64
DestAirportID                        int64
TaxiOut                            float64
TaxiIn                             float64
CRSArrTime                           int64
ArrDelay                           float64
ArrDel15                           float64
dtype: object

In [13]:
# Encoding ArrDel15 dtype

ml_data = ml_data[(ml_data.ArrDel15 != '')]

In [14]:
ml_data["ArrDel15"] = ml_data["ArrDel15"].astype(int)

In [15]:
ml_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6185869 entries, 0 to 6311870
Data columns (total 13 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Flight_Number_Marketing_Airline  int64  
 1   Cancelled                        bool   
 2   Diverted                         bool   
 3   CRSDepTime                       int64  
 4   DepTime                          float64
 5   DepDelayMinutes                  float64
 6   OriginAirportID                  int64  
 7   DestAirportID                    int64  
 8   TaxiOut                          float64
 9   TaxiIn                           float64
 10  CRSArrTime                       int64  
 11  ArrDelay                         float64
 12  ArrDel15                         int64  
dtypes: bool(2), float64(5), int64(6)
memory usage: 578.1 MB


In [16]:
ml_data.head(3)

Unnamed: 0,Flight_Number_Marketing_Airline,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,OriginAirportID,DestAirportID,TaxiOut,TaxiIn,CRSArrTime,ArrDelay,ArrDel15
0,3133,False,False,724,714.0,0.0,14794,14107,10.0,5.0,843,-25.0,0
1,3134,False,False,922,917.0,0.0,14107,14794,23.0,3.0,1040,-9.0,0
2,3135,False,False,1330,1321.0,0.0,13296,13930,15.0,16.0,1530,-29.0,0


# Split, test and train data

In [17]:
# define the predictor variables and the response variable
X = data[[
    "Flight_Number_Marketing_Airline",
    "Cancelled",
    "Diverted",
    "CRSDepTime",
    "DepTime",
    "DepDelayMinutes",
    "OriginAirportID",
    "DestAirportID",
    "TaxiOut",
    "TaxiIn",
    "CRSArrTime",
    "ArrDelay"]]

y = data['ArrDel15']

X.head() 

Unnamed: 0,Flight_Number_Marketing_Airline,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,OriginAirportID,DestAirportID,TaxiOut,TaxiIn,CRSArrTime,ArrDelay
0,3133,False,False,724,714.0,0.0,14794,14107,10.0,5.0,843,-25.0
1,3134,False,False,922,917.0,0.0,14107,14794,23.0,3.0,1040,-9.0
2,3135,False,False,1330,1321.0,0.0,13296,13930,15.0,16.0,1530,-29.0
3,3136,False,False,1645,1636.0,0.0,11298,15323,27.0,7.0,2010,-8.0
4,3137,False,False,1844,1838.0,0.0,14107,10561,13.0,3.0,1925,-22.0


In [10]:
# Remove all infinite and NaN values

# def clean_dataset(X):
#     assert isinstance (X, pd.DataFrame), "df need to be a pd.DataFrame"
#     X.dropna(inplace=True)
#     indices_to_keep = ~X.isin([np.nan, np.inf, -np.inf]).any(1)
#     return X[indices_to_keep].astype(np.float64)

In [28]:
# Remove NaN from test
# y = y.fillna(0)

y.isnull().sum()

0

In [18]:
y.value_counts()

0.0    5117811
1.0    1068058
Name: ArrDel15, dtype: int64

# Fit and Train the Logistic Regression Model

In [19]:
#split the dataset into training (70%) and testing (30%) sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0) 


In [20]:
y_test.isnull().sum()

0

In [22]:
#instantiate the model
log_regression = LogisticRegression(max_iter=10000)

#fit the model using the training data
log_regression.fit(X_train,y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [23]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Instantiate the model
ros = RandomOverSampler(random_state=1)
# Resample the targets
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0.0: 3582104, 1.0: 3582104})

In [25]:
#instantiate the model
log_regression = LogisticRegression(solver='lbfgs', random_state=1, max_iter=10000)

#fit the model using the training data
log_regression.fit(X_resampled,y_resampled)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score
# Calculated the balanced accuracy score
accuracy_score(y_test, y_pred)

0.9924559250894915

In [27]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual delay", "Actual ontime"], columns=["Predicted delay", "Predicted ontime"])
cm_df

Unnamed: 0,Predicted delay,Predicted ontime
Actual delay,1522982,12725
Actual ontime,1275,318779


In [28]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.99      1.00      1.00      0.99      0.99   1535707
        1.0       0.96      1.00      0.99      0.98      0.99      0.99    320054

avg / total       0.99      0.99      1.00      0.99      0.99      0.99   1855761

