# Flight Arrival Delay

In [1]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# Load the data
file_path = Path("Resources/Combined_Flights_2021.csv")
data = pd.read_csv(file_path)

In [3]:
# View data
data.head(10)

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2021-03-03,SkyWest Airlines Inc.,SGU,PHX,False,False,724,714.0,0.0,-10.0,...,724.0,813.0,5.0,843,-25.0,0.0,-2.0,0800-0859,2,0.0
1,2021-03-03,SkyWest Airlines Inc.,PHX,SGU,False,False,922,917.0,0.0,-5.0,...,940.0,1028.0,3.0,1040,-9.0,0.0,-1.0,1000-1059,2,0.0
2,2021-03-03,SkyWest Airlines Inc.,MHT,ORD,False,False,1330,1321.0,0.0,-9.0,...,1336.0,1445.0,16.0,1530,-29.0,0.0,-2.0,1500-1559,4,0.0
3,2021-03-03,SkyWest Airlines Inc.,DFW,TRI,False,False,1645,1636.0,0.0,-9.0,...,1703.0,1955.0,7.0,2010,-8.0,0.0,-1.0,2000-2059,4,0.0
4,2021-03-03,SkyWest Airlines Inc.,PHX,BFL,False,False,1844,1838.0,0.0,-6.0,...,1851.0,1900.0,3.0,1925,-22.0,0.0,-2.0,1900-1959,2,0.0
5,2021-03-03,SkyWest Airlines Inc.,ORD,BNA,False,False,1650,1648.0,0.0,-2.0,...,1707.0,1804.0,4.0,1834,-26.0,0.0,-2.0,1800-1859,2,0.0
6,2021-03-03,SkyWest Airlines Inc.,PSP,PHX,False,False,1652,1651.0,0.0,-1.0,...,1739.0,1924.0,5.0,1902,27.0,1.0,1.0,1900-1959,2,0.0
7,2021-03-03,SkyWest Airlines Inc.,DFW,YUM,False,False,1245,1242.0,0.0,-3.0,...,1314.0,1447.0,5.0,1456,-4.0,0.0,-1.0,1400-1459,5,0.0
8,2021-03-03,SkyWest Airlines Inc.,LBB,PHX,False,False,726,717.0,0.0,-9.0,...,729.0,813.0,8.0,836,-15.0,0.0,-1.0,0800-0859,3,0.0
9,2021-03-03,SkyWest Airlines Inc.,DFW,DRO,False,False,2045,2040.0,0.0,-5.0,...,2105.0,2142.0,2.0,2215,-31.0,0.0,-2.0,2200-2259,3,0.0


In [4]:
data.shape

(6311871, 61)

In [5]:
# Find total NaNs in dimensions
data.isnull().sum()

FlightDate                 0
Airline                    0
Origin                     0
Dest                       0
Cancelled                  0
                       ...  
ArrDel15              126001
ArrivalDelayGroups    126001
ArrTimeBlk                 0
DistanceGroup              0
DivAirportLandings         2
Length: 61, dtype: int64

In [6]:
# Remove NaN from test
data = data.fillna('')

data.isnull().sum()



FlightDate            0
Airline               0
Origin                0
Dest                  0
Cancelled             0
                     ..
ArrDel15              0
ArrivalDelayGroups    0
ArrTimeBlk            0
DistanceGroup         0
DivAirportLandings    0
Length: 61, dtype: int64

# Split, test and train data

In [7]:
# define the predictor variables and the response variable
X = data[[
    "Flight_Number_Marketing_Airline",
    "Cancelled",
    "Diverted",
    "CRSDepTime",
    "DepTime",
    "DepDelayMinutes",
    "OriginAirportID",
    "DestAirportID",
    "TaxiOut",
    "TaxiIn",
    "CRSArrTime",
    "ArrDelay"]]

y = data['ArrDel15']

X.head() 

Unnamed: 0,Flight_Number_Marketing_Airline,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,OriginAirportID,DestAirportID,TaxiOut,TaxiIn,CRSArrTime,ArrDelay
0,3133,False,False,724,714.0,0.0,14794,14107,10.0,5.0,843,-25.0
1,3134,False,False,922,917.0,0.0,14107,14794,23.0,3.0,1040,-9.0
2,3135,False,False,1330,1321.0,0.0,13296,13930,15.0,16.0,1530,-29.0
3,3136,False,False,1645,1636.0,0.0,11298,15323,27.0,7.0,2010,-8.0
4,3137,False,False,1844,1838.0,0.0,14107,10561,13.0,3.0,1925,-22.0


In [8]:
# Find NaNs
X.isnull().sum()

Flight_Number_Marketing_Airline    0
Cancelled                          0
Diverted                           0
CRSDepTime                         0
DepTime                            0
DepDelayMinutes                    0
OriginAirportID                    0
DestAirportID                      0
TaxiOut                            0
TaxiIn                             0
CRSArrTime                         0
ArrDelay                           0
dtype: int64

In [14]:
# Remove NaN from test
X = X.fillna(0)

X.isnull().sum()

Flight_Number_Marketing_Airline    0
Cancelled                          0
Diverted                           0
CRSDepTime                         0
DepTime                            0
DepDelayMinutes                    0
OriginAirportID                    0
DestAirportID                      0
TaxiOut                            0
TaxiIn                             0
CRSArrTime                         0
ArrDelay                           0
dtype: int64

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6311871 entries, 0 to 6311870
Data columns (total 12 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Flight_Number_Marketing_Airline  int64  
 1   Cancelled                        bool   
 2   Diverted                         bool   
 3   CRSDepTime                       int64  
 4   DepTime                          float64
 5   DepDelayMinutes                  float64
 6   OriginAirportID                  int64  
 7   DestAirportID                    int64  
 8   TaxiOut                          float64
 9   TaxiIn                           float64
 10  CRSArrTime                       int64  
 11  ArrDelay                         float64
dtypes: bool(2), float64(5), int64(5)
memory usage: 493.6 MB


# Convert dtypes of testing dimensions

- We will need to convert the following dimensions `to_numeric` train the Logistic Regression model: "Flight_Number_Marketing_Airline",
    "Cancelled",
    "Diverted",
    "CRSDepTime",
    "DepTime",
    "DepDelayMinutes",
    "OriginAirportID",
    "DestAirportID",
    "TaxiOut",
    "TaxiIn",
    "CRSArrTime",
    "ArrDelay"


In [16]:
# Change all object dimensions to float

X = X.apply(pd.to_numeric)

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6311871 entries, 0 to 6311870
Data columns (total 12 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Flight_Number_Marketing_Airline  int64  
 1   Cancelled                        bool   
 2   Diverted                         bool   
 3   CRSDepTime                       int64  
 4   DepTime                          float64
 5   DepDelayMinutes                  float64
 6   OriginAirportID                  int64  
 7   DestAirportID                    int64  
 8   TaxiOut                          float64
 9   TaxiIn                           float64
 10  CRSArrTime                       int64  
 11  ArrDelay                         float64
dtypes: bool(2), float64(5), int64(5)
memory usage: 493.6 MB


# Fit and Train the Logistic Regression Model

In [17]:
#split the dataset into training (70%) and testing (30%) sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0) 

In [18]:
#instantiate the model
log_regression = LogisticRegression()

#fit the model using the training data
log_regression.fit(X_train,y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

ValueError: Unknown label type: 'unknown'