In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("data/Flight_delay.csv")
df.head()

Unnamed: 0,DayOfWeek,Date,DepTime,ArrTime,CRSArrTime,UniqueCarrier,Airline,FlightNum,TailNum,ActualElapsedTime,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,4,03-01-2019,1829,1959,1925,WN,Southwest Airlines Co.,3920,N464WN,90,...,3,10,0,N,0,2,0,0,0,32
1,4,03-01-2019,1937,2037,1940,WN,Southwest Airlines Co.,509,N763SW,240,...,3,7,0,N,0,10,0,0,0,47
2,4,03-01-2019,1644,1845,1725,WN,Southwest Airlines Co.,1333,N334SW,121,...,6,8,0,N,0,8,0,0,0,72
3,4,03-01-2019,1452,1640,1625,WN,Southwest Airlines Co.,675,N286WN,228,...,7,8,0,N,0,3,0,0,0,12
4,4,03-01-2019,1323,1526,1510,WN,Southwest Airlines Co.,4,N674AA,123,...,4,9,0,N,0,0,0,0,0,16


In [3]:
#gets only the needed columns
df=df[['DayOfWeek','Date','CRSArrTime','Airline','CRSElapsedTime','Origin','Dest','Distance','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']]
df.dtypes

DayOfWeek             int64
Date                 object
CRSArrTime            int64
Airline              object
CRSElapsedTime        int64
Origin               object
Dest                 object
Distance              int64
CarrierDelay          int64
WeatherDelay          int64
NASDelay              int64
SecurityDelay         int64
LateAircraftDelay     int64
dtype: object

In [4]:
#combine all delay columns into one DelayTime column
df['DelayTime']=df['CarrierDelay']+df['WeatherDelay']+df['NASDelay']+df['SecurityDelay']+df['LateAircraftDelay']
df['DelayTime']

0         34
1         57
2         80
3         15
4         16
          ..
484546    27
484547    39
484548    47
484549    26
484550    18
Name: DelayTime, Length: 484551, dtype: int64

In [5]:
#drop extraneous delay columns
df=df.drop(['CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'],axis=1)
df.head()

Unnamed: 0,DayOfWeek,Date,CRSArrTime,Airline,CRSElapsedTime,Origin,Dest,Distance,DelayTime
0,4,03-01-2019,1925,Southwest Airlines Co.,90,IND,BWI,515,34
1,4,03-01-2019,1940,Southwest Airlines Co.,250,IND,LAS,1591,57
2,4,03-01-2019,1725,Southwest Airlines Co.,135,IND,MCO,828,80
3,4,03-01-2019,1625,Southwest Airlines Co.,240,IND,PHX,1489,15
4,4,03-01-2019,1510,Southwest Airlines Co.,135,IND,TPA,838,16


In [6]:
#Convert hhmm columns into total mins
df[['CRSArrTime']]=df[['CRSArrTime']].apply(lambda x: x//100*60+x%100)
df['CRSArrTime']


0         1165
1         1180
2         1045
3          985
4          910
          ... 
484546    1155
484547    1155
484548    1155
484549    1155
484550    1155
Name: CRSArrTime, Length: 484551, dtype: int64

In [7]:
from datetime import datetime
epoch = datetime(1970, 1, 1)
df['Date']=df['Date'].apply(lambda x: (datetime.strptime(x, '%d-%m-%Y')-epoch).days)
df['Date']

0         17899
1         17899
2         17899
3         17899
4         17899
          ...  
484546    18060
484547    18061
484548    18064
484549    18069
484550    18070
Name: Date, Length: 484551, dtype: int64

In [8]:
from sklearn.preprocessing import OrdinalEncoder

airport_enc = OrdinalEncoder()
airline_enc = OrdinalEncoder()
airport_cols=['Origin','Dest']
df[airport_cols]=airport_enc.fit_transform(df[airport_cols])
df[['Airline']]=airline_enc.fit_transform(df[['Airline']])

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm

class DelayNetwork(nn.Module):
    def __init__(self,input_size):
        super().__init__()

        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.BatchNorm1d(input_size),
            nn.Linear(input_size,128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64,32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Linear(32,16),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            nn.Linear(16,1)
        )

    def forward(self,x):
        x = self.flatten(x)
        x = self.linear_relu_stack(x)

        return x

class DelayDataset(torch.utils.data.Dataset):
    def __init__(self,data):
        self.features=data.drop(['DelayTime'],axis=1)
        self.labels=data['DelayTime']
        #print(f"features: {self.features.shape}\nlabels: {self.labels.shape}\n")

    def __len__(self):
        return len(self.features)
    
    def __getitem__(self,idx):

        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        feature_items=self.features.iloc[[idx]].to_numpy()
        label_items=self.labels.iloc[[idx]].to_numpy()

        x=torch.tensor(feature_items,dtype=torch.float32)
        y=torch.tensor(label_items,dtype=torch.float32)       

        return x,y

def train_loop(model,dataloader,optimizer,epoch):
    model.train()
    for batch in tqdm(dataloader, desc=f"Epoch {epoch}: "):
        inputs, labels = batch
        labels=torch.flatten(labels)
        pred = model(inputs)
        
        loss=F.l1_loss(torch.ravel(pred), labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def test_loop(model,dataloader):
    model.eval()
    size = len(dataloader)
    test_loss=0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Testing: "):
            inputs,labels=batch
            labels=torch.flatten(labels)
            pred=model(inputs)

            test_loss+=F.l1_loss(torch.ravel(pred),labels).item()
            
    test_loss=test_loss / size
    print(f'Test loss: {test_loss}')



In [10]:
from sklearn.model_selection import train_test_split

model=DelayNetwork(df.shape[1]-1)
train_df,test_df= train_test_split(df,test_size=.2,random_state=42)
print(f"Training size: {train_df.shape[0]}\nTesting size: {test_df.shape[0]}")

Training size: 387640
Testing size: 96911


In [11]:
from torch.utils.data import DataLoader

batch_size=32
epochs=10
train_dataset,test_dataset=DelayDataset(train_df),DelayDataset(test_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True,drop_last=True)

In [12]:
import torch.optim as optim

lr=.005
optimizer=optim.Adam(model.parameters(),lr=lr)

  _torch_pytree._register_pytree_node(


In [13]:
for epoch in range(epochs):
    print("==========================================")
    train_loop(model=model,dataloader=train_dataloader,optimizer=optimizer,epoch=epoch+1)
    test_loop(model=model,dataloader=test_dataloader)

Epoch 1: 100%|██████████| 12113/12113 [02:54<00:00, 69.27it/s]
Testing: 100%|██████████| 3028/3028 [00:33<00:00, 89.41it/s]


Test loss: 34.301689318023236


Epoch 2: 100%|██████████| 12113/12113 [02:54<00:00, 69.38it/s]
Testing: 100%|██████████| 3028/3028 [00:33<00:00, 90.29it/s]


Test loss: 34.19482698339767


Epoch 3: 100%|██████████| 12113/12113 [03:04<00:00, 65.57it/s]
Testing: 100%|██████████| 3028/3028 [00:34<00:00, 87.45it/s]


Test loss: 34.26018377465333


Epoch 4: 100%|██████████| 12113/12113 [03:04<00:00, 65.81it/s]
Testing: 100%|██████████| 3028/3028 [00:34<00:00, 87.31it/s] 


Test loss: 34.08999029270409


Epoch 5: 100%|██████████| 12113/12113 [02:59<00:00, 67.55it/s]
Testing: 100%|██████████| 3028/3028 [00:31<00:00, 95.02it/s] 


Test loss: 34.16665442018245


Epoch 6: 100%|██████████| 12113/12113 [02:48<00:00, 72.10it/s]
Testing: 100%|██████████| 3028/3028 [00:33<00:00, 89.93it/s]


Test loss: 34.05214285189126


Epoch 7: 100%|██████████| 12113/12113 [02:49<00:00, 71.35it/s]
Testing: 100%|██████████| 3028/3028 [00:33<00:00, 89.13it/s]


Test loss: 34.0711510175128


Epoch 8: 100%|██████████| 12113/12113 [02:49<00:00, 71.40it/s]
Testing: 100%|██████████| 3028/3028 [00:34<00:00, 88.90it/s]


Test loss: 34.011102935282054


Epoch 9: 100%|██████████| 12113/12113 [02:52<00:00, 70.24it/s]
Testing: 100%|██████████| 3028/3028 [00:32<00:00, 92.09it/s]


Test loss: 33.98875518960084


Epoch 10: 100%|██████████| 12113/12113 [02:48<00:00, 71.80it/s]
Testing: 100%|██████████| 3028/3028 [00:32<00:00, 92.63it/s]

Test loss: 34.00089927110206





The model can predict the delay time to within about 30 minutes