# Predict flight cancellation probabilities and potential delay time given flight details

## Data Preprocessing

In [70]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pickle

In [74]:
df = pd.read_csv('data/data-coordinates.csv', dtype={'CANCELLATION_CODE': 'string'})

In [75]:
df = df[['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 
        'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'CANCELLED']]

In [76]:
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED
0,1,17,7,DL,1114,BOS,TPA,1547,1907,1927.0,20.0,0.0
1,1,17,7,DL,1126,SDF,ATL,1541,1705,2132.0,267.0,0.0
2,1,17,7,DL,1173,SRQ,MSP,1410,1649,1735.0,46.0,0.0
3,1,17,7,DL,1205,MSP,BWI,1840,2153,2212.0,19.0,0.0
4,1,17,7,DL,1216,ORD,ATL,805,1104,1136.0,32.0,0.0


In [77]:
df.dtypes

MONTH                  int64
DAY_OF_MONTH           int64
DAY_OF_WEEK            int64
OP_UNIQUE_CARRIER     object
OP_CARRIER_FL_NUM      int64
ORIGIN                object
DEST                  object
CRS_DEP_TIME           int64
CRS_ARR_TIME           int64
ARR_TIME             float64
ARR_DELAY            float64
CANCELLED            float64
dtype: object

In [111]:
df.columns.values

array(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME',
       'CRS_ARR_TIME', 'ARR_DELAY', 'CANCELLED'], dtype=object)

In [79]:
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED
0,1,17,7,DL,1114,BOS,TPA,1547,1907,1927.0,20.0,0.0
1,1,17,7,DL,1126,SDF,ATL,1541,1705,2132.0,267.0,0.0
2,1,17,7,DL,1173,SRQ,MSP,1410,1649,1735.0,46.0,0.0
3,1,17,7,DL,1205,MSP,BWI,1840,2153,2212.0,19.0,0.0
4,1,17,7,DL,1216,ORD,ATL,805,1104,1136.0,32.0,0.0


In [80]:
df['ARR_DELAY'] = df['ARR_DELAY'].fillna(0)
df = df.drop(["ARR_TIME"], axis=1)

In [81]:
delay_X = df.drop(["ARR_DELAY", "CANCELLED"], axis=1)
delay_Y = df["ARR_DELAY"].copy()

cancel_X = delay_X.copy()
cancel_Y = df["CANCELLED"].copy()

In [82]:
num_features = list(delay_X.drop(["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"], axis=1))
cat_features = ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]

full_pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(), cat_features)
])

In [83]:
delay_X = full_pipeline.fit_transform(delay_X)
cancel_X = full_pipeline.fit_transform(cancel_X)

In [84]:
delay_train_X, delay_test_X, delay_train_Y, delay_test_Y = train_test_split(delay_X, delay_Y, test_size=0.1)
cancel_train_X, cancel_test_X, cancel_train_Y, cancel_test_Y = train_test_split(cancel_X, cancel_Y, test_size=0.1)

### Save Transformer

In [85]:
filename = 'saved_models/data_transformer.sav'
pickle.dump(full_pipeline, open(filename, 'wb'))

In [86]:
loaded_model = pickle.load(open(filename, 'rb'))

## Train Model: Delay

### Linear Regression

In [87]:
from sklearn.linear_model import LinearRegression

In [88]:
lin_reg = LinearRegression()
lin_reg.fit(delay_train_X, delay_train_Y)

In [89]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

delay_test_predictions = lin_reg.predict(delay_test_X)
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 90.9866427719302, Mean absolute error: 47.57735608101151


### Retrain on all data

In [90]:
lin_reg = LinearRegression()
lin_reg.fit(delay_X, delay_Y)

filename = 'saved_models/delay_lin_reg.sav'
pickle.dump(lin_reg, open(filename, 'wb'))

In [91]:
loaded_model = pickle.load(open(filename, 'rb'))

delay_test_predictions = loaded_model.predict(delay_test_X)
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 90.9019814081227, Mean absolute error: 47.538059814494936


### Neural Network

In [17]:
import math
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError

In [18]:
neural = Sequential()

neural.add(Dense(128, kernel_initializer='normal',input_dim = delay_train_X.shape[1], activation='relu'))
neural.add(Dense(256, kernel_initializer='normal',activation='relu'))
neural.add(Dropout(0.2))
neural.add(Dense(64, kernel_initializer='normal',activation='relu'))
neural.add(Dense(1, kernel_initializer='normal',activation='linear'))

neural.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
neural.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               84096     
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 133,633
Trainable params: 133,633
Non-trainable params: 0
_________________________________________________________________


In [19]:
neural.fit(delay_train_X.toarray() , np.array(delay_train_Y) , epochs=5, batch_size=32, validation_split = 0.2)

Epoch 1/5
    1/25689 [..............................] - ETA: 1:14:10 - loss: 27026.5723 - mean_squared_error: 27026.5723

2022-11-19 19:02:02.589482: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2b6e6f5b0>

In [20]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

delay_test_predictions = neural.predict(delay_test_X)
neural_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
neural_rmse = np.sqrt(neural_mse)
neural_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {neural_rmse}, Mean absolute error: {neural_mae}')

Root mean squared error: 90.47897165533874, Mean absolute error: 48.10639621789416


## Train Model: Cancel

### Logistic Regression

In [92]:
from sklearn.linear_model import LogisticRegression

In [93]:
clf = LogisticRegression(max_iter=1000)
clf.fit(cancel_train_X, cancel_train_Y)

In [94]:
import numpy as np
from sklearn.metrics import accuracy_score

cancel_test_predictions = clf.predict(cancel_test_X)
acc = accuracy_score(cancel_test_Y, cancel_test_predictions)

print(f'Accuracy: {acc}')

Accuracy: 0.9045633704125426


In [95]:
clf.predict_proba(cancel_test_X)

array([[0.93484713, 0.06515287],
       [0.88863187, 0.11136813],
       [0.90314106, 0.09685894],
       ...,
       [0.97177281, 0.02822719],
       [0.91793267, 0.08206733],
       [0.79885784, 0.20114216]])

### Retrain on all data

In [96]:
clf = LogisticRegression(max_iter=1000)
clf.fit(cancel_X, cancel_Y)

filename = 'saved_models/cancel_log_reg.sav'
pickle.dump(clf, open(filename, 'wb'))

In [97]:
loaded_model = pickle.load(open(filename, 'rb'))

cancel_test_predictions = loaded_model.predict(cancel_test_X)
acc = accuracy_score(cancel_test_Y, cancel_test_predictions)

print(f'Accuracy: {acc}')

Accuracy: 0.9045808881492511


In [98]:
loaded_model.predict_proba(cancel_test_X)[0][1]

0.0648070440712828