# Two Stage Predictive Engine
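A two-stage predictive machine learning engine that forecasts the on-time
performance of flights from the given dataset: a classifier first predicts
whether a flight arrives delayed (`ArrDel15`), and a regressor then estimates
the arrival delay in minutes (`ArrDelayMinutes`) for the flights predicted as
delayed.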

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
from sklearn.metrics import r2_score as R2, mean_absolute_error as MAE, mean_squared_error as  MSE

## Utilities

In [2]:
# Column configuration for the two modelling stages.
# Both stages share the same flight and weather columns; the regression stage
# additionally uses the 'DepDel15' column.

# Flight-level columns used by both stages.
_flight_columns = [
    'OriginAirportID',
    'DestAirportID',
    'DepTime',
    'DepDelayMinutes',
]

# Weather columns carrying the `_Origin` suffix.
_origin_weather_columns = [
    'WindSpeedKmph_Origin',
    'WindDirDegree_Origin',
    'WeatherCode_Origin',
    'precipMM_Origin',
    'Visibility_Origin',
    'Pressure_Origin',
    'Cloudcover_Origin',
    'DewPointF_Origin',
    'WindGustKmph_Origin',
    'tempF_Origin',
    'WindChillF_Origin',
    'Humidity_Origin',
]

# Weather columns carrying the `_Dest` suffix (same order as the origin ones).
_dest_weather_columns = [
    'WindSpeedKmph_Dest',
    'WindDirDegree_Dest',
    'WeatherCode_Dest',
    'precipMM_Dest',
    'Visibility_Dest',
    'Pressure_Dest',
    'Cloudcover_Dest',
    'DewPointF_Dest',
    'WindGustKmph_Dest',
    'tempF_Dest',
    'WindChillF_Dest',
    'Humidity_Dest',
]

# Stage 1 (classification): inputs and label column.
features_classification = (
    _flight_columns + _origin_weather_columns + _dest_weather_columns
)
target_classification = 'ArrDel15'

# Stage 2 (regression): same inputs plus 'DepDel15', inserted right after the
# flight-level columns to keep the original column order.
features_regression = (
    _flight_columns
    + ['DepDel15']
    + _origin_weather_columns
    + _dest_weather_columns
)
target_regression = 'ArrDelayMinutes'

## Dataset

In [3]:
# Read the dataset and cast the 'ArrDel15' label column to int64 in a single
# chained expression. The cast raises if the column holds values that cannot
# be converted to an integer (e.g. missing values).
originalDF = (
    pd.read_csv("drive/MyDrive/Data/dataset.csv")
    .astype({"ArrDel15": np.int64})
)

## Pipeline

### Classification

In [4]:
# Stage 1 inputs: feature matrix and the 'ArrDel15' label.
X = originalDF[features_classification]
y = originalDF[target_classification]

# Hold out 20% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same split, and therefore the same metrics
# and the same downstream `delayedDF`.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

In [5]:
# Fit the stage-1 classifier and report hold-out metrics.
# random_state makes the forest (bootstrap samples / feature sub-sampling)
# reproducible across runs; n_jobs=-1 trains the trees on all available cores
# without changing the fitted model.
CLASSIFIER_SEED = 42
model = RandomForestClassifier(random_state=CLASSIFIER_SEED, n_jobs=-1)
model.fit(X_train, y_train)

# Evaluate on the 20% hold-out split only.
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
print("Accuracy Score: ", accuracy_score(y_test, predictions))
print("Balanced Accuracy Score: ", balanced_accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95    292831
           1       0.88      0.70      0.78     77457

    accuracy                           0.92    370288
   macro avg       0.90      0.84      0.87    370288
weighted avg       0.92      0.92      0.91    370288

Accuracy Score:  0.9177829149202783
Balanced Accuracy Score:  0.8378800741363226


In [6]:
# Label every row with the classifier's prediction.
# NOTE(review): this predicts on the full frame, which includes the rows the
# classifier was trained on, so 'ArrDel15_Pred' is optimistic for those rows.
originalDF["ArrDel15_Pred"] = model.predict(originalDF[features_classification])

# Keep only the flights predicted as delayed. `.copy()` makes `delayedDF` an
# independent frame so that columns can be added to it later without a
# SettingWithCopyWarning or any ambiguity about writing into `originalDF`.
delayedDF = originalDF[originalDF["ArrDel15_Pred"] == 1].copy()

### Regression

In [7]:
# Rows that were actually delayed are used to train the stage-2 regressor.
# A new name is used instead of overwriting `originalDF`, so the full frame
# stays available and earlier cells give the same result when re-run.
actualDelayedDF = originalDF[originalDF["ArrDel15"] == 1]

# Train on the actually delayed flights; evaluate on the flights the
# classifier predicted as delayed.
# NOTE(review): every row with ArrDel15 == 1 and ArrDel15_Pred == 1 appears in
# both the training and the evaluation set, so the regression metrics computed
# from these splits are inflated by train/test overlap. A leak-free evaluation
# would need the regressor trained only on rows outside the evaluation set.
X_train = actualDelayedDF[features_regression]
X_test = delayedDF[features_regression]
y_train = actualDelayedDF[target_regression]
y_test = delayedDF[target_regression]

In [8]:
# Fit the stage-2 regressor and report its error on the predicted-delayed
# flights. random_state makes the forest reproducible across runs; n_jobs=-1
# trains the trees on all available cores without changing the fitted model.
# Note: `model` and `predictions` now refer to the regressor and its output,
# replacing the stage-1 classifier objects of the same names.
REGRESSOR_SEED = 42
model = RandomForestRegressor(random_state=REGRESSOR_SEED, n_jobs=-1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print("R2:", R2(y_test, predictions))
print("MAE:", MAE(y_test, predictions))
# RMSE is the square root of the mean squared error.
print("RMSE:", np.sqrt(MSE(y_test, predictions)))

R2: 0.990657134088464
MAE: 4.743965092474668
RMSE: 7.074138064438183


In [9]:
# Attach the regressor's output as a new column. `assign` returns a new frame
# rather than writing into `delayedDF` in place, which avoids pandas'
# SettingWithCopyWarning when `delayedDF` was produced by filtering another
# frame, and keeps this cell safe to re-run.
# `predictions` must have one value per row of `delayedDF`.
delayedDF = delayedDF.assign(ArrDelMinutes_Pred=predictions)
delayedDF.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,FlightDate,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,DepDelayMinutes,DepDel15,CRSArrTime,ArrTime,ArrDelayMinutes,ArrDel15,DepHour,ArrHour,WindSpeedKmph_Origin,WindDirDegree_Origin,WeatherCode_Origin,precipMM_Origin,Visibility_Origin,Pressure_Origin,Cloudcover_Origin,DewPointF_Origin,WindGustKmph_Origin,tempF_Origin,WindChillF_Origin,Humidity_Origin,WindSpeedKmph_Dest,WindDirDegree_Dest,WeatherCode_Dest,precipMM_Dest,Visibility_Dest,Pressure_Dest,Cloudcover_Dest,DewPointF_Dest,WindGustKmph_Dest,tempF_Dest,WindChillF_Dest,Humidity_Dest,ArrDel15_Pred,ArrDelMinutes_Pred
1,2016,1,1,1,2016-01-01,13204,MCO,12478,JFK,1330,1408,38.0,1.0,1603,1628,25.0,1,1400,1600,8,227,122,0.1,8,1019,79,72,10,79,78,83,21,254,113,0.0,10,1016,0,33,24,42,33,74,1,29.02
11,2016,1,1,1,2016-01-01,14747,SEA,14771,SFO,724,759,35.0,1.0,946,1010,24.0,1,700,1000,7,106,113,0.0,10,1030,0,24,8,34,29,67,19,72,113,0.0,10,1023,0,25,30,35,25,70,1,26.99
13,2016,1,1,1,2016-01-01,13303,MIA,14771,SFO,700,737,37.0,1.0,1021,1047,26.0,1,700,1000,9,109,266,0.2,2,1019,100,70,17,72,72,96,19,72,113,0.0,10,1023,0,25,30,35,25,70,1,29.92
17,2016,1,1,1,2016-01-01,14107,PHX,14771,SFO,839,911,32.0,1.0,950,1018,28.0,1,900,1000,7,48,113,0.0,10,1021,0,31,14,42,38,69,19,72,113,0.0,10,1023,0,25,30,35,25,70,1,28.87
19,2016,1,1,1,2016-01-01,13930,ORD,14771,SFO,653,709,16.0,1.0,941,1015,34.0,1,700,1000,21,256,122,0.0,10,1024,66,15,32,20,6,86,19,72,113,0.0,10,1023,0,25,30,35,25,70,1,31.21


In [10]:
# Persist the predicted-delayed flights together with both prediction columns.
# The frame's index is written as the first (unnamed) CSV column, since
# `to_csv` is called with its default index setting.
predictions_csv_path = "drive/MyDrive/Data/temp.csv"
delayedDF.to_csv(predictions_csv_path)