## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
from sklearn.metrics import r2_score as R2, mean_absolute_error as MAE, mean_squared_error as  MSE

## Utilities

In [2]:
features_classification = ['OriginAirportID', 'DestAirportID',
            'DepTime', 'DepDelayMinutes', 'WindSpeedKmph_Origin', 
            'WindDirDegree_Origin', 'WeatherCode_Origin',
            'precipMM_Origin', 'Visibility_Origin', 'Pressure_Origin',
            'Cloudcover_Origin', 'DewPointF_Origin', 'WindGustKmph_Origin',
            'tempF_Origin', 'WindChillF_Origin', 'Humidity_Origin',
            'WindSpeedKmph_Dest', 'WindDirDegree_Dest', 'WeatherCode_Dest',
            'precipMM_Dest', 'Visibility_Dest', 'Pressure_Dest', 'Cloudcover_Dest',
            'DewPointF_Dest', 'WindGustKmph_Dest', 'tempF_Dest', 'WindChillF_Dest',
            'Humidity_Dest']

target_classification = 'ArrDel15'

features_regression = ['OriginAirportID', 'DestAirportID', 'DepTime',
       'DepDelayMinutes', 'DepDel15', 'WindSpeedKmph_Origin',
       'WindDirDegree_Origin', 'WeatherCode_Origin', 'precipMM_Origin',
       'Visibility_Origin', 'Pressure_Origin', 'Cloudcover_Origin',
       'DewPointF_Origin', 'WindGustKmph_Origin', 'tempF_Origin',
       'WindChillF_Origin', 'Humidity_Origin', 'WindSpeedKmph_Dest',
       'WindDirDegree_Dest', 'WeatherCode_Dest', 'precipMM_Dest',
       'Visibility_Dest', 'Pressure_Dest', 'Cloudcover_Dest', 'DewPointF_Dest',
       'WindGustKmph_Dest', 'tempF_Dest', 'WindChillF_Dest', 'Humidity_Dest']
       
target_regression = 'ArrDelayMinutes'

## Dataset

In [3]:
flightDF = pd.read_csv("drive/MyDrive/Data/dataset.csv")
flightDF = flightDF.astype({"ArrDel15": np.int64})
flightDF.shape

(1851436, 43)

In [4]:
trainData, testData = train_test_split(flightDF, test_size=0.2)

## Pipeline

### Classification

In [5]:
model = RandomForestClassifier()
model.fit(trainData[features_classification], trainData[target_classification])
testData["ArrDel15_pred"] = model.predict(testData[features_classification])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
print(classification_report(testData[target_classification], testData["ArrDel15_pred"]))
print("Accuracy Score: ", accuracy_score(testData[target_classification], testData["ArrDel15_pred"]))
print("Balanced Accuracy Score: ", balanced_accuracy_score(testData[target_classification], testData["ArrDel15_pred"]))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95    292671
           1       0.88      0.70      0.78     77617

    accuracy                           0.92    370288
   macro avg       0.90      0.84      0.87    370288
weighted avg       0.92      0.92      0.91    370288

Accuracy Score:  0.9181312923994296
Balanced Accuracy Score:  0.8397703617222656


### Extracting Delayed Flights

In [8]:
trainData = trainData[trainData["ArrDel15"]==1]
testData = testData[testData["ArrDel15_pred"]==1]

## Regression

In [9]:
model = RandomForestRegressor()
model.fit(trainData[features_regression], trainData[target_regression])
testData["ArrDelayMinutes_pred"] = model.predict(testData[features_regression])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
print("R2:", R2(testData[target_regression], testData["ArrDelayMinutes_pred"]))
print("MAE:", MAE(testData[target_regression], testData["ArrDelayMinutes_pred"]))
print("RMSE:", np.sqrt(MSE(testData[target_regression], testData["ArrDelayMinutes_pred"])))

R2: 0.9469473363514695
MAE: 13.738582369667448
RMSE: 18.64457262463949
