# Regression Analysis

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as R2, mean_absolute_error as MAE, mean_squared_error as  MSE
import matplotlib.pyplot as plt

## Utilities

In [2]:
features = ['OriginAirportID', 'DestAirportID', 'DepTime',
       'DepDelayMinutes', 'DepDel15', 'WindSpeedKmph_Origin',
       'WindDirDegree_Origin', 'WeatherCode_Origin', 'precipMM_Origin',
       'Visibility_Origin', 'Pressure_Origin', 'Cloudcover_Origin',
       'DewPointF_Origin', 'WindGustKmph_Origin', 'tempF_Origin',
       'WindChillF_Origin', 'Humidity_Origin', 'WindSpeedKmph_Dest',
       'WindDirDegree_Dest', 'WeatherCode_Dest', 'precipMM_Dest',
       'Visibility_Dest', 'Pressure_Dest', 'Cloudcover_Dest', 'DewPointF_Dest',
       'WindGustKmph_Dest', 'tempF_Dest', 'WindChillF_Dest', 'Humidity_Dest']
       
target = 'ArrDelayMinutes'

limits = [[0, 100], [100, 200], [200, 500], [500, 1000], [1000, 2000]]

In [3]:
def filterDF(df, lower, upper):
  print([lower, upper])
  return df[(df[target]>=lower) & (df[target]<upper)]

def test(model, X_test, y_test):
  print(X_test.shape)
  predictions = model.predict(X_test)

  print("MAE:", MAE(y_test, predictions))
  print("RMSE:", np.sqrt(MSE(y_test, predictions)))
  print("\n")

## Dataset

In [4]:
df = pd.read_csv("drive/MyDrive/Data/dataset.csv")
df = df[df["ArrDel15"] == 1]
print(df.shape)
df.head() 

(388058, 43)


Unnamed: 0,Year,Quarter,Month,DayofMonth,FlightDate,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,DepDelayMinutes,DepDel15,CRSArrTime,ArrTime,ArrDelayMinutes,ArrDel15,DepHour,ArrHour,WindSpeedKmph_Origin,WindDirDegree_Origin,WeatherCode_Origin,precipMM_Origin,Visibility_Origin,Pressure_Origin,Cloudcover_Origin,DewPointF_Origin,WindGustKmph_Origin,tempF_Origin,WindChillF_Origin,Humidity_Origin,WindSpeedKmph_Dest,WindDirDegree_Dest,WeatherCode_Dest,precipMM_Dest,Visibility_Dest,Pressure_Dest,Cloudcover_Dest,DewPointF_Dest,WindGustKmph_Dest,tempF_Dest,WindChillF_Dest,Humidity_Dest
1,2016,1,1,1,2016-01-01,13204,MCO,12478,JFK,1330,1408,38.0,1.0,1603,1628,25.0,1.0,1400,1600,8,227,122,0.1,8,1019,79,72,10,79,78,83,21,254,113,0.0,10,1016,0,33,24,42,33,74
11,2016,1,1,1,2016-01-01,14747,SEA,14771,SFO,724,759,35.0,1.0,946,1010,24.0,1.0,700,1000,7,106,113,0.0,10,1030,0,24,8,34,29,67,19,72,113,0.0,10,1023,0,25,30,35,25,70
13,2016,1,1,1,2016-01-01,13303,MIA,14771,SFO,700,737,37.0,1.0,1021,1047,26.0,1.0,700,1000,9,109,266,0.2,2,1019,100,70,17,72,72,96,19,72,113,0.0,10,1023,0,25,30,35,25,70
17,2016,1,1,1,2016-01-01,14107,PHX,14771,SFO,839,911,32.0,1.0,950,1018,28.0,1.0,900,1000,7,48,113,0.0,10,1021,0,31,14,42,38,69,19,72,113,0.0,10,1023,0,25,30,35,25,70
19,2016,1,1,1,2016-01-01,13930,ORD,14771,SFO,653,709,16.0,1.0,941,1015,34.0,1.0,700,1000,21,256,122,0.0,10,1024,66,15,32,20,6,86,19,72,113,0.0,10,1023,0,25,30,35,25,70


## Splitting X and y 

In [5]:
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [6]:
print(X_test.shape)
X_test.head()

(116418, 29)


Unnamed: 0,OriginAirportID,DestAirportID,DepTime,DepDelayMinutes,DepDel15,WindSpeedKmph_Origin,WindDirDegree_Origin,WeatherCode_Origin,precipMM_Origin,Visibility_Origin,Pressure_Origin,Cloudcover_Origin,DewPointF_Origin,WindGustKmph_Origin,tempF_Origin,WindChillF_Origin,Humidity_Origin,WindSpeedKmph_Dest,WindDirDegree_Dest,WeatherCode_Dest,precipMM_Dest,Visibility_Dest,Pressure_Dest,Cloudcover_Dest,DewPointF_Dest,WindGustKmph_Dest,tempF_Dest,WindChillF_Dest,Humidity_Dest
1616863,10397,11618,1944,114.0,1.0,11,280,176,0.6,10,1017,30,76,14,90,90,64,15,221,176,2.3,8,1012,28,74,27,76,76,93
13641,14107,12889,2209,49.0,1.0,3,281,113,0.0,10,1021,1,39,4,53,53,52,7,319,113,0.0,10,1025,0,33,11,37,37,63
1619835,12892,13930,948,9.0,0.0,7,166,113,0.0,10,1014,0,64,8,72,72,75,22,240,113,0.0,10,1016,0,71,26,84,85,64
1499982,11292,13930,1713,0.0,0.0,24,50,353,4.8,8,1009,64,48,32,62,64,55,11,87,353,0.1,10,1018,63,26,18,46,46,44
667291,12266,12892,2042,77.0,1.0,14,155,113,0.0,10,1013,0,74,18,86,86,58,12,262,119,0.0,10,1016,69,63,16,70,69,81


## Training Regressor

In [7]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

## Scores for each Arrival delay interval

In [8]:
testDF = X_test.merge(y_test, left_index=True, right_index=True)
print(testDF.shape)
for limit in limits:
  temp = filterDF(testDF, limit[0], limit[1])
  test(model, temp[features], temp[target])

(116418, 30)
[0, 100]
(97156, 29)
MAE: 10.484335544063645
RMSE: 13.678914833656444


[100, 200]
(14595, 29)
MAE: 17.37699214342811
RMSE: 25.590999411243907


[200, 500]
(4289, 29)
MAE: 19.161091163441363
RMSE: 30.024664005381403


[500, 1000]
(330, 29)
MAE: 16.432727272727274
RMSE: 23.835391053361526


[1000, 2000]
(47, 29)
MAE: 18.45021276595743
RMSE: 24.281772079466293


