# Regression Models
Comparing the goodness of fit (R², MAE and RMSE on a held-out test set) of the following regression models for predicting `ArrDelayMinutes`:
* Linear Regression
* Extra Trees
* XGBoost
* Random Forest
* SGDRegressor (linear model trained with stochastic gradient descent, default loss)

## Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor

from sklearn.metrics import r2_score as R2, mean_absolute_error as MAE, mean_squared_error as  MSE

## Utilities

In [None]:
# Input columns fed to every model, grouped by what their names suggest:
# flight/departure attributes, then the same twelve weather readings
# suffixed `_Origin` and `_Dest`.
features = [
  # flight / departure
  'OriginAirportID', 'DestAirportID', 'DepTime', 'DepDelayMinutes', 'DepDel15',
  # weather readings, `_Origin` suffix
  'WindSpeedKmph_Origin', 'WindDirDegree_Origin', 'WeatherCode_Origin',
  'precipMM_Origin', 'Visibility_Origin', 'Pressure_Origin',
  'Cloudcover_Origin', 'DewPointF_Origin', 'WindGustKmph_Origin',
  'tempF_Origin', 'WindChillF_Origin', 'Humidity_Origin',
  # weather readings, `_Dest` suffix
  'WindSpeedKmph_Dest', 'WindDirDegree_Dest', 'WeatherCode_Dest',
  'precipMM_Dest', 'Visibility_Dest', 'Pressure_Dest',
  'Cloudcover_Dest', 'DewPointF_Dest', 'WindGustKmph_Dest',
  'tempF_Dest', 'WindChillF_Dest', 'Humidity_Dest',
]

# Column the regressors are trained to predict.
target = 'ArrDelayMinutes'

# Echo the feature names on a single line, each followed by ", ".
print(*features, sep=", ", end=", ")

OriginAirportID, DestAirportID, DepTime, DepDelayMinutes, DepDel15, WindSpeedKmph_Origin, WindDirDegree_Origin, WeatherCode_Origin, precipMM_Origin, Visibility_Origin, Pressure_Origin, Cloudcover_Origin, DewPointF_Origin, WindGustKmph_Origin, tempF_Origin, WindChillF_Origin, Humidity_Origin, WindSpeedKmph_Dest, WindDirDegree_Dest, WeatherCode_Dest, precipMM_Dest, Visibility_Dest, Pressure_Dest, Cloudcover_Dest, DewPointF_Dest, WindGustKmph_Dest, tempF_Dest, WindChillF_Dest, Humidity_Dest, 

In [None]:
def predict(model, X_train, X_test, y_train, y_test, string):
  """Fit `model`, score it on the test split and print the results.

  Prints `string` as a heading, fits `model` on (`X_train`, `y_train`),
  predicts on `X_test`, then prints R2, MAE and RMSE of those predictions
  against `y_test`, followed by blank lines as a visual separator.

  Parameters:
    model: estimator exposing `fit(X, y)` and `predict(X)`.
    X_train, y_train: data the model is fitted on.
    X_test, y_test: data the model is evaluated on.
    string: label printed above the metrics.

  Returns:
    None; the metrics are only printed.
  """
  print(string, "")

  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  # (label, scorer) pairs in print order; RMSE is the square root of MSE.
  scorers = (
    ("R2:", R2),
    ("MAE:", MAE),
    ("RMSE:", lambda actual, predicted: np.sqrt(MSE(actual, predicted))),
  )
  for label, scorer in scorers:
    print(label, scorer(y_test, y_pred))

  print("\n\n\n")

## Dataset

In [None]:
# Read the full dataset, then keep only the rows flagged ArrDel15 == 1
# (presumably flights whose arrival was delayed by 15+ minutes; confirm
# against the dataset's data dictionary).
all_rows = pd.read_csv("drive/MyDrive/Data/dataset.csv")
is_flagged = all_rows["ArrDel15"] == 1
data = all_rows.loc[is_flagged]
print(data.shape)

(388058, 43)


## Separating X and y

In [None]:
# Feature matrix and regression target.
X = data[features]
y = data[target]

# Hold out 20% of the rows for evaluation. The seed is fixed so the split,
# and therefore every metric reported for the models, is reproducible on
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42
)

# Standardized copies of the features for scale-sensitive models.
# The scaler is fitted on the training rows ONLY: fitting it on all of X
# would leak the test set's mean/variance into preprocessing and make the
# test metrics optimistic.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Models

In [None]:
# Seed shared by every stochastic estimator so the comparison is reproducible
# across kernel restarts. LinearRegression is deterministic and takes no seed.
MODEL_SEED = 42

# (label, estimator, train features, test features), evaluated in this order.
# Only SGDRegressor receives the standardized features, since gradient descent
# is sensitive to feature scale; the other models use the raw features.
experiments = [
  ("LinearRegression", LinearRegression(), X_train, X_test),
  ("ExtraTreesRegressor", ExtraTreesRegressor(random_state=MODEL_SEED), X_train, X_test),
  ("XGBRegressor", XGBRegressor(random_state=MODEL_SEED), X_train, X_test),
  ("RandomForestRegressor", RandomForestRegressor(random_state=MODEL_SEED), X_train, X_test),
  ("SGDRegressor", SGDRegressor(random_state=MODEL_SEED), X_train_scaled, X_test_scaled),
]

for label, estimator, train_features, test_features in experiments:
  predict(estimator, train_features, test_features, y_train, y_test, label)

LinearRegression 
R2: 0.9401241100353579
MAE: 12.178193610196125
RMSE: 17.588095946430137




ExtraTreesRegressor 
R2: 0.9461500690605955
MAE: 11.743619929907744
RMSE: 16.679591520117405




XGBRegressor 
R2: 0.9449727831848853
MAE: 11.614156521995406
RMSE: 16.86093324865755




RandomForestRegressor 
R2: 0.9468064349795956
MAE: 11.675270654636128
RMSE: 16.5776277795784




SGDRegressor 
R2: 0.9399593328134003
MAE: 12.157829456362915
RMSE: 17.612280358888132




