# Random Forest Regression

In [2]:
from functions import *

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import datetime

sns.set_theme(style="darkgrid")


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV



In [None]:
def load_flights(csv_path):
    """Read a flights CSV, order it by airline and flight number, and backfill gaps.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``AIRLINE`` then ``FLIGHT_NUMBER``. Each missing
        value is replaced by the next non-missing value further down the same
        column; values missing at the very end of a column stay missing.
    """
    return (
        pd.read_csv(csv_path)
        .sort_values(['AIRLINE', 'FLIGHT_NUMBER'])
        # `.bfill()` is the non-deprecated spelling of `fillna(method='backfill')`.
        .bfill()
    )


# Both files go through exactly the same load -> sort -> backfill steps.
flights_test = load_flights('./flights_test.csv')
flights_train = load_flights('./flights_train.csv')

# Quick sanity check of the row counts (test first, then train).
print(len(flights_test))
print(len(flights_train))

In [None]:
# Preprocessing: run the same `preprocessing` helper on the train and test frames.
# NOTE(review): `preprocessing` is not defined in this notebook; presumably it is
# provided by `from functions import *` — confirm.
flights_train = preprocessing(flights_train)
flights_test = preprocessing(flights_test)

# Plot from `flights_train` directly: `training_set` is only created in a later
# cell, so referencing it here raises NameError on a fresh kernel (Restart & Run All).
# A fixed-seed sample of at most 300,000 rows (the size of the training sample
# drawn later) keeps the scatter plot fast and reproducible.
delay_plot_sample = flights_train.sample(n=min(len(flights_train), 300000), random_state=42)
sns.scatterplot(x='initial_delay', y='ARRIVAL_DELAY', data=delay_plot_sample)

In [None]:
# Columns taken from the preprocessed training frame: the model inputs plus the
# target column, which is split off again below.
origin_features = [
    'initial_delay',
    'initial_delay_mean',
    # 'ORIGIN_AIRPORT',
    # 'DESTINATION_AIRPORT',
    'DISTANCE',
    'DAY',
    'DAY_OF_WEEK',
    'MONTH',
    'ARRIVAL_DELAY'
]

# Name of the column the model has to predict.
arrival_delay = 'ARRIVAL_DELAY'

# Train on a random subset of the rows (NOT the full set). The sample size is
# capped at the number of available rows so `.sample` cannot raise on a smaller
# file, and the seed makes the subset identical on every run.
sample_size = min(300000, len(flights_train))
training_set = flights_train[origin_features].sample(n=sample_size, random_state=42)

# Separate the target from the input features.
X = training_set.drop(arrival_delay, axis=1)
y = training_set[arrival_delay]

# Column names of the model inputs, kept so other cells can select the same columns.
features = X.columns

# Hold out 20% of the sampled rows as a validation set; seeded so the reported
# scores are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Training 

In [None]:

# Forest settings; they match an earlier grid-search result:
# {'max_depth': 8, 'max_features': 'auto', 'n_estimators': 12}
forest_params = {'n_estimators': 12, 'max_depth': 8, 'random_state': 42}
rfr = RandomForestRegressor(**forest_params)

# The pipeline currently holds the regressor only; a ('preprocessor', ...) step
# can be placed in front of it later without changing the cells that use `rf`.
pipeline_steps = [('regressor', rfr)]
rf = Pipeline(steps=pipeline_steps)

# Optional 5-fold cross-validation on the whole sample (left disabled):
#   cross_val_score(rf, X, y, cv=5)
rf.fit(X_train, y_train)

## Evaluation

In [None]:
# Predict on the held-out validation rows and compute both metrics.
y_pred = rf.predict(X_test)
# NOTE(review): `mse` is not defined in this notebook; presumably it is provided
# by `from functions import *` — confirm.
mean_squared_error = mse(y_test, y_pred)
# `score` on the fitted pipeline is reported below as R2.
score = rf.score(X_test, y_test)

# Report each metric rounded to three decimals.
for label, value in (("R2: ", score), ("Mean Squared Error: ", mean_squared_error)):
    print(label, round(value, 3))

## Creating Submission

In [None]:
# Predict on exactly the columns the model was fitted on. Passing the whole
# `flights_test` frame would also hand the model the `id` column (and any other
# column that is not in `features`), which does not match the training input.
predictions = rf.predict(flights_test[features])

# Pair every prediction with the id of the row it was computed from and write
# the result without the DataFrame index.
submission = pd.DataFrame({'id': flights_test['id'], 'ARRIVAL_DELAY': predictions})
submission.to_csv('submission_5.csv', index=False)

# submission must have 514,384 rows
print(len(submission))

## Hyperparameter Tuning

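With the help of `GridSearchCV` we can fine-tune the random forest regressor by searching for the combination of hyperparameters (number of trees, maximum tree depth, and number of features considered per split) that gives the best cross-validated score.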

Depending on the number of hyperparameter values defined in `param_grid`, this can take a while...

In [None]:
# Every combination of these values is evaluated (2 * 3 * 5 = 30 candidates).
param_grid = {
    'n_estimators': [12, 21],
    # 1.0 means "consider all features at each split". That is what 'auto' meant
    # for RandomForestRegressor; 'auto' was deprecated and later removed from
    # scikit-learn, while the float form is accepted by old and new versions.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8]
    # NOTE: 'criterion' could be searched too, but 'gini' / 'entropy' are
    # classification criteria and are not valid for a regressor.
}

# 5-fold cross-validated search on the training split; the seed makes the
# forests, and therefore the selected parameters, reproducible between runs.
CV = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, cv=5)
CV.fit(X_train, y_train)
print(CV.best_params_)
print(CV.best_score_)