In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
#Seed NumPy's global random generator; the forest below is additionally pinned via random_state=42
np.random.seed(42)

In [2]:
#Load the complete CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='Air2009-2019_Complete.csv')

In [3]:
#Data normalization, scales each feature so the training rows span the range 0-1
#The scaler is fitted on the training portion only (the first 60% of rows, the same
#boundary the 60/20/20 split cell uses) so that the min/max of the validation and
#test rows do not leak into the features the model is trained on.
#Validation/test values can therefore fall slightly outside 0-1.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data[:int(len(data) * 0.6)])
data = scaler.transform(data)

In [4]:
#Data split in a 60/20/20 ratio, taken in row order (no shuffling)
total_size = len(data)
train_size = int(total_size * 0.6)
val_size = int(total_size * 0.2)
#Whatever the two int() truncations leave over goes to the test split
test_size = total_size - (train_size + val_size)

train_df, val_df, test_df = (
    data[:train_size],
    data[train_size:train_size + val_size],
    data[train_size + val_size:],
)
print(len(train_df), len(val_df), len(test_df), sep='\n')

57711
19237
19237


In [5]:
#Model hyperparameter definition and assignment

#Sliding-window layout: each sample is n_steps consecutive rows of n_features columns
n_features = 7
n_steps = 24

#Random forest settings, forwarded to RandomForestRegressor when the model is built
max_features = 0.75
min_samples_leaf = 2
min_samples_split = 2
max_depth = 10
n_estimators = 100

In [6]:
#Function creates sequences from the data based on the amount of n_steps
def sequence_data(data, n_steps, target_col=4):
    """Turn a 2D array of rows into sliding windows and next-row targets.

    For every row index ``i >= n_steps`` the window is the ``n_steps`` rows
    immediately before ``i`` (all columns) and the target is the value of
    column ``target_col`` in row ``i``.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_columns)
        Rows to window; converted with ``np.asarray``.
    n_steps : int
        Number of preceding rows in each window. Must be >= 0.
    target_col : int, default 4
        Column index used as the target. The default keeps the original
        behaviour (the passenger column).

    Returns
    -------
    x : ndarray of shape (n_rows - n_steps, n_steps, n_columns)
    y : ndarray of shape (n_rows - n_steps,)
        Both are empty but correctly shaped when ``data`` has ``n_steps``
        rows or fewer.

    Raises
    ------
    ValueError
        If ``n_steps`` is negative.
    """
    if n_steps < 0:
        raise ValueError("n_steps must be non-negative")
    data = np.asarray(data)
    n_windows = len(data) - n_steps
    if n_windows <= 0:
        # Not enough rows for a single window: return empty arrays that still
        # carry the window dimensions so downstream reshapes behave.
        return (np.empty((0, n_steps, data.shape[1]), dtype=data.dtype),
                np.empty((0,), dtype=data.dtype))
    x = np.stack([data[i - n_steps:i, :] for i in range(n_steps, len(data))])
    # Targets are simply the target column from row n_steps onwards;
    # copy so the result does not alias the input array.
    y = data[n_steps:, target_col].copy()
    return x, y

In [7]:
#Window each split on its own, so no sequence straddles a split boundary
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    sequence_data(split, n_steps) for split in (train_df, val_df, test_df)
)

In [8]:
#Flatten every (n_steps, n_features) window into one row so the inputs are 2D
x_train, x_val, x_test = (
    windows.reshape(-1, n_steps * n_features) for windows in (x_train, x_val, x_test)
)

In [9]:
#Build the random forest from the hyperparameters defined earlier, then train it
rf_model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
    random_state=42,
)
rf_model.fit(x_train, y_train)

In [10]:
#Run the fitted model over the train, validation and test inputs
y_train_pred_rf, y_val_pred_rf, y_test_pred_rf = map(
    rf_model.predict, (x_train, x_val, x_test)
)


In [11]:
#MSE per split (in scaled units); RMSE is its square root
train_mse_rf, val_mse_rf, test_mse_rf = (
    mean_squared_error(actual, predicted)
    for actual, predicted in (
        (y_train, y_train_pred_rf),
        (y_val, y_val_pred_rf),
        (y_test, y_test_pred_rf),
    )
)
print('Train Score: {:.5f} MSE ({:.5f} RMSE)'.format(train_mse_rf, np.sqrt(train_mse_rf)))
print('Validation Score: {:.5f} MSE ({:.5f} RMSE)'.format(val_mse_rf, np.sqrt(val_mse_rf)))
print('Test Score: {:.5f} MSE ({:.5f} RMSE)'.format(test_mse_rf, np.sqrt(test_mse_rf)))

Train Score: 0.00209 MSE (0.04569 RMSE)
Validation Score: 0.00409 MSE (0.06396 RMSE)
Test Score: 0.00543 MSE (0.07368 RMSE)


In [12]:
#Dummy arrays: zero-filled, one row per test sample and n_features columns wide.
#The width comes from n_features (instead of a repeated literal 7) so it stays in
#step with the window layout used earlier; the scaler's inverse_transform is later
#applied to these arrays, so they need the same column count it was fitted on.
y_test_dummy_rf = np.zeros((y_test.shape[0], n_features))
y_pred_dummy_rf = np.zeros((y_test_pred_rf.shape[0], n_features))


In [13]:
#Write the actual and predicted passenger values into column 4 of the dummy arrays
y_test_dummy_rf[:, 4] = np.ravel(y_test)
y_pred_dummy_rf[:, 4] = np.ravel(y_test_pred_rf)

In [14]:
#Undo the scaling, keep only column 4 and round to whole numbers
y_test_inv_rf = scaler.inverse_transform(y_test_dummy_rf)[:, 4].round()
y_pred_inv_rf = scaler.inverse_transform(y_pred_dummy_rf)[:, 4].round()


In [15]:
#Preview the first 48 test samples (or fewer, if the test set is shorter): actual vs predicted
for i in range(min(48, len(y_test_inv_rf))):
    print(f"Index: {i}, Actual: {y_test_inv_rf[i]}, Predicted: {y_pred_inv_rf[i]}")
#Kept from the original cell: leaves a zeroed `index` behind in case a later cell reads it
index = 0

Index: 0, Actual: 1541.0, Predicted: 1152.0
Index: 1, Actual: 2097.0, Predicted: 1928.0
Index: 2, Actual: 2398.0, Predicted: 2324.0
Index: 3, Actual: 2540.0, Predicted: 2580.0
Index: 4, Actual: 4294.0, Predicted: 2973.0
Index: 5, Actual: 5761.0, Predicted: 3738.0
Index: 6, Actual: 5273.0, Predicted: 4776.0
Index: 7, Actual: 6506.0, Predicted: 4822.0
Index: 8, Actual: 6654.0, Predicted: 5697.0
Index: 9, Actual: 6876.0, Predicted: 6510.0
Index: 10, Actual: 7036.0, Predicted: 6690.0
Index: 11, Actual: 7919.0, Predicted: 6337.0
Index: 12, Actual: 7723.0, Predicted: 6432.0
Index: 13, Actual: 6387.0, Predicted: 5592.0
Index: 14, Actual: 5894.0, Predicted: 5069.0
Index: 15, Actual: 4796.0, Predicted: 4248.0
Index: 16, Actual: 2184.0, Predicted: 2144.0
Index: 17, Actual: 499.0, Predicted: 639.0
Index: 18, Actual: 181.0, Predicted: 58.0
Index: 19, Actual: 0.0, Predicted: 58.0
Index: 20, Actual: 0.0, Predicted: 17.0
Index: 21, Actual: 0.0, Predicted: 18.0
Index: 22, Actual: 99.0, Predicted: 71.0