In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
np.random.seed(42)

In [2]:
#Read in complete csv
data = pd.read_csv('Air2009-2019_Complete.csv')

In [3]:
#Data normalization, normalizes each features in the range of 0-1
scaler = MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(data)

In [4]:
#Data split in a 60/20/20 ratio
total_size = len(data)
train_size = int(total_size * 0.6)
val_size = int(total_size * 0.2)
test_size = total_size - train_size - val_size

train_df = data[:train_size]
val_df = data[train_size:train_size+val_size]
test_df = data[train_size+val_size:]
print(len(train_df))
print(len(val_df))
print(len(test_df))

57711
19237
19237


In [5]:
#Model Hyperpamater definition and assignment
n_steps = 24
n_features = 7
C = 1
kernel = 'poly'
degree = 2 
gamma = 'scale'
coef0 = 1
epsilon = 0.01

In [6]:
#Function creates sequences from the data based on the amount of n_steps
def sequence_data(data, n_steps):
    x, y = [], []
    for i in range(n_steps, len(data)):
        x.append(data[i-n_steps:i, :])
        y.append(data[i, 4]) #i,passenger column
    x, y = np.array(x), np.array(y)
    return x, y

In [7]:
#Create sequences
x_train, y_train = sequence_data(train_df, n_steps)
x_val, y_val = sequence_data(val_df, n_steps)
x_test, y_test = sequence_data(test_df, n_steps)

In [8]:
#Make input data 2D
x_train = x_train.reshape(-1, n_steps * n_features)
x_val = x_val.reshape(-1, n_steps * n_features)
x_test = x_test.reshape(-1, n_steps * n_features)

In [9]:
#Model Architecture
svr = SVR(C=C, 
          kernel=kernel,
          degree=degree,
          gamma=gamma,
          coef0=coef0,
          epsilon=epsilon
         )

In [10]:
#Fit training data
svr.fit(x_train, y_train)

In [11]:
#Predictions for the three datasets
y_train_pred = svr.predict(x_train)
y_val_pred = svr.predict(x_val)
y_test_pred = svr.predict(x_test)

In [12]:
#Calc MSE and RMSE
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
print('Train Score: {:.5f} MSE ({:.5f} RMSE)'.format(train_mse, np.sqrt(train_mse)))
print('Validation Score: {:.5f} MSE ({:.5f} RMSE)'.format(val_mse, np.sqrt(val_mse)))
print('Test Score: {:.5f} MSE ({:.5f} RMSE)'.format(test_mse, np.sqrt(test_mse)))

Train Score: 0.00264 MSE (0.05135 RMSE)
Validation Score: 0.00395 MSE (0.06283 RMSE)
Test Score: 0.00511 MSE (0.07147 RMSE)


In [13]:
#Dummy arrays
y_test_dummy = np.zeros((y_test.shape[0], 7))
y_pred_dummy = np.zeros((y_test_pred.shape[0], 7))

In [14]:
#Insert passenger data to the dummy arrays
y_test_dummy[:, 4] = y_test.ravel()
y_pred_dummy[:, 4] = y_test_pred.ravel()

In [15]:
#Inverse the normalization
y_test_inv = np.round(scaler.inverse_transform(y_test_dummy)[:, 4])
y_pred_inv = np.round(scaler.inverse_transform(y_pred_dummy)[:, 4])

In [16]:
index = 0
for i in range(48):
    print(f"Index: {index}, Actual: {y_test_inv[i]}, Predicted: {y_pred_inv[i]}")
    index += 1
index = 0

Index: 0, Actual: 1541.0, Predicted: 1701.0
Index: 1, Actual: 2097.0, Predicted: 1859.0
Index: 2, Actual: 2398.0, Predicted: 1801.0
Index: 3, Actual: 2540.0, Predicted: 2749.0
Index: 4, Actual: 4294.0, Predicted: 3357.0
Index: 5, Actual: 5761.0, Predicted: 4147.0
Index: 6, Actual: 5273.0, Predicted: 6533.0
Index: 7, Actual: 6506.0, Predicted: 5609.0
Index: 8, Actual: 6654.0, Predicted: 6395.0
Index: 9, Actual: 6876.0, Predicted: 7080.0
Index: 10, Actual: 7036.0, Predicted: 7590.0
Index: 11, Actual: 7919.0, Predicted: 6912.0
Index: 12, Actual: 7723.0, Predicted: 6582.0
Index: 13, Actual: 6387.0, Predicted: 5708.0
Index: 14, Actual: 5894.0, Predicted: 5746.0
Index: 15, Actual: 4796.0, Predicted: 4433.0
Index: 16, Actual: 2184.0, Predicted: 2071.0
Index: 17, Actual: 499.0, Predicted: 297.0
Index: 18, Actual: 181.0, Predicted: -147.0
Index: 19, Actual: 0.0, Predicted: -63.0
Index: 20, Actual: 0.0, Predicted: -110.0
Index: 21, Actual: 0.0, Predicted: 8.0
Index: 22, Actual: 99.0, Predicted: 