In [None]:
# Import necessary modules
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV

In [None]:
# Reading in training, validation and test data


def _read_rows(path, strip_chars="\n"):
    """Read a comma-separated text file into a list of rows.

    Each line has ``strip_chars`` removed from both ends and is then split
    on commas, so every row is a list of raw string fields. A blank line
    yields the row ``['']``.
    """
    with open(path) as f:
        return [line.strip(strip_chars).split(",") for line in f]


def _split_features_target(rows, target_column=5):
    """Split parsed rows into ``(features, targets)`` with numeric values.

    The field at ``target_column`` becomes the target; all remaining fields,
    in their original order, become the feature vector. Rows equal to
    ``['']`` (blank lines) are skipped. Every value is converted to ``float``
    so that scikit-learn estimators and metrics receive numeric data rather
    than strings; a non-numeric field raises ``ValueError`` here instead of
    later inside scikit-learn.
    """
    features = []
    targets = []
    for record in rows:
        if record == ['']:
            continue
        kept = record[:target_column] + record[target_column + 1:]
        features.append([float(value) for value in kept])
        targets.append(float(record[target_column]))
    return features, targets


trainset = _read_rows("Datasets/19902009processedtraining.csv")
# NOTE(review): only the validation file also strips commas from the line ends;
# presumably its rows carry trailing commas — confirm against the file.
valset = _read_rows("Datasets/20102014processedvalidation.csv", strip_chars=",\n")
testset = _read_rows("Datasets/20152019processedtesting.csv")

x_train, y_train = _split_features_target(trainset)
x_val, y_val = _split_features_target(valset)
x_test, y_test = _split_features_target(testset)

In [None]:
# Grid-search hyperparameter tuning (takes 40-50 min to run)

model = RandomForestRegressor(random_state=42)  # base estimator with a fixed seed

# Candidate hyperparameters: 3 x 2 = 6 combinations, each scored with 3-fold CV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10],
}

grid = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_absolute_percentage_error', n_jobs=-1, verbose=1)
grid.fit(x_train, y_train)  # cross-validated search on the training split only

print("Best parameters:", grid.best_params_)
best_model = grid.best_estimator_  # estimator refitted with the best parameters

# Score the tuned model on the held-out validation split. The test split is
# deliberately not touched here so that it stays unseen until the final
# evaluation after retraining.
predictions = best_model.predict(x_val)
mape = mean_absolute_percentage_error(y_val, predictions)
print("Validated model MAPE:", mape)

In [None]:
# Fold the validation rows into the training lists so the final model is
# retrained on train + validation data.
# Note: this grows x_train / y_train in place, so running the cell a second
# time appends the validation rows again.
x_train.extend(x_val)
y_train.extend(y_val)

In [None]:
# Retrain with the best grid-search parameters on the current x_train / y_train,
# then report the error on the test set.
final_RFmodel = RandomForestRegressor(random_state=42, **grid.best_params_)
final_RFmodel.fit(x_train, y_train)

final_predictions = final_RFmodel.predict(x_test)
final_mape = mean_absolute_percentage_error(y_test, final_predictions)
print("Final MAPE:", final_mape)