In [1]:
# We will use the Random Forest Regression model based on our findings in the Model Selection file

In [2]:
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
    
from utils import split_data, normalize, dimensionality_reduction, fit_model, evaluate_model

In [3]:
# Load the training set from 'training.csv' (path is relative to the working directory).
training_df = pd.read_csv(filepath_or_buffer='training.csv')

In [4]:
def model_tuning(training_df, n_estimators_range=range(1, 101), max_depth_range=range(1, 51),
                 cv=10, scoring='r2', n_jobs=None):
    '''
    Determine model performance for different hyperparameters - n_estimators, max_depth.
    This portion of the code was executed on Google Colab to speed up the process.

    Every (n_estimators, max_depth) pair from the two grids is scored with
    k-fold cross-validation on the training split only.

    Parameters
    ----------
    training_df :
        Training data, forwarded unchanged to split_data (a pandas DataFrame
        at the call site in this notebook).
    n_estimators_range : iterable of int, optional
        Values of n_estimators to try. Defaults to 1..100 inclusive.
    max_depth_range : iterable of int, optional
        Values of max_depth to try. Defaults to 1..50 inclusive.
    cv : optional
        Forwarded to cross_val_score as its cv argument. Defaults to 10 folds.
    scoring : str, optional
        Forwarded to cross_val_score as its scoring argument. Defaults to 'r2'.
    n_jobs : int or None, optional
        Forwarded to cross_val_score. The default None leaves cross_val_score's
        own default in place.

    Returns
    -------
    dict
        Maps each (n_estimators, max_depth) tuple to the mean cross-validation
        score obtained for that pair.
    '''
    # y_validation is not used below; the validation split is only carried
    # through because the preprocessing helpers take both splits.
    X_train, X_validation, y_train, y_validation = split_data(training_df) # split data into training and validation
    X_train, X_validation = normalize(X_train, X_validation) # normalize data
    X_train, X_validation = dimensionality_reduction(X_train, X_validation) # perform dimensionality reduction on data

    # Materialise both grids so that a one-shot iterator (e.g. a generator)
    # passed for max_depth_range is not exhausted after the first outer pass.
    estimator_counts = list(n_estimators_range)
    depths = list(max_depth_range)

    performance = {}
    for n in estimator_counts:
        for d in depths:
            # random_state is fixed so repeated runs score the same forests.
            model = RandomForestRegressor(n_estimators = n, max_depth = d, random_state = 0)
            # mean score across the cross-validation folds
            k_cv_score = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=n_jobs).mean()
            performance[(n,d)] = k_cv_score # recording performance for every value of n_estimators and max_depth hyperparameters in specified ranges

    return performance
    


In [5]:
# Score every (n_estimators, max_depth) pair in the ranges [1,100] and [1,50] respectively.
performance = model_tuning(training_df)

# The dictionary key with the highest mean cross-validation score is the best pair.
best_n, best_d = max(performance.items(), key=lambda entry: entry[1])[0]

print(
    "Best Hyperparameters for Random Forest Regression Model:\n1. No. of estimators = {}\n2. Max depth = {}".format(
        best_n, best_d
    )
)


Best Hyperparameters for Random Forest Regression Model:
1. No. of estimators = 80
2. Max depth = 20
