In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from scipy.io import arff
from sklearn.model_selection import GridSearchCV
from pprint import pprint

model_data = pd.read_csv('MLRdata_without_clubs.csv', sep=',')
y = model_data['fee_cleaned']
X = model_data.drop(columns =['fee_cleaned'])
y_log = np.log(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 1)

#Standard Scaling only on the train set rather than the whole dataset (https://stats.stackexchange.com/questions/111467/is-it-necessary-to-scale-the-target-value-in-addition-to-scaling-features-for-re)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

#https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 1)
rf.fit(X_train, y_train)
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [9]:
from sklearn.model_selection import cross_validate
base_scores = cross_validate(rf, X_train, y_train, cv=10, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True)
print('RF (Base): Cross Validation')
print('R^2:', base_scores['test_r2'].mean())
print('RMSE:', -1 * base_scores['test_neg_root_mean_squared_error'].mean())

RF (Base): Cross Validation
R^2: 0.6172674143067256
RMSE: 6.887389039614378


In [10]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 50)]
n_estimators.append(100)
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 50)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.linspace(2, 20, num = 18)]
min_samples_split.append(2)
# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(1, 50, num = 50)]
min_samples_leaf.append(1)
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state = 1)
# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 5, verbose=2, random_state=1, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
pprint(rf_random.best_params_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
{'bootstrap': True,
 'max_depth': 126,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 176}


In [11]:
rf_random_best_params = RandomForestRegressor(**rf_random.best_params_)
rf_random_best_params
random_scores = cross_validate(rf_random_best_params, X_train, y_train, cv=10, scoring=('r2','neg_root_mean_squared_error'), return_train_score=True)
print('RF (Random): Cross Validation')
print('R^2:', random_scores['test_r2'].mean())
print('RMSE:', -1 * random_scores['test_neg_root_mean_squared_error'].mean())

RF (Random): Cross Validation
R^2: 0.6321526032331317
RMSE: 6.751350584874527


In [8]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [60, 80, 126],
    'max_features': ['auto'],
    'min_samples_leaf': [2, 4, 8],
    'min_samples_split': [2, 6, 10],
    'n_estimators': [120,150,176]
}
# Create a base model
rf = RandomForestRegressor(random_state = 1)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, return_train_score=True)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 108 candidates, totalling 540 fits


{'bootstrap': True,
 'max_depth': 75,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 130}

In [9]:
grid_search_best_params = RandomForestRegressor(**grid_search.best_params_)
grid_search_best_params
grid_search = cross_validate(grid_search_best_params, X_train, y_train, cv=10, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True)
print('RF (Grid1): Cross Validation')
print('R^2:', grid_search['test_r2'].mean())
print('RMSE:', -1 * grid_search['test_neg_root_mean_squared_error'].mean())

RF (Grid1): Cross Validation
R^2: 0.6215322634545453
RMSE: 6.833464204676934


In [11]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [60,126],
    'max_features': ['auto'],
    'min_samples_leaf': [2],
    'min_samples_split': [2],
    'n_estimators': [176, 150]
}
# Create a base model
rf = RandomForestRegressor(random_state = 1)

# Instantiate the grid search model
grid_search2 = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, return_train_score=True)
grid_search2.fit(X_train, y_train)
grid_search2.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


{'bootstrap': True,
 'max_depth': 75,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 176}

In [12]:
grid_search2_best_params = RandomForestRegressor(**grid_search2.best_params_)
grid_search2_best_params
#random_state added here
grid_search2 = cross_validate(grid_search2_best_params, X_train, y_train, cv=10, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True)
print('RF (Grid2): Cross Validation')
print('R^2:', grid_search2['test_r2'].mean())
print('RMSE:', -1 * grid_search2['test_neg_root_mean_squared_error'].mean())

RF (Grid2): Cross Validation
R^2: 0.6274883440037197
RMSE: 6.786024518453866


In [12]:
print('Performance on validation set')
print('RF (Base): Cross Validation')
print('R^2:', base_scores['test_r2'].mean())
print('RMSE:', -1 * base_scores['test_neg_root_mean_squared_error'].mean())
print()
print('RF (Random): Cross Validation')
print('R^2:', random_scores['test_r2'].mean())
print('RMSE:', -1 * random_scores['test_neg_root_mean_squared_error'].mean())
print()
print('RF (Grid1): Cross Validation')
print('R^2:', grid_search['test_r2'].mean())
print('RMSE:', -1 * grid_search['test_neg_root_mean_squared_error'].mean())
print()
print('RF (Grid2): Cross Validation')
print('R^2:', grid_search2['test_r2'].mean())
print('RMSE:', -1 * grid_search2['test_neg_root_mean_squared_error'].mean())
print()

Performance on validation set
RF (Base): Cross Validation
R^2: 0.6172674143067256
RMSE: 6.887389039614378

RF (Random): Cross Validation
R^2: 0.6283728809866675
RMSE: 6.777115053969661

RF (Grid1): Cross Validation
R^2: 0.6217943867431124
RMSE: 6.855779859278131

RF (Grid2): Cross Validation
R^2: 0.6261777291191472
RMSE: 6.807110377515014



In [13]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import math
grid_search2_best_params.fit(X_train, y_train)
predictions_test_set = grid_search2_best_params.predict(X_test)
r2_score_test_set = r2_score(y_test, predictions_test_set)
mse_test_set = mean_squared_error(y_test, predictions_test_set)
rmse_test_set = math.sqrt(mse_test_set)
print('Performance on test set')
print('R^2:', r2_score_test_set)
print('RMSE:', rmse_test_set)

Performance on test set
R^2: 0.6718009184705624
RMSE: 6.045506349332103


In [4]:
from sklearn.ensemble import RandomForestRegressor
final_rf = RandomForestRegressor(bootstrap = True, max_depth = 36, max_features = 'auto', min_samples_leaf = 2, min_samples_split = 5, n_estimators = 170)

final_rf_cv = cross_validate(final_rf, X_train, y_train, cv=10, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True)
print('RF (Final): Cross Validation')
print('R^2:', final_rf_cv['test_r2'].mean())
print('RMSE:', -1 * final_rf_cv['test_neg_root_mean_squared_error'].mean())

final_rf.fit(X_train, y_train)

predictions_test_set = final_rf.predict(X_test)
r2_score_test_set = r2_score(y_test, predictions_test_set)
mse_test_set = mean_squared_error(y_test, predictions_test_set)
rmse_test_set = math.sqrt(mse_test_set)
print('Performance on test set')
print('R^2:', r2_score_test_set)
print('RMSE:', rmse_test_set)

RF (Final): Cross Validation
R^2: 0.6257825021585689
RMSE: 6.805911927242777


NameError: name 'r2_score' is not defined