# Hyperparameter Tuning for Random Forest Model

In this notebook, I tune the hyperparameters of a Random Forest model that predicts 
the total points of Fantasy Premier League (FPL) players.

I experiment with:
- **RandomizedSearchCV** to broadly explore the parameter space.
- **GridSearchCV** to refine tuning around promising parameter ranges.

The training data is loaded from the `data/model_ready` folder, which contains 
model-ready player datasets for each season.

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [3]:
# Paths are relative to the project root.
INPUT_DIR = "data/model_ready"
MODEL_PATH = "models/random_forest_model.pkl"

# The notebook is expected to be started one level below the project root.
# Only move up when the data folder is not already reachable from the current
# directory; an unconditional chdir("..") would climb one level higher every
# time this cell is re-run and break the relative paths above.
if os.path.isdir(INPUT_DIR):
    project_root = os.getcwd()
else:
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    os.chdir(project_root)

print("Current working directory set to project root:", os.getcwd())


Current working directory set to project root: c:\FPL AI\FPL_AI_V1b


In [4]:
def load_model_ready_data(input_dir=None):
    """
    Load all preprocessed training CSVs from a directory and combine them.

    Parameters
    ----------
    input_dir : str or path-like, optional
        Directory to scan. When omitted, the module-level ``INPUT_DIR`` is used.

    Returns
    -------
    pandas.DataFrame
        The rows of every ``*_model_ready.csv`` file in the directory,
        concatenated in sorted file-name order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no ``*_model_ready.csv`` file.
    """
    if input_dir is None:
        input_dir = INPUT_DIR

    # os.listdir returns entries in arbitrary order; sorting keeps the combined
    # row order (and therefore any seeded split of it) reproducible.
    csv_names = sorted(
        name for name in os.listdir(input_dir) if name.endswith("_model_ready.csv")
    )
    if not csv_names:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No '*_model_ready.csv' files found in {os.path.abspath(input_dir)!r}"
        )

    frames = [pd.read_csv(os.path.join(input_dir, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

df = load_model_ready_data()

# Fail fast if the target column is missing, before any expensive tuning runs.
assert "total_points" in df.columns, "expected a 'total_points' target column"

print(f"Data loaded with shape: {df.shape}")
df.head()

Data loaded with shape: (103898, 31)


Unnamed: 0,element_type,total_points,gw,prev_total_points,prev_goals_scored,prev_assists,prev_minutes,prev_goals_conceded,prev_creativity,prev_influence,...,current_minutes,current_goals_conceded,current_creativity,current_influence,current_threat,current_bonus,current_ict_index,current_clean_sheets,current_cards_per_90,current_points_per_90
0,0.0,10,1,131.0,0.0,0.0,3131.0,37.0,0.0,702.2,...,90,2,0.0,11.8,0.0,0,1.2,0,0.0,0.0
1,0.0,10,2,131.0,0.0,0.0,3131.0,37.0,0.0,702.2,...,180,4,0.0,36.0,0.0,0,3.6,0,0.0,0.0
2,0.0,10,3,131.0,0.0,0.0,3131.0,37.0,0.0,702.2,...,270,9,0.0,79.0,0.0,0,7.9,0,0.0,1.33
3,0.0,10,4,131.0,0.0,0.0,3131.0,37.0,0.0,702.2,...,270,9,0.0,79.0,0.0,0,7.9,0,0.0,1.33
4,0.0,10,5,131.0,0.0,0.0,3131.0,37.0,0.0,702.2,...,270,9,0.0,79.0,0.0,0,7.9,0,0.0,1.33


In [6]:
def rf_randomized_search(
    df,
    param_distributions=None,
    n_iter=30,
    cv=5,
    test_size=0.2,
    random_state=42,
):
    """
    Tune a RandomForestRegressor on ``total_points`` with RandomizedSearchCV.

    The frame is split into a training part and a hold-out part; only the
    training part is used for the cross-validated search. The hold-out part
    is not returned, so a caller who wants to evaluate on it must recreate the
    split with the same ``test_size`` and ``random_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``total_points`` column; every other column is used
        as a feature.
    param_distributions : dict, optional
        Search space passed to RandomizedSearchCV. Defaults to the broad
        space defined below.
    n_iter : int, default 30
        Number of parameter settings sampled.
    cv : int, default 5
        Number of cross-validation folds.
    test_size : float, default 0.2
        Fraction of rows held out from the search.
    random_state : int, default 42
        Seed shared by the split, the forest and the parameter sampling.

    Returns
    -------
    RandomForestRegressor
        ``best_estimator_`` of the fitted search.

    NOTE(review): rows are split at random. If the frame holds several rows
    per player, closely related rows can land on both sides of the split --
    confirm this is acceptable for the reported scores.
    """
    if param_distributions is None:
        param_distributions = {
            'n_estimators': [200, 500, 800],
            'max_depth': [None, 10, 20, 40],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        }

    X = df.drop(columns=["total_points"])
    y = df["total_points"]

    # Only the training part is needed here; the hold-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # NOTE(review): both the forest and the search request all cores
    # (n_jobs=-1); consider parallelising at one level only if the machine
    # becomes oversubscribed or runs out of memory.
    model = RandomForestRegressor(random_state=random_state, n_jobs=-1)

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=2,
        random_state=random_state,
        n_jobs=-1,
    )

    random_search.fit(X_train, y_train)

    print("Best parameters (Randomized):", random_search.best_params_)
    # The scorer is negated MSE, so flip the sign for display.
    print("Best CV MSE:", -random_search.best_score_)

    return random_search.best_estimator_

best_random = rf_randomized_search(df)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters (Randomized): {'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40}
Best CV MSE: 108.52761045751052


In [7]:
def rf_grid_search(
    df,
    param_grid=None,
    cv=5,
    test_size=0.2,
    random_state=42,
):
    """
    Tune a RandomForestRegressor on ``total_points`` with GridSearchCV.

    The frame is split into a training part and a hold-out part; only the
    training part is used for the cross-validated search. The hold-out part
    is not returned, so a caller who wants to evaluate on it must recreate the
    split with the same ``test_size`` and ``random_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``total_points`` column; every other column is used
        as a feature.
    param_grid : dict, optional
        Grid passed to GridSearchCV. Defaults to the narrow grid defined
        below, which varies only ``n_estimators`` and ``max_depth``.
    cv : int, default 5
        Number of cross-validation folds.
    test_size : float, default 0.2
        Fraction of rows held out from the search.
    random_state : int, default 42
        Seed shared by the split and the forest.

    Returns
    -------
    RandomForestRegressor
        ``best_estimator_`` of the fitted search.

    NOTE(review): rows are split at random. If the frame holds several rows
    per player, closely related rows can land on both sides of the split --
    confirm this is acceptable for the reported scores.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [700, 800, 1000],
            'max_depth': [None, 30, 40],
            'min_samples_split': [2],
            'min_samples_leaf': [1],
            'max_features': ['log2']
        }

    X = df.drop(columns=["total_points"])
    y = df["total_points"]

    # Only the training part is needed here; the hold-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # NOTE(review): both the forest and the search request all cores
    # (n_jobs=-1); consider parallelising at one level only if the machine
    # becomes oversubscribed or runs out of memory.
    model = RandomForestRegressor(random_state=random_state, n_jobs=-1)

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=2,
        n_jobs=-1,
    )

    grid_search.fit(X_train, y_train)

    print("Best parameters (Grid):", grid_search.best_params_)
    # The scorer is negated MSE, so flip the sign for display.
    print("Best CV MSE:", -grid_search.best_score_)

    return grid_search.best_estimator_

best_grid = rf_grid_search(df)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters (Grid): {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Best CV MSE: 108.36911909738258


In [8]:
# Evaluate both models on a hold-out set
# The split uses the same test_size=0.2 / random_state=42 as the split inside
# rf_randomized_search and rf_grid_search, so X_test holds the rows that
# neither search was fitted on. Keep these values in sync with those functions.
X = df.drop(columns=["total_points"])
y = df["total_points"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

y_pred_random = best_random.predict(X_test)
y_pred_grid = best_grid.predict(X_test)

print("Randomized Search R²:", r2_score(y_test, y_pred_random))
print("Grid Search R²:", r2_score(y_test, y_pred_grid))

# The searches were scored on MSE, so report hold-out MSE as well to make the
# numbers directly comparable with the "Best CV MSE" values printed above.
print("Randomized Search MSE:", mean_squared_error(y_test, y_pred_random))
print("Grid Search MSE:", mean_squared_error(y_test, y_pred_grid))

Randomized Search R²: 0.9623509735414786
Grid Search R²: 0.962297466969381
