In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import numpy as np

In [2]:
# Load the v1 cleaned data set from the sibling data directory. The bare
# `df` on the last line renders the frame as the cell's output.
df = pd.read_csv(
    "../data/cleaned_data_set_v1.csv",
    sep=",",
)
df

Unnamed: 0,Min Players,Max Players,Play Time,Min Age,Users Rated,Rating Average,BGG Rank,Complexity Average,Owned Users,Mechanics,Domains
0,1,4,120.0,14,42055,8.79244,1,3.8604,68323.0,"Action Queue, Action Retrieval, Campaign / Bat...","Strategy Games, Thematic Games"
1,2,4,60.0,13,41643,8.61278,2,2.8405,65294.0,"Action Points, Cooperative Game, Hand Manageme...","Strategy Games, Thematic Games"
2,2,4,120.0,14,19217,8.66337,3,3.9129,28785.0,"Hand Management, Income, Loans, Market, Networ...",Strategy Games
3,1,5,120.0,12,64864,8.43254,4,3.2406,87099.0,"Card Drafting, Drafting, End Game Bonuses, Han...",Strategy Games
4,3,6,,14,13468,,5,4.2219,16831.0,"Action Drafting, Area Majority / Influence, Ar...","Strategy Games, Thematic Games"
...,...,...,...,...,...,...,...,...,...,...,...
20338,2,2,30.0,4,1340,2.27856,20340,1.0000,427.0,Unknown,Children's Games
20339,2,99,60.0,5,2154,2.85331,20341,1.0455,1533.0,"Betting and Bluffing, Bingo, Pattern Recognition",Party Games
20340,2,4,30.0,3,4006,3.17792,20342,1.0779,5788.0,Roll / Spin and Move,Children's Games
20341,2,6,30.0,3,3783,2.85567,20343,1.0201,4400.0,"Dice Rolling, Grid Movement, Race, Roll / Spin...",Children's Games


In [3]:
# Columns fed to the model as predictors of "Rating Average".
features = [
    "Min Players",
    "Max Players",
    "Play Time",
    "Min Age",
    "Users Rated",
    "BGG Rank",
    "Complexity Average",
    "Owned Users",
]

# Partition the rows by whether the target is known: rows with a rating are
# used for fitting, rows without one are the ones to be imputed later.
rating_is_missing = df["Rating Average"].isna()
df_train = df[~rating_is_missing]
df_missing = df[rating_is_missing]

X_train, y_train = df_train[features], df_train["Rating Average"]
X_missing = df_missing[features]

# Hold out 20% of the labelled rows for validation. Note the naming:
# X_train_full / y_train_full are the 80% portion used for fitting, while
# X_train / y_train above are all labelled rows.
X_train_full, X_val, y_train_full, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Model pipeline: mean-impute missing feature values, standardise the
# features, then fit a random forest with a fixed seed.
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

# Hyper-parameter grid for the "regressor" step.
# 3 * 4 * 3 * 3 * 3 = 324 candidates; with cv=3 that is 972 fits.
param_grid = {
    "regressor__n_estimators": [50, 100, 200],
    "regressor__max_depth": [None, 10, 20, 30],
    "regressor__min_samples_split": [2, 5, 10],
    "regressor__min_samples_leaf": [1, 2, 4],
    "regressor__max_features": [None, "sqrt", "log2"],
}

# Exhaustive search scored by negative MSE, using every available core
# (n_jobs=-1) and logging each fit (verbose=2).
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_full, y_train_full)

# Keep the best pipeline found by the search for evaluation and prediction.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Parameters: {'regressor__max_depth': 20, 'regressor__max_features': None, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}


In [5]:
# Score the tuned pipeline on the held-out validation split.
y_pred = best_model.predict(X_val)

# Compute every metric first ...
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE above
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
evs = explained_variance_score(y_val, y_pred)

# ... then report them together.
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Explained Variance Score: {evs}")

Mean Squared Error (MSE): 0.06037179086002792
Root Mean Squared Error (RMSE): 0.24570671716505416
R-squared (R²): 0.933123800890686
Mean Absolute Error (MAE): 0.13419714064138916
Explained Variance Score: 0.9331593386231605


In [6]:
# Fill the missing "Rating Average" values with predictions from the tuned
# pipeline.
#
# The target rows are addressed through X_missing.index instead of a freshly
# computed NaN mask. That ties each prediction to the exact row it was made
# for, and it keeps this cell re-runnable: once the NaNs have been filled a
# recomputed mask would select zero rows and the assignment would fail with a
# length mismatch.
#
# Prediction is skipped when nothing is missing, because scikit-learn
# estimators reject zero-row input.
if len(X_missing) > 0:
    y_missing_pred = best_model.predict(X_missing)
    df.loc[X_missing.index, "Rating Average"] = y_missing_pred
else:
    # Keep the name defined so later cells can still reference it.
    y_missing_pred = np.empty(0)

df

Unnamed: 0,Min Players,Max Players,Play Time,Min Age,Users Rated,Rating Average,BGG Rank,Complexity Average,Owned Users,Mechanics,Domains
0,1,4,120.0,14,42055,8.792440,1,3.8604,68323.0,"Action Queue, Action Retrieval, Campaign / Bat...","Strategy Games, Thematic Games"
1,2,4,60.0,13,41643,8.612780,2,2.8405,65294.0,"Action Points, Cooperative Game, Hand Manageme...","Strategy Games, Thematic Games"
2,2,4,120.0,14,19217,8.663370,3,3.9129,28785.0,"Hand Management, Income, Loans, Market, Networ...",Strategy Games
3,1,5,120.0,12,64864,8.432540,4,3.2406,87099.0,"Card Drafting, Drafting, End Game Bonuses, Han...",Strategy Games
4,3,6,,14,13468,8.453179,5,4.2219,16831.0,"Action Drafting, Area Majority / Influence, Ar...","Strategy Games, Thematic Games"
...,...,...,...,...,...,...,...,...,...,...,...
20338,2,2,30.0,4,1340,2.278560,20340,1.0000,427.0,Unknown,Children's Games
20339,2,99,60.0,5,2154,2.853310,20341,1.0455,1533.0,"Betting and Bluffing, Bingo, Pattern Recognition",Party Games
20340,2,4,30.0,3,4006,3.177920,20342,1.0779,5788.0,Roll / Spin and Move,Children's Games
20341,2,6,30.0,3,3783,2.855670,20343,1.0201,4400.0,"Dice Rolling, Grid Movement, Race, Roll / Spin...",Children's Games


In [7]:
# Write the frame out as the v2 data set. index=False leaves the DataFrame
# index out of the file; every column of df is written as-is.
df.to_csv(
    "../data/cleaned_data_set_v2.csv",
    sep=",",
    index=False,
)