In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
import xgboost as xgb
import pandas as pd
import joblib

In [2]:
# Load the training table and remove the 'Rating Average' and 'Owned Users'
# columns in a single chained expression, so the cell rebuilds `df` from the
# file every time it runs.
# NOTE(review): the rendered preview for this cell lists an 'Unnamed: 0'
# header -- confirm whether the CSV carries a saved index column that would
# end up among the features.
df = (
    pd.read_csv('../../data/training_data.csv', sep=',')
    .drop(columns=['Rating Average', 'Owned Users'])
)
df

Unnamed: 0,Min Players,Max Players,Play Time,Min Age,Users Rated,Complexity Average,Abstract Games,Children's Games,Customizable Games,Family Games,...,Wargames,Social Interaction,Strategy and Planning,Conflict Resolution,Exploration,Randomness and Luck,Resource Management,Puzzle Solving,Miscellaneous / Other,Collaboration
0,1,4,120.000000,14,42055,3.8604,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
1,2,4,60.000000,13,41643,2.8405,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
2,2,4,120.000000,14,19217,3.9129,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,1,5,120.000000,12,64864,3.2406,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
4,3,6,431.858995,14,13468,4.2219,0,0,0,0,...,0,1,1,1,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20315,2,2,30.000000,4,1340,1.0000,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
20316,2,99,60.000000,5,2154,1.0455,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
20317,2,4,30.000000,3,4006,1.0779,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
20318,2,6,30.000000,3,3783,1.0201,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [3]:
# Target is the 'Complexity Average' column; every other column of `df` is
# used as a feature.
y = df['Complexity Average']
X = df.drop(columns=['Complexity Average'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base regressors, all seeded with the same value.
rf = RandomForestRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)

# Combine the base regressors in a VotingRegressor. The member names
# ('rf', 'gbr', 'xgb') are the ones the hyperparameter keys address.
ensemble_model = VotingRegressor(
    estimators=[
        ('rf', rf),
        ('gbr', gbr),
        ('xgb', xgb_model),
    ]
)

# Two-step pipeline: standardise the features, then fit the ensemble.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('ensemble', ensemble_model),
    ]
)

In [4]:
# Search ranges for the pipeline's hyperparameters. Keys use scikit-learn's
# nested-parameter syntax '<step>__<member>__<parameter>': 'ensemble' is the
# pipeline step, and 'rf' / 'gbr' / 'xgb' are the VotingRegressor member
# names. Each value is a (low, high) pair bounding the search dimension.
param_space = {
    'ensemble__rf__n_estimators': (50, 200),
    'ensemble__rf__max_depth': (3, 20),
    'ensemble__gbr__n_estimators': (50, 200),
    'ensemble__gbr__learning_rate': (0.01, 0.1),
    'ensemble__gbr__max_depth': (3, 10),
    'ensemble__xgb__n_estimators': (50, 200),
    'ensemble__xgb__learning_rate': (0.01, 0.1),
    'ensemble__xgb__max_depth': (3, 10),
    'ensemble__xgb__subsample': (0.7, 1.0),
    'ensemble__xgb__colsample_bytree': (0.7, 1.0),
}

# Bayesian hyperparameter search: 100 candidates, each scored with 5-fold
# cross-validation on negative mean squared error, using all available cores.
# random_state seeds the optimizer's candidate sampling; without it the
# sequence of sampled candidates -- and therefore the reported "best"
# hyperparameters -- can change from one run of the notebook to the next.
# 42 matches the seed used for the split and the base models.
bayes_search = BayesSearchCV(
    pipeline,
    param_space,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)

# Run the search on the training split only; the test split stays untouched
# for the final evaluation.
bayes_search.fit(X_train, y_train)

best_params = bayes_search.best_params_
print(f"Best hyperparameters: {best_params}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [5]:
# Predict on the held-out test features with the fitted search object.
y_pred = bayes_search.predict(X_test)

# Evaluate the three regression metrics against the held-out targets in one
# pass over the metric functions.
mse, mae, r2 = (
    score_fn(y_test, y_pred)
    for score_fn in (mean_squared_error, mean_absolute_error, r2_score)
)

# One line per metric, emitted by a single print call.
print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R² Score: {r2}",
    sep="\n",
)

Mean Squared Error: 0.22435379208200648
Mean Absolute Error: 0.33050675614016767
R² Score: 0.6832146635226987


In [7]:
# Persist the best pipeline found by the search. The call stays the cell's
# last expression so its return value (the list of written file names) is
# displayed as the cell output.
joblib.dump(
    value=bayes_search.best_estimator_,
    filename='../../models/complexity_voting.pkl',
)

['../../models/complexity_voting.pkl']