In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.preprocessing import StandardScaler
import joblib

In [7]:
df = pd.read_csv('../../data/training_data.csv',sep=',')
df.drop(columns=['Complexity Average','Owned Users'],inplace=True)
df

Unnamed: 0,Min Players,Max Players,Play Time,Min Age,Users Rated,Rating Average,Abstract Games,Children's Games,Customizable Games,Family Games,...,Wargames,Social Interaction,Strategy and Planning,Conflict Resolution,Exploration,Randomness and Luck,Resource Management,Puzzle Solving,Miscellaneous / Other,Collaboration
0,1,4,120.000000,14,42055,8.792440,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
1,2,4,60.000000,13,41643,8.612780,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
2,2,4,120.000000,14,19217,8.663370,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,1,5,120.000000,12,64864,8.432540,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
4,3,6,431.858995,14,13468,8.453179,0,0,0,0,...,0,1,1,1,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20315,2,2,30.000000,4,1340,2.278560,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
20316,2,99,60.000000,5,2154,2.853310,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
20317,2,4,30.000000,3,4006,3.177920,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
20318,2,6,30.000000,3,3783,2.855670,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
X = df.drop(columns=['Rating Average'])
y = df['Rating Average']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_gbr = Pipeline([
    ('scaler', StandardScaler()), 
    ('gbr', GradientBoostingRegressor(random_state=42))  
])

In [28]:
X

Unnamed: 0,Min Players,Max Players,Play Time,Min Age,Users Rated,Abstract Games,Children's Games,Customizable Games,Family Games,Party Games,...,Wargames,Social Interaction,Strategy and Planning,Conflict Resolution,Exploration,Randomness and Luck,Resource Management,Puzzle Solving,Miscellaneous / Other,Collaboration
0,1,4,120.000000,14,42055,0,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
1,2,4,60.000000,13,41643,0,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
2,2,4,120.000000,14,19217,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,1,5,120.000000,12,64864,0,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
4,3,6,431.858995,14,13468,0,0,0,0,0,...,0,1,1,1,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20315,2,2,30.000000,4,1340,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
20316,2,99,60.000000,5,2154,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
20317,2,4,30.000000,3,4006,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20318,2,6,30.000000,3,3783,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
param_grid_gbr = {
    'gbr__n_estimators': [100, 200, 300],  
    'gbr__learning_rate': [0.01, 0.1, 0.05],
    'gbr__max_depth': [3, 4, 5],  
    'gbr__subsample': [0.8, 1.0],  
    'gbr__min_samples_split': [2, 5, 10]  
}

grid_search = GridSearchCV(pipeline_gbr, param_grid_gbr, cv=5, n_jobs=1, scoring='neg_mean_squared_error', verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END gbr__learning_rate=0.01, gbr__max_depth=3, gbr__min_samples_split=2, gbr__n_estimators=100, gbr__subsample=0.8; total time=   0.6s
[CV] END gbr__learning_rate=0.01, gbr__max_depth=3, gbr__min_samples_split=2, gbr__n_estimators=100, gbr__subsample=0.8; total time=   0.6s
[CV] END gbr__learning_rate=0.01, gbr__max_depth=3, gbr__min_samples_split=2, gbr__n_estimators=100, gbr__subsample=0.8; total time=   0.6s
[CV] END gbr__learning_rate=0.01, gbr__max_depth=3, gbr__min_samples_split=2, gbr__n_estimators=100, gbr__subsample=0.8; total time=   0.5s
[CV] END gbr__learning_rate=0.01, gbr__max_depth=3, gbr__min_samples_split=2, gbr__n_estimators=100, gbr__subsample=0.8; total time=   0.6s
[CV] END gbr__learning_rate=0.01, gbr__max_depth=3, gbr__min_samples_split=2, gbr__n_estimators=100, gbr__subsample=1.0; total time=   0.6s
[CV] END gbr__learning_rate=0.01, gbr__max_depth=3, gbr__min_samples_split=2, gbr__n_estimators=1

KeyboardInterrupt: 

In [1]:
y_pred = grid_search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

NameError: name 'grid_search' is not defined

In [8]:
joblib.dump(grid_search.best_estimator_, '../../models/rating_gradient.pkl')

['../../models/rating_gradient.pkl']