In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from category_encoders.target_encoder import TargetEncoder
from mango.tuner import Tuner
from scipy.stats import uniform

In [5]:

df = pd.read_csv("2023-2024.csv") 
df.columns = df.columns.str.strip()


In [6]:
num_features = ['POINTS_5GAME_AVG', 'MINUTES_5GAME_AVG', 'REBOUNDS_5GAME_AVG', 'ASSISTS_5GAME_AVG',
                'STEALS_5GAME_AVG', 'BLOCKS_5GAME_AVG', 'TEAM_E_OFF_RATING', 'TEAM_E_DEF_RATING', 
                'TEAM_E_PACE', 'OPP_E_OFF_RATING', 'OPP_E_DEF_RATING', 'OPP_E_PACE']

cat_features = ['PLAYER_NAME', 'OPP_TEAM', 'COURT']

In [7]:
y = df['POINTS']  

X = df[num_features + cat_features]

In [8]:
assert all(col in df.columns for col in num_features + cat_features), "Some columns are missing!"

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', TargetEncoder(), cat_features)  
    ]
)

xgb_model = XGBRegressor(random_state=42)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

In [10]:
df.columns

Index(['PLAYER_NAME', 'MIN', 'POINTS', 'REBOUNDS', 'ASSISTS', 'STEALS',
       'BLOCKS', 'USG_PCT', 'MINUTES_5GAME_AVG', 'POINTS_5GAME_AVG',
       'REBOUNDS_5GAME_AVG', 'ASSISTS_5GAME_AVG', 'STEALS_5GAME_AVG',
       'BLOCKS_5GAME_AVG', 'OPP_TEAM', 'COURT', 'BTB', 'TEAM_E_OFF_RATING',
       'TEAM_E_DEF_RATING', 'TEAM_E_PACE', 'OPP_E_OFF_RATING',
       'OPP_E_DEF_RATING', 'OPP_E_PACE'],
      dtype='object')

In [11]:
param_grid = {
    "regressor__n_estimators": [50, 100, 200, 500, 1000],
    "regressor__learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
    "regressor__max_depth": [3, 6, 7, 10, 15, 20],
    "regressor__subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "regressor__colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
    "regressor__gamma": [0, 0.05, 0.1, 0.2, 0.3],
    "regressor__min_child_weight": [1, 2, 5, 10, 15],
    "regressor__max_leaves": [31, 63, 127, 255],
    "regressor__scale_pos_weight": [1, 5, 10, 100],
    "regressor__reg_lambda": [0, 0.1, 1],
    "regressor__reg_alpha": [0, 0.1, 1],
    "regressor__tree_method": ['auto', 'hist', 'gpu_hist'],
    "regressor__booster": ['gbtree', 'dart'],
    "regressor__n_jobs": [-1]
}


grid_search = GridSearchCV(estimator=model_pipeline, 
                           param_grid=param_grid, 
                           cv=3, 
                           scoring="neg_mean_squared_error", 
                           verbose=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 81000000 candidates, totalling 243000000 fits


In [110]:
grid_search.best_params_

{'regressor__colsample_bytree': 1.0,
 'regressor__gamma': 0,
 'regressor__learning_rate': 0.01,
 'regressor__max_depth': 3,
 'regressor__n_estimators': 500,
 'regressor__subsample': 0.8}

In [111]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [112]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2:.4f}")

Mean Squared Error: 28.2972
R-squared: 0.6571
