<a href="https://colab.research.google.com/github/ConorD28/NBA-Research/blob/main/NBA_Finals_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import math
# Load the stats table and split it by position: the trailing 26 columns are
# kept as `playoffs` (later cells read their targets from it) and every column
# before them becomes a model input.
df = pd.read_csv('Finals_Stats.csv')
inputs, playoffs = df.iloc[:, :-26], df.iloc[:, -26:]

# Total number of missing cells in each half; 0 means there are no NA values.
print(inputs.isna().sum().sum())
print(playoffs.isna().sum().sum())

0
0




# **Correlation/Scores**

In [3]:
import scipy.stats
def correlation(dataset, threshold, target):
  """Keep the columns of ``dataset`` that correlate with ``target``.

  For every column both the Pearson and the Spearman correlation with
  ``target`` are computed. A column is kept when the absolute value of either
  one is strictly greater than ``threshold``; its score is the larger of the
  two absolute values.

  Args:
    dataset: DataFrame whose columns are the candidate inputs.
    threshold: Absolute-correlation cut-off (exclusive).
    target: Series to correlate against. A NumPy array is wrapped in a Series.

  Returns:
    A tuple ``(selected, corrs)``:
      * ``selected`` - the kept columns, ordered by decreasing score.
      * ``corrs`` - a one-row DataFrame (row label ``'corrs'``) holding the
        score of each kept column, in the same column order.
    When no column passes the threshold both elements are empty DataFrames,
    so callers can always unpack two values.
  """
  if isinstance(target, np.ndarray):
    target = pd.Series(target)

  kept_columns = []  # one Series per column that meets the threshold
  scores = []        # matching max(|pearson|, |spearman|) values
  for col in dataset.columns:
    column = dataset.loc[:, col]
    corS = column.corr(target, method='spearman') # 'kendall'
    corP = column.corr(target)
    # A NaN correlation compares False here, so such a column is skipped.
    if (abs(corP) > threshold) or (abs(corS) > threshold):
      kept_columns.append(column)
      scores.append(max(abs(corP), abs(corS)))

  if len(kept_columns) == 0:
    # Return the same 2-tuple shape as the normal path; a single DataFrame
    # here made every `a, b = correlation(...)` caller fail while unpacking.
    return pd.DataFrame(), pd.DataFrame()

  # Each kept Series becomes a row, so the score can be attached as an extra
  # 'corrs' column and used to sort the features.
  df = pd.DataFrame(kept_columns)
  df.insert(len(df.columns), 'corrs', scores)
  df = df.sort_values(by=df.columns[-1], ascending=False, key=abs)
  # Back to samples-as-rows; the scores are now the last row.
  df = df.transpose()
  df_corrs = df.iloc[-1:, :]
  df = df.drop(df.tail(1).index)
  return df, df_corrs

In [4]:
from sklearn.model_selection import train_test_split
from numpy.random.mtrand import random_sample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [5]:
def Scores(y, y_pred):
  """Return the range-normalized RMSE and MAE of ``y_pred`` against ``y``.

  Both errors are divided by the absolute spread ``y.max() - y.min()`` of the
  true values.

  Args:
    y: True values; must provide ``max()`` and ``min()``.
    y_pred: Predicted values aligned with ``y``.

  Returns:
    A tuple ``(normalized_rmse, normalized_mae)``.
  """
  rmse = np.sqrt(mean_squared_error(y, y_pred))
  mae = mean_absolute_error(y, y_pred)
  spread = abs(y.max() - y.min())
  return rmse / spread, mae / spread

In [6]:
def Predict_Scores(model, X_tr, X_te, y_tr, y_te, t_sc):#, predict_df):
  """Predict with a fitted model and report errors in the target's original units.

  Predictions and true values are passed through ``t_sc.inverse_transform``
  before any error is computed.

  Args:
    model: Fitted estimator exposing ``predict``.
    X_tr: Training inputs.
    X_te: Test inputs; only used when ``y_te`` is non-empty.
    y_tr: Scaled training target (``.values`` is read from it).
    y_te: Scaled test target supporting ``reshape``; an empty object
      (``len(y_te) == 0``) means there is no test set.
    t_sc: Fitted target scaler.

  Returns:
    A 6-tuple: normalized training RMSE, normalized training MAE, absolute
    training errors (Series), absolute test errors, test predictions and
    training predictions (Series). The two test entries are the integer ``0``
    when there is no test set.
  """
  def to_original_units(values):
    # The scaler works on 2-D columns; hand it one and flatten the result.
    return t_sc.inverse_transform(values.reshape(-1, 1)).flatten()

  # Training side.
  train_pred = pd.Series(to_original_units(model.predict(X_tr)))
  train_true = pd.Series(to_original_units(y_tr.values))
  train_abs_error = np.abs(train_true - train_pred)
  NRMSE_tr, NMAE_tr = Scores(train_true, train_pred)

  # No held-out sample (final model): report placeholders for the test side.
  if len(y_te) == 0:
    return NRMSE_tr, NMAE_tr, train_abs_error, 0, 0, train_pred

  # Test side.
  test_pred = to_original_units(model.predict(X_te))
  test_true = to_original_units(y_te)
  test_abs_error = np.abs(test_true - test_pred)

  return NRMSE_tr, NMAE_tr, train_abs_error, test_abs_error, test_pred, train_pred#, predictions

# **ML Tuning Algorithms**

In [None]:
!pip install optuna
import optuna
!pip install joblib
import joblib
from sklearn.model_selection import cross_val_score

In [24]:
def Ridge_tune(X_train, y_train, cv_choice, num_trials, timeout_choice, save_path='PPG_Ridge_Finals.pkl'):
    """Tune a Ridge regressor with Optuna, refit it on all training data and return it.

    Args:
        X_train: Training inputs.
        y_train: Training target.
        cv_choice: ``cv`` argument forwarded to ``cross_val_score``.
        num_trials: Forwarded to ``study.optimize`` as ``n_trials``.
        timeout_choice: Forwarded to ``study.optimize`` as ``timeout``.
        save_path: Where the refit model is written with ``joblib.dump``.
            Defaults to the path the prediction cell loads from. Pass ``None``
            to skip saving, e.g. for cross-validation folds: this function is
            also reached from the parallel LOO-CV run, where every fold would
            otherwise overwrite the same file.

    Returns:
        A ``Ridge`` fitted on ``X_train``/``y_train`` with the best trial's
        parameters.
    """
    def objective(trial):
        alpha = trial.suggest_float("alpha", 1, 25, log=True)#1e-4, 10.0; Alpha is the regularization strength
        solver = trial.suggest_categorical("solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"])

        model = Ridge(alpha=alpha, solver=solver, random_state=28)
        score = cross_val_score(model, X_train, y_train, cv=cv_choice, scoring="neg_root_mean_squared_error").mean()
        # The scorer is the negated RMSE; flip the sign so the study minimizes RMSE.
        return -score

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=num_trials, timeout=timeout_choice)

    print("Best trial:")
    trial = study.best_trial
    print(f"  Params: {trial.params}")

    best_model = Ridge(**trial.params, random_state=28)
    best_model.fit(X_train, y_train)
    if save_path is not None:
        joblib.dump(best_model, save_path)
    return best_model

In [9]:
def Lasso_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a Lasso regressor with Optuna and return it refit on all training data.

  Args:
    X_train: Training inputs.
    y_train: Training target.
    cv_choice: ``cv`` argument forwarded to ``cross_val_score``.
    num_trials: Forwarded to ``study.optimize`` as ``n_trials``.
    timeout_choice: Forwarded to ``study.optimize`` as ``timeout``.

  Returns:
    A ``Lasso`` fitted on ``X_train``/``y_train`` with the best trial's
    parameters.
  """
  def cv_rmse(trial):
    # Search space: regularization strength, iteration cap, stopping tolerance.
    params = {
        "alpha": trial.suggest_float("alpha", 1e-2, 10, log=True),
        "max_iter": trial.suggest_int("max_iter", 1000, 10000, step=100),
        "tol": trial.suggest_float("tol", 1e-5, 1e-2, log=True),
    }
    candidate = Lasso(random_state=28, **params)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice, scoring="neg_root_mean_squared_error")
    # Scores are negated RMSE values; negate the mean to get a quantity to minimize.
    return -fold_scores.mean()

  study = optuna.create_study(direction="minimize")
  study.optimize(cv_rmse, n_trials=num_trials, timeout=timeout_choice)

  best_params = study.best_trial.params
  print("Best trial:")
  print(f"  Params: {best_params}")

  # Refit on the complete training set with the winning configuration.
  tuned = Lasso(random_state=28, **best_params)
  tuned.fit(X_train, y_train)
  return tuned

In [10]:
def Elastic_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune an ElasticNet regressor with Optuna and return it refit on all training data.

  Args:
    X_train: Training inputs.
    y_train: Training target.
    cv_choice: ``cv`` argument forwarded to ``cross_val_score``.
    num_trials: Forwarded to ``study.optimize`` as ``n_trials``.
    timeout_choice: Forwarded to ``study.optimize`` as ``timeout``.

  Returns:
    An ``ElasticNet`` fitted on ``X_train``/``y_train`` with the best trial's
    parameters.
  """
  def cv_rmse(trial):
    # Search space: regularization strength, L1/L2 mix, iteration cap, tolerance.
    params = {
        "alpha": trial.suggest_float("alpha", 1e-2, 10, log=True),
        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
        "max_iter": trial.suggest_int("max_iter", 1000, 10000, step=100),
        "tol": trial.suggest_float("tol", 1e-5, 1e-2, log=True),
    }
    candidate = ElasticNet(random_state=28, **params)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice, scoring="neg_root_mean_squared_error")
    # Scores are negated RMSE values; negate the mean to get a quantity to minimize.
    return -fold_scores.mean()

  study = optuna.create_study(direction="minimize")
  study.optimize(cv_rmse, n_trials=num_trials, timeout=timeout_choice)

  best_params = study.best_trial.params
  print("Best trial:")
  print(f"  Params: {best_params}")

  # Refit on the complete training set with the winning configuration.
  tuned = ElasticNet(random_state=28, **best_params)
  tuned.fit(X_train, y_train)
  return tuned

In [11]:
def RF_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a random-forest regressor with Optuna and return it refit on all training data.

  Args:
    X_train: Training inputs.
    y_train: Training target.
    cv_choice: ``cv`` argument forwarded to ``cross_val_score``.
    num_trials: Forwarded to ``study.optimize`` as ``n_trials``.
    timeout_choice: Forwarded to ``study.optimize`` as ``timeout``.

  Returns:
    A ``RandomForestRegressor`` fitted on ``X_train``/``y_train`` with the best
    parameters found.
  """
  def cv_rmse(trial):
    # Search space for the forest's size and the shape of its trees.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 45, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 45),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }
    candidate = RandomForestRegressor(random_state=28, n_jobs=-1, **params)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice, scoring='neg_root_mean_squared_error', n_jobs=-1)
    # Scores are negated RMSE values; negate the mean to get a quantity to minimize.
    return -np.mean(fold_scores)

  study = optuna.create_study(direction='minimize')
  study.optimize(cv_rmse, n_trials=num_trials, timeout=timeout_choice)
  print("Best parameters:", study.best_params)

  # Refit on the complete training set with the winning configuration.
  chosen = study.best_params
  tuned = RandomForestRegressor(
      n_estimators=chosen['n_estimators'],
      max_depth=chosen['max_depth'],
      min_samples_split=chosen['min_samples_split'],
      min_samples_leaf=chosen['min_samples_leaf'],
      max_features=chosen['max_features'],
      random_state=28,
      n_jobs=-1,
  )
  tuned.fit(X_train, y_train)
  return tuned

# **ML Algorithms**

In [14]:
def RLE_Model(xTrain, xTest, yTrain, yTest, choice, predict_df, tar_sca):
  """Tune and score a Ridge, Lasso or ElasticNet model.

  Args:
    xTrain: Training inputs.
    xTest: Test inputs (may be empty when there is no held-out sample).
    yTrain: Scaled training target.
    yTest: Scaled test target (empty when there is no held-out sample).
    choice: One of ``"Ridge"``, ``"Lasso"`` or ``"Elastic"``.
    predict_df: Unused; kept so existing callers keep working.
    tar_sca: Fitted target scaler, forwarded to ``Predict_Scores``.

  Returns:
    The tuple produced by ``Predict_Scores`` for the tuned model.

  Raises:
    ValueError: If ``choice`` is not one of the supported names. Previously
      this fell through and failed later on an unbound local variable.
  """
  if choice == "Ridge":
    pipeline = Ridge_tune(xTrain, yTrain, 10, 1000, 20)
  elif choice == "Lasso":
    pipeline = Lasso_tune(xTrain, yTrain, 10, 150, 10)
  elif choice == "Elastic":
    pipeline = Elastic_tune(xTrain, yTrain, 10, 300, 10)
  else:
    raise ValueError(f'Unknown model choice {choice!r}; expected "Ridge", "Lasso" or "Elastic"')

  modelResults = Predict_Scores(pipeline, xTrain, xTest, yTrain, yTest, tar_sca)
  return modelResults

In [15]:
from sklearn.ensemble import RandomForestRegressor
def RF_model(xTrain, xTest, yTrain, yTest, predict_df, tar_sca):
  """Tune a random forest on the training data and score it.

  ``predict_df`` is accepted but not used. The return value is whatever
  ``Predict_Scores`` produces for the tuned forest.
  """
  tuned_forest = RF_tune(xTrain, yTrain, 5, 200, 10)
  return Predict_Scores(tuned_forest, xTrain, xTest, yTrain, yTest, tar_sca)

# **Inputs/LOOCV Function**

In [17]:
from joblib import Parallel, delayed
def corr_matrix_reduce(x_train, x_test, threshold=0.9):
  """Drop training features that are highly correlated with an earlier feature.

  The Pearson correlation matrix of ``x_train`` is computed once with
  ``DataFrame.corr``. Walking the columns in order, a column is dropped when
  its absolute correlation with any column to its left is strictly greater
  than ``threshold``, so the first column of each correlated pair is kept.

  Args:
    x_train: Training inputs (numeric DataFrame).
    x_test: Test inputs; the same columns are dropped from it unless it is
      empty.
    threshold: Absolute-correlation cut-off (exclusive). Defaults to 0.9.

  Returns:
    A tuple ``(x_train_reduced, x_test)`` with the correlated columns removed.
  """
  # One vectorized call replaces a Python-level loop over every column pair.
  # This function is also reached from inside the parallel LOO-CV run, so
  # spawning another worker pool here only added overhead.
  correlation_matrix = x_train.corr()

  # above[i, j] is True when |corr(i, j)| > threshold; NaN compares False.
  above = correlation_matrix.abs().to_numpy() > threshold
  # Keep only pairs with i < j so each pair is looked at once and the later
  # column of the pair is the one marked for removal.
  drop_mask = np.triu(above, k=1).any(axis=0)
  features_to_drop = list(correlation_matrix.columns[drop_mask])

  # Drop the features from the original dataset
  x_train_reduced = x_train.drop(columns=features_to_drop)
  if not x_test.empty:
    x_test = x_test.drop(columns=features_to_drop)

  print("Original features:", x_train.shape[1])
  print("Features to drop:", len(features_to_drop))
  print("Reduced features:", x_train_reduced.shape[1])
  return x_train_reduced, x_test

In [18]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
def reduce_df(x_tr, x_te, y_tr, reduction_choice, if_final):
  """Project the inputs onto three components with PLS or PCA.

  Args:
    x_tr: Training inputs the reducer is fitted on.
    x_te: Test inputs to project with the fitted reducer.
    y_tr: Training target; only used by PLS.
    reduction_choice: ``"PLS"`` or ``"PCA"``.
    if_final: Only read by the PCA branch: ``"no"`` projects ``x_te``, any
      other value returns ``x_te`` unchanged.

  Returns:
    ``(train_components, test_components, reducer)``. The PLS branch returns
    DataFrames with columns PLS1..PLS3 (the test frame has no rows when
    ``x_te`` is empty); the PCA branch returns what ``PCA`` produces.

  Raises:
    ValueError: If ``reduction_choice`` is neither ``"PLS"`` nor ``"PCA"``.
      Previously this returned ``None``, which callers then failed to unpack.
  """
  if reduction_choice == "PLS":
    pls_columns = ["PLS1", "PLS2", "PLS3"]
    pls = PLSRegression(n_components=3)
    X_tr_pls = pls.fit_transform(x_tr, y_tr)[0]  #Extract transformed features
    if x_te.empty:
      X_te_pls = pd.DataFrame(columns=pls_columns)  #Create empty DataFrame with correct columns if x_te is empty
    else:
      X_te_pls = pls.transform(x_te)
    X_tr_pls = pd.DataFrame(X_tr_pls, columns=pls_columns)
    X_te_pls = pd.DataFrame(X_te_pls, columns=pls_columns)
    return X_tr_pls, X_te_pls, pls

  if reduction_choice == "PCA":
    pca = PCA(n_components=3, random_state=28) #n_components = None, 420
    X_tr_PCA = pca.fit_transform(x_tr)
    if if_final == "no":
      X_te_PCA = pca.transform(x_te)
    else:
      # Final model: no held-out rows to project, so x_te is passed through.
      X_te_PCA = x_te
    print("Mean:", pca.mean_)
    return X_tr_PCA, X_te_PCA, pca

  raise ValueError(f'Unknown reduction_choice {reduction_choice!r}; expected "PLS" or "PCA"')

In [19]:
def get_inputs(data_frame, y, tr_index, te_index, scaler_choice, thresh, if_final):
  """Scale the data, keep inputs correlated with the target and prune collinear ones.

  Both scalers are fitted on the training rows only and then applied to the
  held-out row.

  Args:
    data_frame: All candidate inputs, one row per sample.
    y: Target Series aligned with ``data_frame``.
    tr_index: Positional indices of the training rows (ignored when
      ``if_final == 'yes'``).
    te_index: Positional index of the single held-out row (ignored when
      ``if_final == 'yes'``).
    scaler_choice: ``"MMS"`` selects ``MinMaxScaler``; anything else selects
      ``StandardScaler``.
    thresh: Correlation threshold forwarded to ``correlation``.
    if_final: ``'yes'`` trains on every row with no test row; ``'no'`` uses
      the train/test split given by the indices.

  Returns:
    A 7-tuple ``(train_inputs, test_inputs, input_scaler, target_scaler,
    y_train, y_test, correlations_df)``. ``correlations_df`` is the score row
    from ``correlation`` and still contains the columns that
    ``corr_matrix_reduce`` removed from ``train_inputs``. With
    ``if_final == 'yes'`` the test inputs and ``y_test`` are empty DataFrames.
  """
  scaler_cls = MinMaxScaler if scaler_choice == "MMS" else StandardScaler
  scaler = scaler_cls()   # fitted on the inputs
  scaler2 = scaler_cls()  # fitted on the target

  if if_final == 'yes':
    data_scaled_train, data_scaled_test, y_train, y_test = data_frame, pd.DataFrame(), y, pd.DataFrame()
  else:
    data_scaled_train, data_scaled_test, y_train, y_test = data_frame.iloc[tr_index], data_frame.iloc[[te_index]], y.iloc[tr_index], y.iloc[te_index]

  train_scaled = pd.DataFrame(scaler.fit_transform(data_scaled_train), columns = data_frame.columns)
  # SOS_train_scaled = train_scaled.mul(train_scaled["SOS_Use_"], axis=0)
  # SOS_train_scaled = SOS_train_scaled.add_suffix('tim_SOS_Use')
  # train_scaled = pd.concat([train_scaled, SOS_train_scaled], axis=1)
  y_train = pd.Series(scaler2.fit_transform(y_train.values.reshape(-1, 1)).flatten())
  train_scaled_correlated, correlations_df = correlation(train_scaled, thresh, y_train) #
  train_scaled_correlated = pd.DataFrame(train_scaled_correlated)

  # prev_train_scaled_correlated = train_scaled_correlated
  # most_corr = correlations_df.columns[0]
  # second_most_corr = correlations_df.columns[1]
  # most_corr_train = prev_train_scaled_correlated.mul(train_scaled_correlated[most_corr], axis=0)
  # most_corr_train = most_corr_train.add_suffix("*")
  # most_corr_train = most_corr_train.add_suffix(most_corr)
  # train_scaled_correlated = pd.concat([train_scaled_correlated, most_corr_train], axis=1)
  # Second pass over the already-filtered columns; it re-scores whatever the
  # (currently disabled) interaction-feature experiment above would add.
  train_scaled_correlated, correlations_df = correlation(train_scaled_correlated, thresh, y_train)

  if if_final == 'no':
    y_test = pd.Series(y_test)
    y_test = y_test.values.reshape(-1, 1)
    y_test = scaler2.transform(y_test).flatten()
    test_scaled = pd.DataFrame(scaler.transform(data_scaled_test), columns=data_frame.columns)
    # SOS_test_scaled = test_scaled.mul(test_scaled["SOS_Use_"], axis=0)
    # SOS_test_scaled = SOS_test_scaled.add_suffix('tim_SOS_Use')
    # test_scaled = pd.concat([test_scaled, SOS_test_scaled], axis=1)

    # prev_test_scaled = test_scaled
    # most_corr_test = prev_test_scaled.mul(test_scaled[most_corr], axis=0)
    # most_corr_test = most_corr_test.add_suffix("*")
    # most_corr_test = most_corr_test.add_suffix(most_corr)
    # test_scaled = pd.concat([test_scaled, most_corr_test], axis=1)
    test_scaled_correlated = test_scaled.loc[:, train_scaled_correlated.columns] #Test data with only correlated inputs
  else:
    test_scaled_correlated = data_scaled_test

  train_scaled_correlated, test_scaled_correlated = corr_matrix_reduce(train_scaled_correlated, test_scaled_correlated)

  return train_scaled_correlated, test_scaled_correlated, scaler, scaler2, y_train, y_test, correlations_df

In [20]:
def reduce_and_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, reduction_choice, scaler_target, is_final_model, model_choice):
  """Optionally reduce the inputs with PLS/PCA, then tune and score one model.

  Args:
    X_tr_reduced: Training inputs.
    X_te_reduced: Test inputs (may be empty).
    Y_tr: Scaled training target.
    Y_te: Scaled test target (may be empty).
    reduction_choice: ``'PLS'`` or ``'PCA'`` to reduce first; any other value
      (the notebook passes ``''`` and ``'none'``) skips the reduction.
    scaler_target: Fitted target scaler, forwarded to the model function.
    is_final_model: ``'yes'``/``'no'`` flag forwarded to ``reduce_df``.
    model_choice: ``'Ridge'``, ``'Lasso'``, ``'Elastic'``, ``'RF'`` or ``'XG'``.

  Returns:
    The result tuple of the chosen model function.

  Raises:
    ValueError: If ``model_choice`` is not supported. Previously this failed
      at the final ``return`` on an unbound local variable.
  """
  supported_models = ('Ridge', 'Lasso', 'Elastic', 'RF', 'XG')
  if model_choice not in supported_models:
    raise ValueError(f'Unknown model_choice {model_choice!r}; expected one of {supported_models}')

  if reduction_choice in ('PLS', 'PCA'):
    # The fitted reducer is returned as well but is not needed here.
    X_tr_reduced, X_te_reduced, _ = reduce_df(X_tr_reduced, X_te_reduced, Y_tr, reduction_choice, is_final_model)

  if model_choice in ('Ridge', 'Lasso', 'Elastic'):
    model = RLE_Model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, model_choice, X_te_reduced, scaler_target)
  elif model_choice == 'RF':
    model = RF_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, X_te_reduced, scaler_target)
  else:  # 'XG'
    # NOTE(review): XG_model is not defined in the visible part of this
    # notebook; this branch needs it to exist before it can run.
    if not isinstance(X_tr_reduced, pd.DataFrame):
        X_tr_reduced = pd.DataFrame(X_tr_reduced)
    if not isinstance(X_te_reduced, pd.DataFrame):
        X_te_reduced = pd.DataFrame(X_te_reduced)
    # Replace the characters '[', ']' and '<' in the column names.
    X_tr_reduced.columns = X_tr_reduced.columns.astype(str).str.replace(r'[\[\]<]', 'under', regex=True)
    X_te_reduced.columns = X_te_reduced.columns.astype(str).str.replace(r'[\[\]<]', 'under', regex=True)
    model = XG_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, X_te_reduced, scaler_target)

  return model

In [21]:
from joblib import Parallel, delayed
from sklearn.model_selection import LeaveOneOut
# Define the function that processes each fold of LOO-CV and can make final model
def process_fold(train_index, test_index, X, y, reduce_choice, corr_thresh, scaling_choice, modeling_choice):
  """Run one leave-one-out fold: select features, then tune and score a model.

  Args:
    train_index: Positional indices of the training rows.
    test_index: Sequence whose first element is the held-out row's position.
    X: All candidate inputs.
    y: Target Series aligned with ``X``.
    reduce_choice: Reduction forwarded to ``reduce_and_model``.
    corr_thresh: Correlation threshold forwarded to ``get_inputs``.
    scaling_choice: ``"MMS"`` for min-max scaling; anything else selects the
      standard scaler.
    modeling_choice: Model name forwarded to ``reduce_and_model``.

  Returns:
    The result tuple of ``reduce_and_model`` for this fold, or ``None`` when
    no feature survives the correlation filter.
  """
  scaler_name = "MMS" if scaling_choice == "MMS" else ""
  X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], scaler_name, corr_thresh, 'no')

  # Keep the columns in X_train order. Going through a set made the order
  # depend on string hashing, which varies between Python processes and can
  # change how ties are ordered by the sort below.
  common_columns = [col for col in X_train.columns if col in correlations_df.columns]
  if not common_columns:
      # Handle the case where there are no correlated columns
      return None

  # Stack the score row under the data, then rank features by |score| and
  # keep at most the 100 strongest.
  X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
  X_train_with_corrs = X_train_with_corrs.transpose()
  X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
  X_train_with_corrs = X_train_with_corrs.head(100)
  X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
  X_train_reduced = X_train_reduced.transpose()
  X_train_reduced.reset_index(drop = True, inplace = True)
  # Give the held-out row the same columns, in the same order.
  X_test_reduced = pd.DataFrame(X_test, columns=X_train_reduced.columns)

  model = reduce_and_model(X_train_reduced, X_test_reduced, Y_train, Y_test, reduce_choice, scalerY, 'no', modeling_choice)

  return model #Return the model results for each fold

# **Test 1 Fold**

In [None]:
# Single-fold smoke test: train on every row except the last and hold the
# last row out.
X = inputs
y = playoffs['PTS_Finals/GM']
len_df = len(X)
train_index = list(range(len_df-1))
test_index = list(range((len_df-1), len_df))
X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], "MMS", 0.46, 'no')
print(len(X_test.columns))
# Keep X_train's column order; building the list from a set made the order
# depend on string hashing and therefore differ between kernel sessions.
common_columns = [col for col in X_train.columns if col in correlations_df.columns]
# Rank features by |score| and keep at most the 100 strongest.
X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
X_train_with_corrs = X_train_with_corrs.transpose()
X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
X_train_with_corrs = X_train_with_corrs.head(100)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
X_train_reduced = X_train_reduced.transpose()
X_train_reduced.reset_index(drop = True, inplace = True)
X_test_reduced = pd.DataFrame(X_test, columns=X_train_reduced.columns)
print(len(X_test_reduced.columns))

# model = reduce_and_model(X_train_reduced, X_test_reduced, Y_train, Y_test, 'none', scalerY, 'no', 'Ridge')
# print(model[3]) #test error

In [None]:
# Fit and score a Ridge model on the single fold prepared above, without any
# PLS/PCA reduction.
model = reduce_and_model(
    X_train_reduced,
    X_test_reduced,
    Y_train,
    Y_test,
    reduction_choice='none',
    scaler_target=scalerY,
    is_final_model='no',
    model_choice='Ridge',
)
# Entry 3 of the result is the absolute test error in original units.
print(model[3])

# **Run LOO-CV**

*   Each scaler for corr, PLS; PCA
*   adjust tuning, only take linear corr columns, mult by top 3 most corr; run longer

In [23]:
X = inputs
y = playoffs['PTS_Finals/GM']
loo = LeaveOneOut()
results = Parallel(n_jobs=-1)(delayed(process_fold)(train_idx, test_idx, X, y, '', .46, 'MMS', 'Ridge') #
                              for train_idx, test_idx in loo.split(X))

# process_fold returns None when a fold keeps no correlated feature. Stop with
# a clear message instead of failing on `None[0]` in the loop below.
skipped_folds = [i for i, fold in enumerate(results) if fold is None]
if skipped_folds:
  raise RuntimeError(f'process_fold returned no result for fold(s) {skipped_folds}; try a lower correlation threshold')

train_NRMSE_scores = 0
train_NMAE_scores = 0
test_RMSE_num_error = 0
train_error = 0
test_error = 0
len_df = len(X)
y_test_preds = []
range_target = y.max() - y.min()
# Each fold result is (train NRMSE, train NMAE, train abs errors,
# test abs error, test prediction, train predictions).
for i in range(len_df):
  train_NRMSE_scores = results[i][0] + train_NRMSE_scores
  train_NMAE_scores = results[i][1] + train_NMAE_scores
  train_error = results[i][2] + train_error #inv transformed
  test_error = results[i][3] + test_error #inv transformed
  test_RMSE_num_error = results[i][3]**2 + test_RMSE_num_error
  y_test_preds.append(results[i][4]) #inv transformed

# The test metrics pool the single held-out error of every fold.
test_MAE = test_error/len_df
test_RMSE = math.sqrt(test_RMSE_num_error[0]/len_df)
test_NRMSE = test_RMSE/range_target
test_NMAE = test_MAE/range_target
range_preds = max(y_test_preds) - min(y_test_preds)

print(f'AVG training Normalized RMSE: {(train_NRMSE_scores/len_df):.2f}')
print(f'AVG training Normalized MAE: {(train_NMAE_scores/len_df):.2f}')
print(f'AVG of avg inv transformed train error from folds: {(np.mean(train_error)/len_df):.1f}')
print(f'Test Normalized RMSE: {test_NRMSE:.2f}')
print(f'Test Normalized MAE: {test_NMAE.item():.2f}')
print(f'AVG inv transformed test error: {(test_error.item()/len_df):.1f}')
print(f'Range of predictions (inv transformed): {(range_preds.item()):.1f}') #make sure not predicting same value for all preds
print('Test Predictions (inv transformed):')
for value in y_test_preds:
    print(f'{value.item():.0f}')

AVG training Normalized RMSE: 0.09
AVG training Normalized MAE: 0.07
AVG of avg inv transformed train error from folds: 3.0
Test Normalized RMSE: 0.17
Test Normalized MAE: 0.13
AVG inv transformed test error: 5.5
Range of predictions (inv transformed): 20.7
Test Predictions (inv transformed):
109
107
101
102
101
102
105
110
106
104
107
98
101
106
102
100
98
98
97
102
95
93
96
96
93
91
91
93
89
92


In [None]:
# Spread of the true target, then the signed residual (prediction - actual)
# of every leave-one-out fold.
print(f"{range_target:.1f}")
y_test_preds_flat = np.array([fold_pred.item() for fold_pred in y_test_preds])  # one scalar per fold
y_flat = np.array([actual.item() for actual in y.values])
result = np.subtract(y_test_preds_flat, y_flat)
print(result)

# **PPG_Finals**

**Ridge MMS, .46, alpha - 1:25, TPU, max of 1000 runs and 20 seconds**

*   AVG training Normalized RMSE: 0.09
*   AVG training Normalized MAE: 0.07
*   AVG of avg inv transformed train error from folds: 3.0
*   Test Normalized RMSE: 0.17
*   Test Normalized MAE: 0.13
*   AVG inv transformed test error: 5.5
*   Range of predictions (inv transformed): 20.7

**Ridge, .46, alpha - 7:20, PCA, max of 10 seconds and 200 runs**
*   AVG training Normalized RMSE: 0.11
*   AVG training Normalized MAE: 0.09
*   AVG of avg inv transformed train error from folds: 3.6
*   Test Normalized RMSE: 0.17
*   Test Normalized MAE: 0.13
*   AVG inv transformed test error: 5.5
*   Range of predictions (inv transformed): 19.0

**Elastic, MMS, .46**

*   AVG training Normalized RMSE: 0.08
*   AVG training Normalized MAE: 0.06
*   AVG of avg inv transformed train error from folds: 2.6
*   Test Normalized RMSE: 0.18
*   Test Normalized MAE: 0.14
*   AVG inv transformed test error: 5.9
*   Range of predictions (inv transformed): 24.7

# **Final Model**

Ridge, .46, MMS, alpha - 1:25, TPU, max of 1000 runs and 20 seconds
*   Best trial: Params: {'alpha': 9.028239456268706, 'solver': 'sag'}
*   Normalized RMSE: 0.093
*   Normalized MAE: 0.075
*   avg inv transformed accuracy: 3.1
*   Range of predictions (inv transformed): 26.8

In [25]:
# Final model: train on every row (no held-out sample).
X = inputs
y = playoffs['PTS_Finals/GM']
len_df = len(X)
train_index = list(range(len_df))
test_index = list(range(1))  # placeholder; get_inputs ignores it when if_final == 'yes'

X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], "MMS", 0.46, 'yes')

# Keep X_train's column order; building the list from a set made the order
# depend on string hashing and therefore differ between kernel sessions.
common_columns = [col for col in X_train.columns if col in correlations_df.columns]
# Rank features by |score| and keep at most the 100 strongest.
X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
X_train_with_corrs = X_train_with_corrs.transpose()
X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
X_train_with_corrs = X_train_with_corrs.head(100)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
X_train_reduced = X_train_reduced.transpose()
X_train_reduced.reset_index(drop = True, inplace = True)

model = reduce_and_model(X_train_reduced, X_test, Y_train, Y_test, '', scalerY, 'yes', 'Ridge')

# model[5] holds the training predictions in original units.
range_preds =  model[5].max() - model[5].min() #inv transformed
print(f'Normalized RMSE: {(model[0]):.3f}')
print(f'Normalized MAE: {(model[1]):.3f}')
print(f'avg inv transformed accuracy: {(np.mean(model[2])):.1f}')
print(f'Range of predictions (inv transformed): {(range_preds):.1f}') #make sure not predicting same value for all preds
print('inv transformed predictions:')
for value in model[5]:
    print(f'{value:.2f}')

  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
[I 2025-06-04 14:54:56,374] A new study created in memory with name: no-name-db4574d6-297f-498c-a9bd-fe7c48b99fbe
[I 2025-06-04 14:54:56,432] Trial 0 finished with value: 0.12635620955697266 and parameters: {'alpha': 7.361084358818593, 'solver': 'sag'}. Best is trial 0 with value: 0.12635620955697266.
[I 2025-06-04 14:54:56,476] Trial 1 finished with value: 0.12630004124082056 and parameters: {'alpha': 7.679485769896476, 'solver': 'svd'}. Best is trial 1 with value: 0.12630004124082056.
[I 2025-06-04 14:54:56,518] Trial 2 finished with value: 0.12753067110055014 and parameters: {'alpha': 4.895475044037765, 'solver': 'svd'}. Best is trial 1 with value: 0.12630004124082056.
[I 2025-06-04 14:54:56,561] Trial 3 finished with value: 0.126982672601403 and parameters: {'alpha': 5.669205991345835, 'solver': 'svd'}. Best is trial 1 with value: 0.12630004124082056.


Original features: 449
Features to drop: 318
Reduced features: 131


[I 2025-06-04 14:54:56,604] Trial 4 finished with value: 0.13595148596365497 and parameters: {'alpha': 1.3508775942285935, 'solver': 'svd'}. Best is trial 1 with value: 0.12630004124082056.
[I 2025-06-04 14:54:56,667] Trial 5 finished with value: 0.13247951217742307 and parameters: {'alpha': 2.159184696844063, 'solver': 'saga'}. Best is trial 1 with value: 0.12630004124082056.
[I 2025-06-04 14:54:56,709] Trial 6 finished with value: 0.1267466972209096 and parameters: {'alpha': 6.130058446059465, 'solver': 'sparse_cg'}. Best is trial 1 with value: 0.12630004124082056.
[I 2025-06-04 14:54:56,752] Trial 7 finished with value: 0.12678436296719758 and parameters: {'alpha': 6.0484481359552955, 'solver': 'cholesky'}. Best is trial 1 with value: 0.12630004124082056.
[I 2025-06-04 14:54:56,800] Trial 8 finished with value: 0.12628039121916093 and parameters: {'alpha': 10.38436657274186, 'solver': 'sag'}. Best is trial 8 with value: 0.12628039121916093.
[I 2025-06-04 14:54:56,845] Trial 9 finish

Best trial:
  Params: {'alpha': 9.028239456268706, 'solver': 'sag'}
Normalized RMSE: 0.093
Normalized MAE: 0.075
avg inv transformed accuracy: 3.1
Range of predictions (inv transformed): 26.8
inv transformed predictions:
105.77
103.11
104.57
99.33
102.99
101.53
107.76
108.76
110.63
103.61
113.16
105.77
100.84
103.33
101.68
97.69
102.00
95.15
96.51
99.96
94.24
91.14
93.16
91.48
97.00
90.16
95.65
93.00
87.68
86.37


In [None]:
# Rebuild the final model's selected training inputs (all rows, no test row).
X = inputs
y = playoffs['PTS_Finals/GM']
len_df = len(X)
train_index = list(range(len_df))
test_index = list(range(1))  # placeholder; get_inputs ignores it when if_final == 'yes'

X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], "MMS", 0.46, 'yes')

# Keep X_train's column order; building the list from a set made the order
# depend on string hashing and therefore differ between kernel sessions.
common_columns = [col for col in X_train.columns if col in correlations_df.columns]
# Rank features by |score| and keep at most the 100 strongest.
X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
X_train_with_corrs = X_train_with_corrs.transpose()
X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
X_train_with_corrs = X_train_with_corrs.head(100)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
X_train_reduced = X_train_reduced.transpose()
X_train_reduced.reset_index(drop = True, inplace = True)

In [None]:
# Write only the feature columns. With the default index=True the file gains
# an unnamed index column, which read_csv loads as 'Unnamed: 0' and which the
# prediction cell's `X[PPG_predict.columns]` lookup cannot find in X.
# NOTE(review): the prediction cell appears to expect raw values for the teams
# to predict in this file - confirm how it is edited after export.
X_train_reduced.to_csv('PPG_Finals_Inputs.csv', index=False)

In [None]:
# import scipy.stats
# X = inputs
# y = playoff_stats['Pts/GM_Playoffs']#Pts/GM_Playoffs, oPts/GM_Playoffs #
# col = ''
# print(X.loc[:, col].corr(y))
# print(scipy.stats.spearmanr(X.loc[:,col], y)[0])

# **PPG Prediction**

In [None]:
# Rebuild the final model's selected training inputs so the prediction cells
# below can reuse `X_train_reduced.columns`.
X = inputs
y = playoffs['PTS_Finals/GM']
len_df = len(X)
train_index = list(range(len_df))
test_index = list(range(1))  # placeholder; get_inputs ignores it when if_final == 'yes'

X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], "MMS", 0.46, 'yes')

# Keep X_train's column order; building the list from a set made the order
# depend on string hashing and therefore differ between kernel sessions.
common_columns = [col for col in X_train.columns if col in correlations_df.columns]
# Rank features by |score| and keep at most the 100 strongest.
X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
X_train_with_corrs = X_train_with_corrs.transpose()
X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
X_train_with_corrs = X_train_with_corrs.head(100)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
X_train_reduced = X_train_reduced.transpose()
X_train_reduced.reset_index(drop = True, inplace = True)

In [38]:
X = inputs
y = playoffs['PTS_Finals/GM']

# Inputs to predict for: repair the mis-decoded non-breaking space in the
# column names and keep the first two rows.
PPG_predict = pd.read_csv('PPG_Finals_Inputs.csv')
PPG_predict.columns = PPG_predict.columns.str.replace('Â\xa0', '\xa0')
PPG_predict = PPG_predict.iloc[:2, :]

# Fit the target scaler on the full target column.
target_scaler = MinMaxScaler()
y_train = pd.Series(
    target_scaler.fit_transform(y.values.reshape(-1, 1)).flatten()
)

# Fit the input scaler on the same columns of the training data, then apply
# it to the rows to predict.
PPG_scaler = MinMaxScaler()
trained_features_to_scale = X[PPG_predict.columns]
trained_features_scaled = pd.DataFrame(
    PPG_scaler.fit_transform(trained_features_to_scale),
    columns=trained_features_to_scale.columns,
)

PPG_predict_scaled = pd.DataFrame(
    PPG_scaler.transform(PPG_predict),
    columns=PPG_predict.columns,
)

In [39]:
import joblib
# Load the saved Ridge model (a pickle written by this notebook; do not load
# pickles from untrusted sources).
PPG_model = joblib.load('PPG_Ridge_Finals.pkl')
# Keep only the model's features, in the order used for training.
PPG_predict_scaled = PPG_predict_scaled.loc[:, X_train_reduced.columns]
PPG_predictions = PPG_model.predict(PPG_predict_scaled)
# Cell output: predictions converted back to the target's original units.
target_scaler.inverse_transform(PPG_predictions.reshape(-1, 1))

array([[106.6995156 ],
       [106.42395715]])