<a href="https://colab.research.google.com/github/ConorD28/NBA-Research/blob/main/NBA_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import math
# Regular-season inputs: every column of the upload except the trailing 20.
df = pd.read_csv('NBA Upload.csv')
inputs = df.iloc[:, :-20]
# Playoff statistics; columns of this frame are used as regression targets below.
playoff_stats = pd.read_csv('NBA_playoffs.csv')

# Total number of missing cells in each frame
print(inputs.isna().sum().sum())
print(playoff_stats.isna().sum().sum())



# **Correlation/Scores**

In [None]:
import scipy.stats
def correlation(dataset, threshold, target): #Select columns by Pearson/Spearman correlation with the target
  """Keep the columns of `dataset` that correlate strongly with `target`.

  For every column both the Pearson and the Spearman correlation with
  `target` are computed. A column is kept when the absolute value of either
  one exceeds `threshold`; its score is the larger of the two absolute values.

  Args:
    dataset: DataFrame of candidate input columns.
    threshold: absolute correlation a column must exceed to be kept.
    target: Series to correlate against (a NumPy array is wrapped in a Series).

  Returns:
    A tuple `(selected, scores)`:
      selected: DataFrame with the kept columns, ordered by descending score,
        indexed like `dataset`.
      scores: one-row DataFrame (row label 'corrs') with the score of every
        kept column, in the same column order.
    When no column passes the threshold both elements are empty DataFrames,
    so callers can always unpack two values.
  """
  if isinstance(target, np.ndarray):
    target = pd.Series(target)

  data = []          # kept columns, one Series per column
  correlations = []  # score of each kept column, aligned with `data`
  for col in dataset.columns:
      column = dataset.loc[:, col]
      corS = column.corr(target, method='spearman') # 'kendall'
      corP = column.corr(target)
      # NaN correlations (e.g. constant columns) compare False and are skipped.
      if (abs(corP) > threshold) or (abs(corS) > threshold):
        data.append(column)
        correlations.append(max(abs(corP), abs(corS)))

  if len(data) == 0:
     # Same arity as the normal return path (was a lone DataFrame before).
     return pd.DataFrame(), pd.DataFrame()

  # One row per kept column; append the scores as a trailing 'corrs' column
  # so rows can be ordered by score, then flip back to column orientation.
  df = pd.DataFrame(data)
  df.insert(len(df.columns), 'corrs', correlations)
  df = df.sort_values(by=df.columns[-1], ascending=False, key = abs)
  df = df.transpose()
  df_corrs = df.iloc[-1:, :]      # the 'corrs' row
  df = df.drop(df.tail(1).index)  # everything except the 'corrs' row
  return df, df_corrs

In [None]:
from sklearn.model_selection import train_test_split
from numpy.random.mtrand import random_sample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
def Scores(y, y_pred):
  """Return (normalized RMSE, normalized MAE) of `y_pred` against `y`.

  Both errors are divided by the absolute range of `y` (max minus min),
  so `y` must provide `.max()` and `.min()`.
  """
  rmse = np.sqrt(mean_squared_error(y, y_pred))
  mae = mean_absolute_error(y, y_pred)
  spread = abs(y.max() - y.min())
  return rmse / spread, mae / spread

In [None]:
def Predict_Scores(model, X_tr, X_te, y_tr, y_te, t_sc):#, predict_df):
  """Predict with a fitted model and report errors in the target's original units.

  Args:
    model: fitted estimator exposing `predict`.
    X_tr, X_te: training / test features.
    y_tr: scaled training target; must expose `.values`.
    y_te: scaled test target supporting `reshape`; anything of length 0 means "no test data".
    t_sc: fitted target scaler, used for `inverse_transform`.

  Returns:
    (normalized train RMSE, normalized train MAE, absolute train errors,
     absolute test errors, test predictions, train predictions), with errors
    and predictions inverse-transformed. Without test data the two test
    entries are 0.
  """
  def to_original_units(scaled):
    # The scaler works on a 2-D column; hand back a flat 1-D array.
    return t_sc.inverse_transform(scaled.reshape(-1, 1)).flatten()

  train_pred_scaled = model.predict(X_tr)
  has_test = len(y_te) != 0
  test_pred_scaled = model.predict(X_te) if has_test else None

  # Training side
  y_train_pred_transformed = pd.Series(to_original_units(train_pred_scaled))
  y_tr_transformed = pd.Series(to_original_units(y_tr.values))
  inv_error_tr_transformed = np.abs(y_tr_transformed - y_train_pred_transformed)
  NRMSE_tr, NMAE_tr = Scores(y_tr_transformed, y_train_pred_transformed)

  # Test side (placeholders of 0 when there is nothing to test)
  if has_test:
    y_te_transformed = to_original_units(y_te)
    y_test_pred_transformed = to_original_units(test_pred_scaled)
    inv_error_test_transformed = np.abs(y_te_transformed - y_test_pred_transformed)
  else:
    y_test_pred_transformed = 0
    inv_error_test_transformed = 0

  return (NRMSE_tr, NMAE_tr, inv_error_tr_transformed, inv_error_test_transformed,
          y_test_pred_transformed, y_train_pred_transformed)

# **ML Tuning Algorithms**

In [None]:
!pip install optuna
import optuna
!pip install joblib
import joblib
from sklearn.model_selection import cross_val_score

In [None]:
def Ridge_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
    """Tune a Ridge regressor with Optuna and return it refit on all of X_train.

    Args:
        X_train, y_train: training features and target.
        cv_choice: `cv` argument forwarded to `cross_val_score`.
        num_trials: maximum number of Optuna trials.
        timeout_choice: Optuna time budget, passed as `timeout`.

    Returns:
        The Ridge model built from the best trial's parameters, fitted on X_train / y_train.
    """
    def cv_rmse(trial):
        # Mean cross-validated RMSE of one sampled (alpha, solver) pair.
        candidate = Ridge(
            alpha=trial.suggest_float("alpha", 5, 20, log=True),  # regularization strength
            solver=trial.suggest_categorical("solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
            random_state=28,
        )
        fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice, scoring="neg_root_mean_squared_error")
        return -fold_scores.mean()  # scorer is negated RMSE; flip the sign so lower is better

    study = optuna.create_study(direction="minimize")
    study.optimize(cv_rmse, n_trials=num_trials, timeout=timeout_choice)

    print("Best trial:")
    winning_params = study.best_trial.params
    print(f"  Params: {winning_params}")

    best_model = Ridge(**winning_params, random_state=28)
    best_model.fit(X_train, y_train)
    return best_model

In [None]:
def Lasso_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a Lasso regressor with Optuna and return it refit on all of X_train.

  Args:
    X_train, y_train: training features and target.
    cv_choice: `cv` argument forwarded to `cross_val_score`.
    num_trials: maximum number of Optuna trials.
    timeout_choice: Optuna time budget, passed as `timeout`.

  Returns:
    The Lasso model built from the best trial's parameters, fitted on X_train / y_train.
  """
  def cv_rmse(trial):
    # Mean cross-validated RMSE of one sampled (alpha, max_iter, tol) triple.
    candidate = Lasso(
        alpha=trial.suggest_float("alpha", 1e-2, 10, log=True),
        max_iter=trial.suggest_int("max_iter", 1000, 10000, step=100),
        tol=trial.suggest_float("tol", 1e-5, 1e-2, log=True),  # stopping tolerance
        random_state=28,
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice, scoring="neg_root_mean_squared_error")
    return -fold_scores.mean()  # scorer is negated RMSE; flip the sign so lower is better

  study = optuna.create_study(direction="minimize")
  study.optimize(cv_rmse, n_trials=num_trials, timeout=timeout_choice)

  print("Best trial:")
  winning_params = study.best_trial.params
  print(f"  Params: {winning_params}")

  best_model = Lasso(**winning_params, random_state=28)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
def Elastic_tune(X_train, y_train, cv_choice, num_trials, timeout_choice, save_path='EST DEF RTG_Elastic.pkl'):
  """Tune an ElasticNet with Optuna, refit it on all of X_train and optionally pickle it.

  Args:
    X_train, y_train: training features and target.
    cv_choice: `cv` argument forwarded to `cross_val_score`.
    num_trials: maximum number of Optuna trials.
    timeout_choice: Optuna time budget, passed as `timeout`.
    save_path: file the fitted model is written to with `joblib.dump`.
      Defaults to the path that used to be hard-coded; pass a
      target-specific path so models for different targets do not overwrite
      each other, or `None` to skip saving (e.g. inside cross-validation folds).

  Returns:
    The ElasticNet built from the best trial's parameters, fitted on X_train / y_train.
  """
  def Elastic_objective(trial, cv_runs, X_train, y_train):
    alpha = trial.suggest_float("alpha", 1e-2, 10, log=True)#1e-4, 10.0; Regularization strength
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)  # Mixing ratio between Lasso and Ridge
    max_iter = trial.suggest_int("max_iter", 1000, 10000, step=100)  # Max iterations
    tol = trial.suggest_float("tol", 1e-5, 1e-2, log=True)  # Tolerance for stopping criteria

    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter, tol=tol, random_state=28)
    score = cross_val_score(model, X_train, y_train, cv=cv_runs, scoring="neg_root_mean_squared_error").mean()
    return -score  # scorer is negated RMSE, so this is the mean CV RMSE to minimize

  study = optuna.create_study(direction="minimize")
  study.optimize(lambda trial: Elastic_objective(trial, cv_choice, X_train, y_train), n_trials=num_trials, timeout=timeout_choice)

  print("Best trial:")
  trial = study.best_trial
  print(f"  Params: {trial.params}")

  best_model = ElasticNet(**trial.params, random_state=28)
  best_model.fit(X_train, y_train)
  if save_path is not None:
    joblib.dump(best_model, save_path)
  return best_model

In [None]:
def RF_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a RandomForestRegressor with Optuna and return it refit on all of X_train.

  Args:
    X_train, y_train: training features and target.
    cv_choice: `cv` argument forwarded to `cross_val_score`.
    num_trials: maximum number of Optuna trials.
    timeout_choice: Optuna time budget, passed as `timeout`.

  Returns:
    The forest built from the best trial's parameters, fitted on X_train / y_train.
  """
  def cv_rmse(trial):
    # Sample one forest configuration and score it by mean cross-validated RMSE.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 45, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 45),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }
    forest = RandomForestRegressor(**sampled, random_state=28, n_jobs=-1)
    fold_scores = cross_val_score(forest, X_train, y_train, cv=cv_choice, scoring='neg_root_mean_squared_error', n_jobs=-1)
    return -np.mean(fold_scores)  # scorer is negated RMSE; flip the sign so lower is better

  study = optuna.create_study(direction='minimize')
  study.optimize(cv_rmse, n_trials=num_trials, timeout=timeout_choice)
  print("Best parameters:", study.best_params)

  # best_params holds exactly the five values sampled above.
  best_model = RandomForestRegressor(**study.best_params, random_state=28, n_jobs=-1)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
def XG_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune an XGBoost regressor with Optuna and return it refit on all of X_train.

  Args:
    X_train, y_train: training features and target.
    cv_choice: `cv` argument forwarded to `cross_val_score`.
    num_trials: maximum number of Optuna trials.
    timeout_choice: Optuna time budget, passed as `timeout`.

  Returns:
    An `xgb.XGBRegressor` built from the best trial's parameters
    (histogram tree method, random_state=28), fitted on X_train / y_train.
  """
  def cv_rmse(trial):
    # Fixed settings first, then the sampled hyper-parameters.
    search_space = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', .5, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', .5, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'eta': trial.suggest_float('eta', 1e-3, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 30, 100),
    }
    candidate = xgb.XGBRegressor(**search_space, tree_method = 'hist')
    fold_scores = cross_val_score(candidate, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv_choice)
    return -np.mean(fold_scores)  # scorer is negated RMSE; flip the sign so lower is better

  study = optuna.create_study(direction='minimize')
  study.optimize(cv_rmse, n_trials=num_trials, timeout=timeout_choice)

  # Sampled parameters plus the fixed objective (verbosity is not carried over).
  tuned_params = dict(study.best_params, objective='reg:squarederror')

  final_model = xgb.XGBRegressor(**tuned_params, tree_method = 'hist', random_state=28)
  final_model.fit(X_train, y_train)
  return final_model

# **ML Algorithms**

In [None]:
def RLE_Model(xTrain, xTest, yTrain, yTest, choice, predict_df, tar_sca): #Function to run Ridge, Lasso, or ElasticNet model
  """Tune the requested linear model and score it on the train / test data.

  Args:
    xTrain, xTest: training / test features.
    yTrain, yTest: scaled training / test target.
    choice: "Ridge", "Lasso" or "Elastic".
    predict_df: unused; kept so existing callers keep working.
    tar_sca: fitted target scaler forwarded to `Predict_Scores`.

  Returns:
    The tuple produced by `Predict_Scores` for the tuned model.

  Raises:
    ValueError: if `choice` is not one of the three supported names
      (previously this surfaced as an UnboundLocalError on `pipeline`).
  """
  # Arguments after the data are (cv folds, max Optuna trials, timeout).
  if choice == "Ridge":
    pipeline = Ridge_tune(xTrain, yTrain, 10, 200, 15)
  elif choice == "Lasso":
    pipeline = Lasso_tune(xTrain, yTrain, 10, 200, 15)
  elif choice == "Elastic":
    pipeline = Elastic_tune(xTrain, yTrain, 10, 2000, 20)
  else:
    raise ValueError(f"Unknown model choice {choice!r}; expected 'Ridge', 'Lasso' or 'Elastic'")

  modelResults = Predict_Scores(pipeline, xTrain, xTest, yTrain, yTest, tar_sca)
  return modelResults

In [None]:
from sklearn.ensemble import RandomForestRegressor
def RF_model(xTrain, xTest, yTrain, yTest, predict_df, tar_sca):
  """Tune a random forest (5 CV folds, up to 200 trials, timeout 10) and score it.

  `predict_df` is accepted but not used. Returns the `Predict_Scores` tuple.
  """
  tuned_forest = RF_tune(xTrain, yTrain, 5, 200, 10)
  return Predict_Scores(tuned_forest, xTrain, xTest, yTrain, yTest, tar_sca)

In [None]:
import xgboost as xgb
def XG_model(xTrain, xTest, yTrain, yTest, predict_df, tar_sca):
  """Tune an XGBoost regressor (5 CV folds, up to 500 trials, timeout 10) and score it.

  `predict_df` is accepted but not used. Returns the `Predict_Scores` tuple.
  """
  tuned_booster = XG_tune(xTrain, yTrain, 5, 500, 10)
  return Predict_Scores(tuned_booster, xTrain, xTest, yTrain, yTest, tar_sca)

# **Inputs/LOOCV Function**

In [None]:
from joblib import Parallel, delayed
def corr_matrix_reduce(x_train, x_test, threshold=0.9):
  """Drop training features that are highly correlated with an earlier feature.

  The Pearson correlation matrix of `x_train` is computed with
  `DataFrame.corr()` (previously one `Series.corr` call per pair, dispatched
  through a joblib process pool). For every pair of columns (i, j) with
  i before j and |corr| above `threshold`, column j is dropped. A column is
  dropped even when the earlier column of the pair is itself dropped; that
  rule is kept unchanged.

  Args:
    x_train: training features; the correlations are computed on it only.
    x_test: test features; the same columns are dropped unless it is empty.
    threshold: absolute correlation above which the later column of a pair
      is dropped. Defaults to 0.9, the value that used to be hard-coded.

  Returns:
    (x_train without the dropped columns, x_test without the dropped columns).
  """
  correlation_matrix = x_train.corr()

  # Upper triangle (k=1 excludes the diagonal): entry [i, j] pairs column j
  # with an earlier column i. NaN correlations compare False and are kept.
  too_similar = np.triu(np.abs(correlation_matrix.to_numpy()) > threshold, k=1)
  features_to_drop = list(correlation_matrix.columns[too_similar.any(axis=0)])

  x_train_reduced = x_train.drop(columns=features_to_drop)
  if not x_test.empty:
    x_test = x_test.drop(columns=features_to_drop)

  print("Original features:", x_train.shape[1])
  print("Features to drop:", len(features_to_drop))
  print("Reduced features:", x_train_reduced.shape[1])
  return x_train_reduced, x_test

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
def reduce_df(x_tr, x_te, y_tr, reduction_choice, if_final):
  """Project the features onto three PLS or PCA components.

  Args:
    x_tr, x_te: training / test features.
    y_tr: training target (used by PLS only).
    reduction_choice: "PLS" or "PCA"; any other value falls through and returns None.
    if_final: for PCA, "no" means `x_te` is transformed; otherwise `x_te` is passed back untouched.

  Returns:
    (reduced train, reduced test, fitted reducer). PLS yields DataFrames with
    columns PLS1..PLS3; PCA yields whatever `fit_transform` / `transform` return.
  """
  if reduction_choice == "PLS":
    component_names = ["PLS1", "PLS2", "PLS3"]
    pls = PLSRegression(n_components=3)
    train_scores = pls.fit_transform(x_tr, y_tr)[0]  # [0] = transformed X
    # An empty test set gets an empty frame with the component columns.
    test_scores = pd.DataFrame(columns=component_names) if x_te.empty else pls.transform(x_te)
    return (pd.DataFrame(train_scores, columns=component_names),
            pd.DataFrame(test_scores, columns=component_names),
            pls)

  if reduction_choice == "PCA":
    pca = PCA(n_components=3, random_state=28)
    train_components = pca.fit_transform(x_tr)
    test_components = pca.transform(x_te) if if_final == "no" else x_te
    print("Mean:", pca.mean_)
    return train_components, test_components, pca

In [None]:
def get_inputs(data_frame, y, tr_index, te_index, scaler_choice, thresh, if_final):
  """Scale features and target, add SOS interaction features and select correlated inputs.

  Args:
    data_frame: full feature table; must contain a "SOS_Use" column.
    y: target values aligned with `data_frame` (anything with `.values` and `.iloc`).
    tr_index: positional row indices used for training (ignored when if_final == 'yes').
    te_index: single positional row index held out for testing (ignored when if_final == 'yes').
    scaler_choice: "MMS" selects MinMaxScaler; any other value selects StandardScaler.
    thresh: correlation threshold forwarded to `correlation`.
    if_final: 'yes' trains on every row with no test row; 'no' splits by the given indices.

  Returns:
    (train features, test features, feature scaler, target scaler,
     scaled y_train, scaled y_test, correlations_df). `correlations_df` comes
    from the correlation filter, i.e. it is taken before `corr_matrix_reduce`
    drops further columns from the train features.
  """
  # Separate scaler instances for the features (scaler) and the target (scaler2).
  if scaler_choice == "MMS":
    scaler = MinMaxScaler()
    scaler2 = MinMaxScaler()
  else:
    scaler = StandardScaler()
    scaler2 = StandardScaler()

  if if_final == 'yes':
    # Final model: all rows are training data; test placeholders are empty frames.
    data_scaled_train, data_scaled_test, y_train, y_test = data_frame, pd.DataFrame(), y, pd.DataFrame()
  else:
    # [[te_index]] keeps the held-out row as a one-row DataFrame.
    data_scaled_train, data_scaled_test, y_train, y_test = data_frame.iloc[tr_index], data_frame.iloc[[te_index]], y.iloc[tr_index], y.iloc[te_index]

  # Fit the feature scaler on the training rows only.
  train_scaled = pd.DataFrame(scaler.fit_transform(data_scaled_train), columns = data_frame.columns)
  # Interaction features: every scaled column times the scaled "SOS_Use" column.
  SOS_train_scaled = train_scaled.mul(train_scaled["SOS_Use"], axis=0)
  SOS_train_scaled = SOS_train_scaled.add_suffix('tim_SOS_Use')
  train_scaled = pd.concat([train_scaled, SOS_train_scaled], axis=1)
  # Fit the target scaler on the training target.
  y_train = pd.Series(scaler2.fit_transform(y_train.values.reshape(-1, 1)).flatten())
  # Keep only the columns whose correlation with the scaled target exceeds thresh.
  train_scaled_correlated, correlations_df = correlation(train_scaled, thresh, y_train) #
  train_scaled_correlated = pd.DataFrame(train_scaled_correlated)

  # Disabled experiment: interaction features with the two most correlated columns.
  # prev_train_scaled_correlated = train_scaled_correlated
  # most_corr = correlations_df.columns[0]
  # second_most_corr = correlations_df.columns[1]
  # most_corr_train = prev_train_scaled_correlated.mul(train_scaled_correlated[most_corr], axis=0)
  # most_corr_train = most_corr_train.add_suffix("*")
  # most_corr_train = most_corr_train.add_suffix(most_corr)
  # train_scaled_correlated = pd.concat([train_scaled_correlated, most_corr_train], axis=1)

  # second_most_corr_train = prev_train_scaled_correlated.mul(train_scaled_correlated[second_most_corr], axis=0)
  # second_most_corr_train = second_most_corr_train.add_suffix("*")
  # second_most_corr_train = second_most_corr_train.add_suffix(second_most_corr)
  # train_scaled_correlated = pd.concat([train_scaled_correlated, second_most_corr_train], axis=1)
  # Second pass of the same filter over the already-selected columns.
  train_scaled_correlated, correlations_df = correlation(train_scaled_correlated, thresh, y_train)

  if if_final == 'no':
    # Apply the scalers fitted on the training data (no refit) to the held-out row.
    y_test = pd.Series(y_test)
    y_test = y_test.values.reshape(-1, 1)
    y_test = scaler2.transform(y_test).flatten()
    test_scaled = pd.DataFrame(scaler.transform(data_scaled_test), columns=data_frame.columns)
    # Same SOS interaction features as on the training side.
    SOS_test_scaled = test_scaled.mul(test_scaled["SOS_Use"], axis=0)
    SOS_test_scaled = SOS_test_scaled.add_suffix('tim_SOS_Use')
    test_scaled = pd.concat([test_scaled, SOS_test_scaled], axis=1)

    # Disabled experiment: test-side counterpart of the block above.
    # prev_test_scaled = test_scaled
    # most_corr_test = prev_test_scaled.mul(test_scaled[most_corr], axis=0)
    # most_corr_test = most_corr_test.add_suffix("*")
    # most_corr_test = most_corr_test.add_suffix(most_corr)
    # test_scaled = pd.concat([test_scaled, most_corr_test], axis=1)

    # second_most_corr_test = prev_test_scaled.mul(test_scaled[second_most_corr], axis=0)
    # second_most_corr_test = second_most_corr_test.add_suffix("*")
    # second_most_corr_test = second_most_corr_test.add_suffix(second_most_corr)
    # test_scaled = pd.concat([test_scaled, second_most_corr_test], axis=1)

    test_scaled_correlated = test_scaled.loc[:, train_scaled_correlated.columns] #Test data with only correlated inputs
  else:
    # Final model: the empty placeholder frame is passed through.
    test_scaled_correlated = data_scaled_test

  # Remove features that are highly correlated with each other.
  train_scaled_correlated, test_scaled_correlated = corr_matrix_reduce(train_scaled_correlated, test_scaled_correlated)
  # Scores restricted to the surviving columns; computed but not returned.
  correlations_df2 = correlations_df.loc[:, train_scaled_correlated.columns]

  return train_scaled_correlated, test_scaled_correlated, scaler, scaler2, y_train, y_test, correlations_df#,correlations_df2

In [None]:
def reduce_and_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, reduction_choice, scaler_target, is_final_model, model_choice):
  """Optionally reduce dimensionality, then tune and score the chosen model.

  Args:
    X_tr_reduced, X_te_reduced: training / test features.
    Y_tr, Y_te: scaled training / test target.
    reduction_choice: 'PLS' or 'PCA' to run `reduce_df`; anything else skips reduction.
    scaler_target: fitted target scaler forwarded to the model helpers.
    is_final_model: 'yes' / 'no' flag forwarded to `reduce_df`.
    model_choice: 'Ridge', 'Lasso', 'Elastic', 'RF' or 'XG'.

  Returns:
    The `Predict_Scores` tuple returned by the selected model helper.

  Raises:
    ValueError: if `model_choice` is not supported (previously an
      UnboundLocalError raised only after the reduction step had run).
  """
  if model_choice not in ('Ridge', 'Lasso', 'Elastic', 'RF', 'XG'):
    raise ValueError(f"Unknown model_choice {model_choice!r}; expected 'Ridge', 'Lasso', 'Elastic', 'RF' or 'XG'")

  if reduction_choice in ('PLS', 'PCA'):
    # The fitted reducer (third element) is not needed here.
    X_tr_reduced, X_te_reduced, _ = reduce_df(X_tr_reduced, X_te_reduced, Y_tr, reduction_choice, is_final_model)

  if model_choice in ('Ridge', 'Lasso', 'Elastic'):
    return RLE_Model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, model_choice, X_te_reduced, scaler_target)
  if model_choice == 'RF':
    return RF_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, X_te_reduced, scaler_target)

  # XG: replace the characters '[', ']' and '<' in the column names.
  # Work on copies so the caller's frames keep their original column names.
  def _xgb_safe_columns(features):
    frame = features.copy() if isinstance(features, pd.DataFrame) else pd.DataFrame(features)
    frame.columns = frame.columns.astype(str).str.replace(r'[\[\]<]', 'under', regex=True)
    return frame

  X_tr_reduced = _xgb_safe_columns(X_tr_reduced)
  X_te_reduced = _xgb_safe_columns(X_te_reduced)
  return XG_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, X_te_reduced, scaler_target)

In [None]:
from joblib import Parallel, delayed
from sklearn.model_selection import LeaveOneOut
# Define the function that processes each fold of LOO-CV and can make final model
def process_fold(train_index, test_index, X, y, reduce_choice, corr_thresh, scaling_choice, modeling_choice):
  """Run one leave-one-out fold: build inputs, keep the top features, fit and score.

  Args:
    train_index, test_index: positional indices from the splitter; only test_index[0] is used.
    X, y: full feature table and target.
    reduce_choice: dimensionality reduction forwarded to `reduce_and_model`.
    corr_thresh: correlation threshold forwarded to `get_inputs`.
    scaling_choice: "MMS" for min-max scaling; any other value selects the alternative scaler.
    modeling_choice: model name forwarded to `reduce_and_model`.

  Returns:
    The result tuple of `reduce_and_model`, or None when no selected column has a correlation score.
  """
  scaler_flag = "MMS" if scaling_choice == "MMS" else ""
  X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
      X, y, train_index, test_index[0], scaler_flag, corr_thresh, 'no')

  common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
  if not common_columns:
      # Handle the case where there are no correlated columns
      return None

  # Stack the scores under the data, flip so each feature is a row, and keep
  # the 30 features with the largest absolute score.
  top_features = (
      pd.concat([X_train[common_columns], correlations_df[common_columns]])
      .transpose()
      .sort_values(by='corrs', ascending = False, key=abs)
      .head(30)
  )
  X_train_reduced = top_features.drop('corrs', axis = 1).transpose().reset_index(drop = True)
  X_test_reduced = pd.DataFrame(X_test, columns=X_train_reduced.columns)

  return reduce_and_model(X_train_reduced, X_test_reduced, Y_train, Y_test, reduce_choice, scalerY, 'no', modeling_choice)

# **Test 1 Fold**

In [None]:
# Single hold-out check: train on every row except the last, test on the last row.
X = inputs
# NOTE(review): an earlier version selected playoff_stats['EST. OFFRTG_Playoff'] by name;
# confirm which column iloc[:, -3:-2] picks.
y = playoff_stats.iloc[:, -3:-2]
len_df = len(X)
train_index = list(range(len_df - 1))
test_index = [len_df - 1]
X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
    X, y, train_index, test_index[0], "MMS", 0.53, 'no')
print(len(X_test.columns))

# Keep the 30 features with the largest absolute correlation score.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
X_train_with_corrs = (
    pd.concat([X_train[common_columns], correlations_df[common_columns]])
    .transpose()
    .sort_values(by='corrs', ascending = False, key=abs)
    .head(30)
)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1).transpose()  # drop the score column, back to samples x features
X_train_reduced.reset_index(drop = True, inplace = True)
X_test_reduced = pd.DataFrame(X_test, columns=X_train_reduced.columns)
print(len(X_test_reduced.columns))

#model = reduce_and_model(X_train_reduced, X_test_reduced, Y_train, Y_test, 'none', scalerY, 'no', 'Ridge')
#print(model[4]) #test error

# **Run LOO-CV**

*   TODO: try each scaler with the correlation filter, PLS, and PCA
*   TODO: adjust tuning; only take linearly correlated columns; multiply by the top 3 most correlated columns; run longer

In [None]:
# Leave-one-out cross-validation: one process_fold call per held-out row, run in parallel.
X = inputs
y = playoff_stats['EST. OFFRTG_Playoff']
loo = LeaveOneOut()
# Each entry of `results` is the tuple returned by process_fold:
# (train NRMSE, train NMAE, abs train errors, abs test error, test prediction, train predictions).
# NOTE(review): process_fold can return None (no correlated columns); the indexing below would then fail.
results = Parallel(n_jobs=-1)(delayed(process_fold)(train_idx, test_idx, X, y, 'PCA', .53, 'MMS', 'Elastic') #
                              for train_idx, test_idx in loo.split(X))
# Running totals over the folds
train_NRMSE_scores = 0
train_NMAE_scores = 0
test_RMSE_num_error = 0
train_error = 0
test_error = 0
len_df = len(X)
y_test_preds = []
range_target = y.max() - y.min()

for i in range(len_df):
  train_NRMSE_scores = results[i][0] + train_NRMSE_scores
  train_NMAE_scores = results[i][1] + train_NMAE_scores
  train_error = results[i][2] + train_error #inv transformed
  test_error = results[i][3] + test_error #inv transformed
  test_RMSE_num_error = results[i][3]**2 + test_RMSE_num_error # sum of squared test errors
  y_test_preds.append(results[i][4]) #inv transformed

# Test metrics pooled over the folds.
# The [0] and .item() calls below assume each fold's test error / prediction is a length-1 array.
test_MAE = test_error/len_df
test_RMSE = math.sqrt(test_RMSE_num_error[0]/len_df)
test_NRMSE = test_RMSE/range_target
test_NMAE = test_MAE/range_target
range_preds = max(y_test_preds) - min(y_test_preds)

print(f'AVG training Normalized RMSE: {(train_NRMSE_scores/len_df):.2f}')
print(f'AVG training Normalized MAE: {(train_NMAE_scores/len_df):.2f}')
print(f'AVG of avg inv transformed train error from folds: {(np.mean(train_error)/len_df):.1f}')
print(f'Test Normalized RMSE: {test_NRMSE:.2f}')
print(f'Test Normalized MAE: {test_NMAE.item():.2f}')
print(f'AVG inv transformed test error: {(test_error.item()/len_df):.1f}')
print(f'Range of predictions (inv transformed): {(range_preds.item()):.1f}') #make sure not predicting same value for all preds
print('Test Predictions (inv transformed):')
for value in y_test_preds:
    print(f'{value.item():.0f}')

In [None]:
# Range of the target, then the signed residual (prediction minus actual) of every fold.
print(f"{range_target:.1f}")
y_test_preds_flat = np.array([fold_pred.item() for fold_pred in y_test_preds])  # one scalar per fold
y_flat = np.array([actual.item() for actual in y.values])
result = y_test_preds_flat - y_flat
print(result)

# **Final Model**

In [None]:
# Final model: fit on every row. The index lists are placeholders, because
# get_inputs ignores them when if_final == 'yes'.
X = inputs
y = playoff_stats.iloc[:,-3:-2]
len_df = len(X)
train_index = list(range(len_df))
test_index = [0]

X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
    X, y, train_index, test_index[0], "MMS", 0.53, 'yes')

# Keep the 30 features with the largest absolute correlation score.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
X_train_with_corrs = (
    pd.concat([X_train[common_columns], correlations_df[common_columns]])
    .transpose()
    .sort_values(by='corrs', ascending = False, key=abs)
    .head(30)
)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1).transpose()  # drop the score column, back to samples x features
X_train_reduced.reset_index(drop = True, inplace = True)

# model = (NRMSE, NMAE, abs train errors, test errors, test predictions, train predictions)
model = reduce_and_model(X_train_reduced, X_test, Y_train, Y_test, 'PCA', scalerY, 'yes', 'Elastic')

range_preds = model[5].max() - model[5].min()  # inv transformed
print(f'Normalized RMSE: {(model[0]):.3f}')
print(f'Normalized MAE: {(model[1]):.3f}')
print(f'avg inv transformed accuracy: {(np.mean(model[2])):.1f}')
print(f'Range of predictions (inv transformed): {(range_preds):.1f}')  # a near-zero range means the model predicts a constant
print('inv transformed predictions:')
for value in model[5]:
    print(f'{value:.2f}')

In [None]:
# Persist the selected training features; with to_csv's defaults the row index is written as the first column.
X_train_reduced.to_csv('EST OFF RTG Inputs.csv')

In [None]:
# import scipy.stats
# X = inputs
# y = playoff_stats['Pts/GM_Playoffs']#Pts/GM_Playoffs, oPts/GM_Playoffs #
# col = ''
# print(X.loc[:, col].corr(y))
# print(scipy.stats.spearmanr(X.loc[:,col], y)[0])

# **EST OFF RTG**

**Elastic**
*  AVG training Normalized RMSE: 0.16
*  AVG training Normalized MAE: 0.13
*  AVG of avg inv transformed train error from folds: 2.3
*  Test Normalized RMSE: 0.20
*  Test Normalized MAE: 0.16
*  AVG inv transformed test error: 3.0
*  Range of predictions (inv transformed): 11.0

**Elastic Final**
*  Best trial:
  Params: {'alpha': 0.01362602124633981, 'l1_ratio': 0.9805500719803727, 'max_iter': 2600, 'tol': 5.89836448209172e-05}
*  Normalized RMSE: 0.156
*  Normalized MAE: 0.125
*  avg inv transformed accuracy: 2.3
*  Range of predictions (inv transformed): 11.1

In [None]:
# Rebuild the training features for the 'EST. OFFRTG_Playoff' target on every row
# (get_inputs ignores the index lists when if_final == 'yes').
X = inputs
y = playoff_stats['EST. OFFRTG_Playoff']
len_df = len(X)
train_index = list(range(len_df))
test_index = [0]

X_train, X_test, scaleroPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
    X, y, train_index, test_index[0], "MMS", 0.53, 'yes')

# Keep the 30 features with the largest absolute correlation score.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
X_train_with_corrs = (
    pd.concat([X_train[common_columns], correlations_df[common_columns]])
    .transpose()
    .sort_values(by='corrs', ascending = False, key=abs)
    .head(30)
)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1).transpose()  # drop the score column, back to samples x features
X_train_reduced.reset_index(drop = True, inplace = True)

In [None]:
# Load the rows to score and keep only the first 16 of them.
EST_OFF_RTG_predict = pd.read_csv('EST OFF RTG Inputs_ATL_MEM.csv')
EST_OFF_RTG_predict = EST_OFF_RTG_predict[0:16]
EST_OFF_RTG_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()
# Re-append the second column under a name containing a non-breaking space (\xa0), then drop
# the original second column. NOTE(review): presumably done so the name matches the
# corresponding column of X; confirm against the CSV header.
EST_OFF_RTG_predict['EST.\xa0OFFRTG_Best Player'] = EST_OFF_RTG_predict.iloc[:, 1:2]
EST_OFF_RTG_predict.drop(EST_OFF_RTG_predict.iloc[:, 1:2], axis = 1, inplace = True)
# Fit the feature scaler on the same columns of the training inputs X ...
trained_features_to_scale = X[EST_OFF_RTG_predict.columns]
trained_features_scaled = pd.DataFrame(EST_OFF_RTG_scaler.fit_transform(trained_features_to_scale), columns = trained_features_to_scale.columns)

# ... and apply it to the rows to score; the target scaler is fitted on y for the later inverse transform.
EST_OFF_RTG_predict_scaled = pd.DataFrame(EST_OFF_RTG_scaler.transform(EST_OFF_RTG_predict), columns = EST_OFF_RTG_predict.columns)
EST_OFF_RTG_playoffs_scaled = pd.Series(target_scaler.fit_transform(y.values.reshape(-1, 1)).flatten())

In [None]:
# Rebuild the 'tim_SOS_Use' interaction columns (scaled feature times scaled SOS_Use),
# appended in the order listed.
EST_OFF_RTG_predict_scaled = EST_OFF_RTG_predict_scaled.assign(**{
    '2P%_48.1tim_SOS_Use': EST_OFF_RTG_predict_scaled['2P%_48.1'] * EST_OFF_RTG_predict_scaled['SOS_Use'],
    'oFG%_<5_FT.tim_SOS_Use': EST_OFF_RTG_predict_scaled['oFG%_<5_FT.'] * EST_OFF_RTG_predict_scaled['SOS_Use'],
    '3P%_Best Player Clutch_100tim_SOS_Use': EST_OFF_RTG_predict_scaled['3P%_Best Player Clutch_100'] * EST_OFF_RTG_predict_scaled['SOS_Use'],
    '3PM_Best Player Clutch_100tim_SOS_Use': EST_OFF_RTG_predict_scaled['3PM_Best Player Clutch_100'] * EST_OFF_RTG_predict_scaled['SOS_Use'],
    'FG%_25-29 FT. Best Playertim_SOS_Use': EST_OFF_RTG_predict_scaled['FG%_25-29 FT. Best Player'] * EST_OFF_RTG_predict_scaled['SOS_Use'],
})

# Remove the base columns that were only needed to form the products.
EST_OFF_RTG_predict_scaled = EST_OFF_RTG_predict_scaled.drop(
    columns=[
        'SOS_Use',
        '2P%_48.1',
        '3PM_Best Player Clutch_100',
        '3P%_Best Player Clutch_100',
        'oFG%_<5_FT.',
        'FG%_25-29 FT. Best Player',
    ]
)

In [None]:
# Load the saved ElasticNet and refit the 3-component PCA on the training features.
EST_OFF_RTG_model = joblib.load('EST OFF RTG_Elastic.pkl')

pca = PCA(n_components=3, random_state=28)
X_train_PCA = pca.fit_transform(X_train_reduced)
# Put the rows to score into the training column order before projecting them.
EST_OFF_RTG_predict_scaled = EST_OFF_RTG_predict_scaled[X_train_reduced.columns]
EST_OFF_RTG_predict_scaled_PCA = pca.transform(EST_OFF_RTG_predict_scaled)

# Predict in scaled units, then map back to the original target units.
EST_OFF_RTG_predictions = pd.Series(
    target_scaler.inverse_transform(
        EST_OFF_RTG_model.predict(EST_OFF_RTG_predict_scaled_PCA).reshape(-1, 1)
    ).flatten()
)
EST_OFF_RTG_predictions.to_csv('EST_OFF_RTG_preds.csv')
EST_OFF_RTG_predictions

Unnamed: 0,0
0,113.563745
1,114.045235
2,114.276236
3,113.822727
4,112.548964
5,112.886059
6,112.206082
7,113.71005
8,112.63929
9,112.703126


# **EST DEF RTG**

**Elastic**
*  AVG training Normalized RMSE: 0.16
*  AVG training Normalized MAE: 0.13
*  AVG of avg inv transformed train error from folds: 1.7
*  Test Normalized RMSE: 0.22
*  Test Normalized MAE: 0.19
*  AVG inv transformed test error: 2.5
*  Range of predictions (inv transformed): 11.7

**Elastic Final**
*  Best trial:
  Params: {'alpha': 0.10288713149502167, 'l1_ratio': 3.781147381940325e-05, 'max_iter': 5500, 'tol': 0.0028385766496962753}
*  Normalized RMSE: 0.162
*  Normalized MAE: 0.134
*  avg inv transformed accuracy: 1.8
*  Range of predictions (inv transformed): 11.7

In [None]:
# Rebuild the training features for the target column selected by iloc[:, -3:-2] on every row
# (get_inputs ignores the index lists when if_final == 'yes').
X = inputs
y = playoff_stats.iloc[:, -3:-2]
len_df = len(X)
train_index = list(range(len_df))
test_index = [0]

X_train, X_test, scaleroPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
    X, y, train_index, test_index[0], "MMS", 0.53, 'yes')

# Keep the 30 features with the largest absolute correlation score.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
X_train_with_corrs = (
    pd.concat([X_train[common_columns], correlations_df[common_columns]])
    .transpose()
    .sort_values(by='corrs', ascending = False, key=abs)
    .head(30)
)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1).transpose()  # drop the score column, back to samples x features
X_train_reduced.reset_index(drop = True, inplace = True)

In [None]:
# Load the rows to score for the DEF RTG model.
EST_DEF_RTG_predict = pd.read_csv('EST DEF RTG Inputs_ATL_MEM.csv')
EST_DEF_RTG_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()
# Fit the feature scaler on the same columns of the training inputs X ...
trained_features_to_scale = X[EST_DEF_RTG_predict.columns]
trained_features_scaled = pd.DataFrame(EST_DEF_RTG_scaler.fit_transform(trained_features_to_scale), columns = trained_features_to_scale.columns)

# ... and apply it to the rows to score; the target scaler is fitted on y for the later inverse transform.
EST_DEF_RTG_predict_scaled = pd.DataFrame(EST_DEF_RTG_scaler.transform(EST_DEF_RTG_predict), columns = EST_DEF_RTG_predict.columns)
EST_DEF_RTG_playoffs_scaled = pd.Series(target_scaler.fit_transform(y.values.reshape(-1, 1)).flatten())

In [None]:
# Rebuild the 'tim_SOS_Use' interaction column (scaled TS%_48 times scaled SOS_Use).
EST_DEF_RTG_predict_scaled['TS%_48tim_SOS_Use'] = EST_DEF_RTG_predict_scaled['TS%_48'] * EST_DEF_RTG_predict_scaled['SOS_Use']

# SOS_Use was only needed to form the product.
EST_DEF_RTG_predict_scaled = EST_DEF_RTG_predict_scaled.drop('SOS_Use', axis = 1)

In [None]:
# Load the saved ElasticNet and refit the 3-component PCA on the training features.
EST_DEF_RTG_model = joblib.load('EST DEF RTG_Elastic.pkl')

pca = PCA(n_components=3, random_state=28)
X_train_PCA = pca.fit_transform(X_train_reduced)
# Put the rows to score into the training column order before projecting them.
EST_DEF_RTG_predict_scaled = EST_DEF_RTG_predict_scaled[X_train_reduced.columns]
EST_DEF_RTG_predict_scaled_PCA = pca.transform(EST_DEF_RTG_predict_scaled)

# Predict in scaled units, then map back to the original target units.
EST_DEF_RTG_predictions = pd.Series(
    target_scaler.inverse_transform(
        EST_DEF_RTG_model.predict(EST_DEF_RTG_predict_scaled_PCA).reshape(-1, 1)
    ).flatten()
)
EST_DEF_RTG_predictions.to_csv('EST_DEF_RTG_preds.csv')
EST_DEF_RTG_predictions

Unnamed: 0,0
0,109.086679
1,107.353615
2,107.35946
3,107.908131
4,107.335083
5,106.938859
6,106.581124
7,108.104936
8,106.223283
9,107.641585
