<a href="https://colab.research.google.com/github/ConorD28/NFL/blob/main/NFL_Playoffs_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import math
%matplotlib inline
# Load the model inputs and the playoff outcome table from CSV files
# located in the notebook's working directory.
inputs = pd.read_csv('NFL oPPG inputs.csv')
playoff_stats = pd.read_csv('playoff_stats_NFL.csv')

# Print the total number of missing cells in each table (0 means no NA values).
for frame in (inputs, playoff_stats):
  print(frame.isnull().sum().sum())

0
0




# **Correlation/Scores**

In [None]:
from typing_extensions import final
import re
def remove_similar_cols(df, top_num=8):
  """Thin out near-duplicate features from a correlation table.

  Args:
    df: DataFrame indexed by feature name with a numeric 'corrs' column.
      Feature names may combine several phrases joined by '*' or '&'.
      The frame is NOT modified.
    top_num: Maximum rows kept per name prefix (text before the first '_')
      and maximum number of times a phrase may be used in each position.
      Defaults to 8, the value previously hard-coded.

  Returns:
    A new DataFrame containing the selected rows, sorted by 'corrs'
    in descending (signed) order.
  """
  if df.empty:
    return df.copy()

  # Step 1: within each name prefix keep only the top_num rows by |corrs|.
  # The prefix is held in a separate array rather than written into `df`, so the
  # caller's frame is left untouched and the code does not depend on
  # groupby.apply passing the grouping column through (deprecated in pandas 2.2).
  group_labels = df.index.str.split('_').str[0].to_numpy()
  abs_corrs = df['corrs'].abs()
  kept_labels = []
  for _, group_corrs in abs_corrs.groupby(group_labels):
    kept_labels.extend(group_corrs.nlargest(top_num).index)
  df = df.loc[kept_labels]

  # Step 2: greedy pass that limits how often each phrase is reused.
  splitter = re.compile(r'\*|&')
  phrase_counts = {
      phrase: {'before': 0, 'after': 0, 'isolated': 0}
      for label in df.index
      for phrase in splitter.split(label)
  }

  selected = []
  # NOTE(review): rows are visited in signed descending order, so strongly
  # negative correlations are considered last even though step 1 ranks by
  # absolute value. Kept as in the original; confirm this is intended.
  for label in df.sort_values(by='corrs', ascending=False).index:
    parts = splitter.split(label)
    if len(parts) == 1:
      positions = {'before': set(), 'after': set(), 'isolated': {parts[0]}}
    else:
      # First phrase counts as "before", last as "after"; every phrase of a
      # combined feature also counts as "isolated".
      positions = {'before': {parts[0]}, 'after': {parts[-1]}, 'isolated': set(parts)}

    # Skip the row if any of its phrases already hit the limit in any position.
    if any(count >= top_num
           for phrase in set(parts)
           for count in phrase_counts[phrase].values()):
      continue

    selected.append(label)
    for position, members in positions.items():
      for phrase in members:
        phrase_counts[phrase][position] += 1

  # Build the result with one selection instead of concatenating row by row.
  return df.loc[selected].sort_values(by='corrs', ascending=False)

In [None]:
import scipy.stats
def correlation(dataset, threshold, target):
  """Select input columns whose Pearson correlation with the target is strong enough.

  Args:
    dataset: DataFrame of candidate input columns.
    threshold: A column is kept when abs(correlation) > threshold.
    target: Series (or numpy array, converted to a Series) to correlate against.

  Returns:
    A 2-tuple (features, corrs):
      features: DataFrame with the kept columns, after remove_similar_cols
        filtering, ordered by descending absolute correlation.
      corrs: single-row DataFrame (row label 'corrs') with the correlation of
        each kept column.
    When no column meets the threshold, two empty DataFrames are returned so
    callers can always unpack two values.
  """
  if isinstance(target, np.ndarray):
    target = pd.Series(target)

  kept_columns = []
  kept_corrs = []
  for col in dataset.columns:
    # Pearson by default; scipy.stats.spearmanr / kendalltau are possible alternatives.
    col_corr = dataset.loc[:, col].corr(target)
    if abs(col_corr) > threshold:  # NaN correlations compare False and are dropped
      kept_columns.append(dataset.loc[:, col])
      kept_corrs.append(col_corr)

  if not kept_columns:
    return pd.DataFrame(), pd.DataFrame()

  # One row per kept feature, with its correlation in a trailing 'corrs' column.
  df = pd.DataFrame(kept_columns)
  df.insert(len(df.columns), 'corrs', kept_corrs)
  df = df.sort_values(by='corrs', ascending=False, key=abs)
  df = remove_similar_cols(df)
  df = df.sort_values(by='corrs', ascending=False, key=abs)

  # Back to one column per feature; the last row now holds the correlations.
  df = df.transpose()
  df_corrs = df.iloc[-1:, :]
  df = df.drop(df.tail(1).index)
  return df, df_corrs

In [None]:
from sklearn.model_selection import train_test_split
from numpy.random.mtrand import random_sample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
def Scores(y, y_pred):
  """Compute error metrics for a set of predictions and print the normalized ones.

  Args:
    y: True values; must provide .max() and .min().
    y_pred: Predicted values, same length as y.

  Returns:
    (Avg_Normalized_Score, avg_error, MAPE) where
      Avg_Normalized_Score is the mean of range-normalized RMSE and MAE,
        or -1 when the range of y is 0;
      avg_error is the mean absolute difference between y_pred and y;
      MAPE is abs(y_pred - y) / y when the range of y is 0, otherwise -1.
  """
  mse = mean_squared_error(y, y_pred)
  mae = mean_absolute_error(y, y_pred)
  spread = y.max() - y.min()

  # -1 is the "not computed" sentinel for both values.
  MAPE, Avg_Normalized_Score = -1, -1

  print("Normalized by range:")
  if spread != 0:
    Normalized_RMSE = np.sqrt(mse) / abs(spread)
    Normalized_MAE = mae / abs(spread)
    Avg_Normalized_Score = (Normalized_RMSE + Normalized_MAE) / 2
    if isinstance(Avg_Normalized_Score, np.ndarray):
      Avg_Normalized_Score = Avg_Normalized_Score.item()
    print(f'Avg. Normalized Score:{Avg_Normalized_Score:.1f}')
    print(f'Normalized RMSE:{Normalized_RMSE:.1f}')
    print(f'Normalized MAE:{Normalized_MAE:.2f}')
  else:
    # Zero range (e.g. a single sample): normalization is impossible, so the
    # relative error is reported instead.
    print("Range was 0 so not Normalized by range:")
    Normalized_RMSE = -1
    Normalized_MAE = -1
    MAPE = abs((y_pred - y)) / y

  avg_error = np.mean(np.abs(y_pred - y))
  return Avg_Normalized_Score, avg_error, MAPE

In [None]:
def Predict_Scores(model, X_tr, X_te, y_tr, y_te, t_sc):
  """Score a fitted model on training data and, when present, on test data.

  Args:
    model: Fitted estimator exposing .predict().
    X_tr, X_te: Training / test inputs.
    y_tr: Training target; accessed through .values, so a pandas object.
    y_te: Test target; reshaped directly, so a numpy array. May be empty, in
      which case all test outputs stay 0.
    t_sc: Fitted target scaler providing inverse_transform().

  Returns:
    (Avg_N_Score_train, Avg_N_Score_test, avg_error_train_transformed,
     avg_error_test_transformed, MAPE_test). The train error is the mean absolute
    error after inverse transform; the test error is the Series of absolute
    errors after inverse transform (not averaged).
  """
  has_test = len(y_te) != 0

  y_train_pred = model.predict(X_tr)
  if has_test:
    y_test_pred = model.predict(X_te)
  else:
    y_test_pred = pd.DataFrame()
    Avg_N_Score_test = 0

  # Training scores (Scores prints the normalized metrics).
  Avg_N_Score_train, avg_error_train, MAPE = Scores(y_tr, y_train_pred)
  if isinstance(Avg_N_Score_train, np.ndarray):
    Avg_N_Score_train = Avg_N_Score_train.item()
  avg_error_train = avg_error_train.reshape(-1, 1)  # kept from the original; not used afterwards

  # Defaults reported when there is no test data.
  avg_error_test = 0
  avg_error_test_transformed = 0
  MAPE_test = 0

  if has_test:
    print("Test predictions:")
    Avg_N_Score_test, avg_error_test, MAPE_test = Scores(y_te, y_test_pred)
    if isinstance(MAPE_test, np.ndarray):
      MAPE_test = MAPE_test.item()
    print(f'MAPE:{MAPE_test:.2f}')
    print()
    print('after inverse transform, testing off by:')
    test_pred_original = pd.Series(t_sc.inverse_transform(y_test_pred.reshape(-1, 1)).flatten())
    test_true_original = pd.Series(t_sc.inverse_transform(y_te.reshape(-1, 1)).flatten())
    avg_error_test_transformed = np.abs(test_true_original - test_pred_original)

  print('after inverse transform, training off by:')
  y_train_pred_transformed = pd.Series(t_sc.inverse_transform(y_train_pred.reshape(-1, 1)).flatten())
  train_true_original = pd.Series(t_sc.inverse_transform(y_tr.values.reshape(-1, 1)).flatten())
  avg_error_train_transformed = np.mean(np.abs(train_true_original - y_train_pred_transformed))
  print(y_train_pred_transformed)

  return (Avg_N_Score_train, Avg_N_Score_test, avg_error_train_transformed,
          avg_error_test_transformed, MAPE_test)

# **ML Tuning Algorithms**

In [None]:
!pip install optuna
import optuna
!pip install joblib
import joblib
from sklearn.model_selection import cross_val_score
def Ridge_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a Ridge regressor with Optuna and return it fitted on the training data.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    Ridge model built from the best trial's parameters and fitted on
    (X_train, y_train).
  """
  solver_options = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]

  def cv_mse(trial):
    # alpha is the regularization strength (a wider 1e-4..10.0 range was noted as an alternative).
    candidate = Ridge(
        alpha=trial.suggest_float("alpha", 2, 10, log=True),
        solver=trial.suggest_categorical("solver", solver_options),
        random_state=28,
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice,
                                  scoring="neg_mean_squared_error")
    return -fold_scores.mean()  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction="minimize")
  study.optimize(cv_mse, n_trials=num_trials, timeout=timeout_choice)

  print("Best trial:")
  winner = study.best_trial
  print(f"  Params: {winner.params}")

  best_model = Ridge(random_state=28, **winner.params)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
def Lasso_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a Lasso regressor with Optuna and return it fitted on the training data.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    Lasso model built from the best trial's parameters and fitted on
    (X_train, y_train).
  """
  def cv_mse(trial):
    candidate = Lasso(
        alpha=trial.suggest_float("alpha", 2, 10, log=True),             # regularization strength
        max_iter=trial.suggest_int("max_iter", 1000, 10000, step=100),   # iteration cap
        tol=trial.suggest_float("tol", 1e-5, 1e-2, log=True),            # stopping tolerance
        random_state=28,
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice,
                                  scoring="neg_mean_squared_error")
    return -fold_scores.mean()  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction="minimize")
  study.optimize(cv_mse, n_trials=num_trials, timeout=timeout_choice)

  print("Best trial:")
  winner = study.best_trial
  print(f"  Params: {winner.params}")

  best_model = Lasso(random_state=28, **winner.params)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
def Elastic_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune an ElasticNet regressor with Optuna and return it fitted on the training data.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    ElasticNet model built from the best trial's parameters and fitted on
    (X_train, y_train).
  """
  def cv_mse(trial):
    candidate = ElasticNet(
        alpha=trial.suggest_float("alpha", 2, 10, log=True),             # regularization strength
        l1_ratio=trial.suggest_float("l1_ratio", 0.0, 1.0),              # Lasso/Ridge mixing ratio
        max_iter=trial.suggest_int("max_iter", 1000, 10000, step=100),   # iteration cap
        tol=trial.suggest_float("tol", 1e-5, 1e-2, log=True),            # stopping tolerance
        random_state=28,
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice,
                                  scoring="neg_mean_squared_error")
    return -fold_scores.mean()  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction="minimize")
  study.optimize(cv_mse, n_trials=num_trials, timeout=timeout_choice)

  print("Best trial:")
  winner = study.best_trial
  print(f"  Params: {winner.params}")

  best_model = ElasticNet(random_state=28, **winner.params)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
def GBR_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a GradientBoostingRegressor with Optuna, fit it, and save it to disk.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    The model built from the best parameters and fitted on (X_train, y_train).

  Side effects:
    Writes the fitted model to 'PPG_GBR.pkl' in the working directory.
  """
  def cv_mse(trial):
    candidate = GradientBoostingRegressor(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 20),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        random_state=28,
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice,
                                  scoring="neg_mean_squared_error")
    return -fold_scores.mean()  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction="minimize")
  study.optimize(cv_mse, n_trials=num_trials, timeout=timeout_choice)
  print("Best parameters:", study.best_params)

  best_model = GradientBoostingRegressor(random_state=28, **study.best_params)
  best_model.fit(X_train, y_train)
  # NOTE(review): this is the only tuner that persists its model; the others
  # have the equivalent dump commented out. Confirm that is intended.
  joblib.dump(best_model, 'PPG_GBR.pkl')
  return best_model

In [None]:
def SVR_rbf_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune an RBF-kernel SVR with Optuna and return it fitted on the training data.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    SVR(kernel='rbf') with the best C, gamma and epsilon, fitted on
    (X_train, y_train).
  """
  def cv_mse(trial):
    candidate = SVR(
        kernel='rbf',
        C=trial.suggest_float('C', 1e-3, 1e3, log=True),
        gamma=trial.suggest_float('gamma', 1e-4, 1e1, log=True),
        epsilon=trial.suggest_float('epsilon', 1e-4, 1e1, log=True),
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice,
                                  scoring='neg_mean_squared_error')
    return -np.mean(fold_scores)  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction='minimize')
  study.optimize(cv_mse, n_trials=num_trials, timeout=timeout_choice)
  print("Best parameters:", study.best_params)

  # best_params holds exactly the three values suggested above.
  best_model = SVR(kernel='rbf', **study.best_params)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
def SVR_poly_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a polynomial-kernel SVR with Optuna and return it fitted on the training data.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    SVR(kernel='poly') with the best C, gamma, epsilon, degree and coef0,
    fitted on (X_train, y_train).
  """
  def cv_mse(trial):
    candidate = SVR(
        kernel='poly',
        C=trial.suggest_float('C', 1e-3, 1e3, log=True),
        gamma=trial.suggest_float('gamma', 1e-4, 1e1, log=True),
        epsilon=trial.suggest_float('epsilon', 1e-4, 1e1, log=True),
        degree=trial.suggest_int('degree', 2, 5),          # degrees 2 through 5
        coef0=trial.suggest_float('coef0', 0.0, 10.0),     # independent term of the kernel
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=cv_choice,
                                  scoring='neg_mean_squared_error')
    return -np.mean(fold_scores)  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction='minimize')
  study.optimize(cv_mse, n_trials=num_trials, timeout=timeout_choice)
  print("Best parameters:", study.best_params)

  # best_params holds exactly the five values suggested above.
  best_model = SVR(kernel='poly', **study.best_params)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
def SVR_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a LinearSVR with Optuna and return it fitted on the training data.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    LinearSVR with the best C and epsilon, fitted on (X_train, y_train).
  """
  # One iteration cap shared by the search and the final refit. Previously the
  # trials used 100000 but the returned model used 10000, so the returned model
  # was not the configuration that had been cross-validated.
  max_iter = 100000

  def linear_svr_objective(trial):
    C = trial.suggest_float('C', 1e-4, .8, log=True)  # a wider 1e-4..1e2 range was noted as an alternative
    epsilon = trial.suggest_float('epsilon', 1e-4, 1.0, log=True)
    model = LinearSVR(C=C, epsilon=epsilon, random_state=28, max_iter=max_iter)

    scores = cross_val_score(model, X_train, y_train, cv=cv_choice,
                             scoring='neg_mean_squared_error')
    return -np.mean(scores)  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction='minimize')
  study.optimize(linear_svr_objective, n_trials=num_trials, timeout=timeout_choice)
  print("Best parameters:", study.best_params)

  best_model = LinearSVR(C=study.best_params['C'], epsilon=study.best_params['epsilon'],
                         random_state=28, max_iter=max_iter)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
def RF_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a RandomForestRegressor with Optuna and return it fitted on the training data.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    RandomForestRegressor built from the best parameters and fitted on
    (X_train, y_train).
  """
  def make_forest(params):
    # Fixed seed and all-core training are shared by trials and the final model.
    return RandomForestRegressor(random_state=28, n_jobs=-1, **params)

  def cv_mse(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 100),  # upper bound 300 was noted as an alternative
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }
    fold_scores = cross_val_score(make_forest(params), X_train, y_train, cv=cv_choice,
                                  scoring='neg_mean_squared_error', n_jobs=-1)
    return -np.mean(fold_scores)  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction='minimize')
  study.optimize(cv_mse, n_trials=num_trials, timeout=timeout_choice)
  print("Best parameters:", study.best_params)

  # best_params holds exactly the five values suggested above.
  best_model = make_forest(study.best_params)
  best_model.fit(X_train, y_train)
  return best_model

In [None]:
def BR_tune(X_train, y_train, cv_choice, num_trials, timeout_choice):
  """Tune a BaggingRegressor over decision trees with Optuna and return it fitted.

  Args:
    X_train, y_train: Training inputs and target.
    cv_choice: Passed to cross_val_score as `cv`.
    num_trials: Passed to study.optimize as `n_trials`.
    timeout_choice: Passed to study.optimize as `timeout`.

  Returns:
    BaggingRegressor built from the best parameters and fitted on
    (X_train, y_train).
  """
  def make_bagger(params):
    # max_depth configures the base tree; the remaining values configure the ensemble.
    tree = DecisionTreeRegressor(max_depth=params['max_depth'], random_state=28)
    return BaggingRegressor(estimator=tree, n_estimators=params['n_estimators'],
                            max_samples=params['max_samples'],
                            max_features=params['max_features'],
                            random_state=28, n_jobs=-1)

  def cv_mse(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 100),
        'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
        'max_features': trial.suggest_float('max_features', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
    }
    fold_scores = cross_val_score(make_bagger(params), X_train, y_train, cv=cv_choice,
                                  scoring='neg_mean_squared_error', n_jobs=-1)
    return -np.mean(fold_scores)  # sign flipped so the study minimizes MSE

  study = optuna.create_study(direction='minimize')
  study.optimize(cv_mse, n_trials=num_trials, timeout=timeout_choice)
  print("Best parameters:", study.best_params)

  best_model = make_bagger(study.best_params)
  best_model.fit(X_train, y_train)
  return best_model

# **ML Algorithms**

In [None]:
def RLE_Model(xTrain, xTest, yTrain, yTest, choice, predict_df, tar_sca):
  """Tune and score a Ridge, Lasso or ElasticNet model.

  Args:
    xTrain, xTest: Training / test inputs.
    yTrain, yTest: Training / test targets.
    choice: "Ridge", "Lasso" or "Elastic".
    predict_df: Unused; kept so existing calls keep working.
    tar_sca: Fitted target scaler forwarded to Predict_Scores.

  Returns:
    The tuple returned by Predict_Scores for the tuned model.

  Raises:
    ValueError: If `choice` is not one of the supported names.
  """
  # Validate first so an unknown choice fails with a clear message instead of
  # an unbound-variable error further down.
  if choice == "Ridge":
    tuner = Ridge_tune
  elif choice == "Lasso":
    tuner = Lasso_tune
  elif choice == "Elastic":
    tuner = Elastic_tune
  else:
    raise ValueError(f"Unknown model choice {choice!r}; expected 'Ridge', 'Lasso' or 'Elastic'")

  # 10-fold CV, up to 200 trials, 15 as the study timeout.
  pipeline = tuner(xTrain, yTrain, 10, 200, 15)
  modelResults = Predict_Scores(pipeline, xTrain, xTest, yTrain, yTest, tar_sca)
  return modelResults

In [None]:
def GBR_model(xTrain, xTest, yTrain, yTest, predict_df, tar_sca):
  """Tune a gradient-boosting regressor and score it.

  `predict_df` is accepted but not used. Returns the tuple produced by
  Predict_Scores.
  """
  tuned = GBR_tune(X_train=xTrain, y_train=yTrain, cv_choice=3, num_trials=200, timeout_choice=15)
  return Predict_Scores(tuned, xTrain, xTest, yTrain, yTest, tar_sca)

In [None]:
from sklearn.svm import SVR
def SVM_models(xTrain, xTest, yTrain, yTest, choice, predict_df, tar_sca):
  """Tune and score a kernel SVR model.

  Args:
    xTrain, xTest: Training / test inputs.
    yTrain, yTest: Training / test targets.
    choice: "rbf" or "poly".
    predict_df: Unused; kept so existing calls keep working.
    tar_sca: Fitted target scaler forwarded to Predict_Scores.

  Returns:
    The tuple returned by Predict_Scores for the tuned model.

  Raises:
    ValueError: If `choice` is not "rbf" or "poly".
  """
  if choice == "rbf":
    tuner = SVR_rbf_tune
  elif choice == "poly":
    tuner = SVR_poly_tune
  else:
    raise ValueError(f"Unknown kernel choice {choice!r}; expected 'rbf' or 'poly'")

  model = tuner(xTrain, yTrain, 10, 200, 15)
  # Score once; the previous version scored (and printed) every model twice.
  modelResults = Predict_Scores(model, xTrain, xTest, yTrain, yTest, tar_sca)
  return modelResults

In [None]:
from sklearn.svm import LinearSVR
def SVR_model(xTrain, xTest, yTrain, yTest, predict_df, tar_sca):
  """Tune a linear SVR and score it.

  `predict_df` is accepted but not used. Returns the tuple produced by
  Predict_Scores.
  """
  tuned = SVR_tune(X_train=xTrain, y_train=yTrain, cv_choice=10, num_trials=200, timeout_choice=15)
  return Predict_Scores(tuned, xTrain, xTest, yTrain, yTest, tar_sca)

In [None]:
from sklearn.ensemble import RandomForestRegressor
def RF_model(xTrain, xTest, yTrain, yTest, predict_df, tar_sca):
  """Tune a random-forest regressor and score it.

  `predict_df` is accepted but not used. Returns the tuple produced by
  Predict_Scores.
  """
  tuned = RF_tune(X_train=xTrain, y_train=yTrain, cv_choice=3, num_trials=200, timeout_choice=15)
  return Predict_Scores(tuned, xTrain, xTest, yTrain, yTest, tar_sca)

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
def BR_model(xTrain, xTest, yTrain, yTest, predict_df, tar_sca):
  """Tune a bagging regressor and score it.

  `predict_df` is accepted but not used. Returns the tuple produced by
  Predict_Scores.
  """
  tuned = BR_tune(X_train=xTrain, y_train=yTrain, cv_choice=3, num_trials=200, timeout_choice=15)
  return Predict_Scores(tuned, xTrain, xTest, yTrain, yTest, tar_sca)

# **Inputs/LOOCV Function**

In [None]:
def get_inputs(data_frame, y, tr_index, te_index, scaler_choice, thresh, if_final):
  """Scale inputs and target, then keep only inputs correlated with the target.

  Args:
    data_frame: All candidate input columns.
    y: Target Series aligned with data_frame by position.
    tr_index: Positional indices of the training rows (ignored when if_final == 'yes').
    te_index: Positional index of the single test row (ignored when if_final == 'yes').
    scaler_choice: "MinMax" selects MinMaxScaler; anything else StandardScaler.
    thresh: Correlation threshold forwarded to correlation().
    if_final: 'yes' trains on every row with no test split; 'no' applies the split.

  Returns:
    (train inputs, test inputs, input scaler, target scaler, scaled y_train,
     y_test, correlations_df). With if_final == 'yes' the test inputs and
    y_test are empty DataFrames.
  """
  scaler_cls = MinMaxScaler if scaler_choice == "MinMax" else StandardScaler
  scaler, scaler2 = scaler_cls(), scaler_cls()  # inputs / target

  if if_final == 'yes':
    data_scaled_train, data_scaled_test = data_frame, pd.DataFrame()
    y_train, y_test = y, pd.DataFrame()
  else:
    data_scaled_train, data_scaled_test = data_frame.iloc[tr_index], data_frame.iloc[[te_index]]
    y_train, y_test = y.iloc[tr_index], y.iloc[te_index]

  # Both scalers are fitted on the training rows only.
  train_scaled = pd.DataFrame(scaler.fit_transform(data_scaled_train), columns=data_frame.columns)
  y_train = pd.Series(scaler2.fit_transform(y_train.values.reshape(-1, 1)).flatten())

  train_scaled_correlated, correlations_df = correlation(train_scaled, thresh, y_train)
  train_scaled_correlated = pd.DataFrame(train_scaled_correlated)

  if if_final == 'no':
    # Apply the fitted scalers to the held-out row and keep the same columns.
    y_test = scaler2.transform(pd.Series(y_test).values.reshape(-1, 1)).flatten()
    test_scaled = pd.DataFrame(scaler.transform(data_scaled_test), columns=data_frame.columns)
    test_scaled_correlated = test_scaled.loc[:, train_scaled_correlated.columns]
  else:
    test_scaled_correlated = data_scaled_test

  print(len(train_scaled_correlated.columns))

  return (train_scaled_correlated, test_scaled_correlated, scaler, scaler2,
          y_train, y_test, correlations_df)

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import PCA
def reduce_df(x_tr, x_te, reduction_choice, if_final):
  """Reduce the inputs to 3 components with PCA or LocallyLinearEmbedding.

  Args:
    x_tr: Training inputs; the reducer is fitted on these.
    x_te: Test inputs.
    reduction_choice: "PCA" selects PCA; anything else selects LLE.
    if_final: When "no", x_te is transformed with the fitted reducer;
      otherwise x_te is returned unchanged.

  Returns:
    (reduced training inputs, reduced or untouched test inputs, fitted reducer).
  """
  use_pca = reduction_choice == "PCA"
  if use_pca:
    reducer = PCA(n_components=3, random_state=28)
  else:
    reducer = LocallyLinearEmbedding(n_components=3, n_neighbors=5, random_state=28)

  x_tr_reduced = reducer.fit_transform(x_tr)
  x_te_reduced = reducer.transform(x_te) if if_final == "no" else x_te

  if use_pca:
    print("Mean:", reducer.mean_)
  return x_tr_reduced, x_te_reduced, reducer

In [None]:
from joblib import Parallel, delayed
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Define the function that processes each fold of LOO-CV and can make final model
def process_fold(train_index, test_index, X, y):
    """Run one leave-one-out fold: select inputs, reduce with PCA, tune and score.

    Args:
        train_index: Positional indices of the training rows.
        test_index: Sequence whose first element is the held-out row position.
        X: All candidate inputs.
        y: Target Series.

    Returns:
        The tuple returned by RLE_Model (ElasticNet on PCA components), or None
        when no input column survives correlation filtering.
    """
    X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
        X, y, train_index, test_index[0], "", 0.1, 'no')

    shared_cols = list(set(X_train.columns).intersection(correlations_df.columns))
    if not shared_cols:
        return None  # nothing correlated with the target in this fold

    # Keep the 50 columns with the largest absolute correlation: stack the
    # 'corrs' row under the data, flip to one row per feature, rank and cut.
    ranked = (
        pd.concat([X_train[shared_cols], correlations_df[shared_cols]])
        .transpose()
        .sort_values(by='corrs', ascending=False, key=abs)
        .head(50)
    )
    X_train_reduced = ranked.drop('corrs', axis=1).transpose().reset_index(drop=True)
    X_test_reduced = pd.DataFrame(X_test, columns=X_train_reduced.columns)

    print(len(X_train_reduced.columns))
    X_train_reduced_PCA, X_test_reduced_PCA, pca_reducer = reduce_df(
        X_train_reduced, X_test_reduced, "PCA", 'no')

    # Other estimators can be swapped in with the same argument pattern:
    #   RLE_Model(..., "Ridge" | "Lasso", ...), RF_model, SVR_model, GBR_model,
    #   SVM_models(..., "rbf" | "poly", ...), BR_model.
    # For the LLE variants, use the outputs of
    #   reduce_df(X_train_reduced, X_test_reduced, "LLE", 'no') instead of the PCA ones.
    return RLE_Model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test,
                     "Elastic", X_test_reduced_PCA, scalerY)

# **Test 1 Fold**

In [None]:
# Single-fold smoke test: hold out the last row and train on all earlier rows.
X = inputs
y = playoff_stats['oPPG_Playoffs']
len_df = len(X)
train_index = list(range(len_df-1))
test_index = list(range((len_df-1), len_df))
# Pass train_index here; the previous call used `nums`, which this notebook never
# defines, so the cell raised NameError on a fresh kernel.
X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], "", 0.1, 'no')

In [None]:
# Display the names of the input columns returned by get_inputs for this fold.
X_train.columns

In [None]:
# Keep the 50 inputs with the largest absolute correlation: stack the 'corrs'
# row under the data, flip to one row per feature, rank and cut.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
X_train_with_corrs = (
    pd.concat([X_train[common_columns], correlations_df[common_columns]])
    .transpose()
    .sort_values(by='corrs', ascending=False, key=abs)
    .head(50)
)
X_train_reduced = X_train_with_corrs.drop('corrs', axis=1).transpose().reset_index(drop=True)
X_test_reduced = pd.DataFrame(X_test, columns=X_train_reduced.columns)

# Reduce with PCA. For LLE use instead:
#   X_train_reduced_LLE, X_test_reduced_LLE, lle_reducer = reduce_df(X_train_reduced, X_test_reduced, "LLE", 'no')
X_train_reduced_PCA, X_test_reduced_PCA, pca_reducer = reduce_df(X_train_reduced, X_test_reduced, "PCA", 'no')

# Fit and score the model. Other estimators use the same argument pattern:
#   RLE_Model(..., "Ridge" | "Lasso", ...), GBR_model, SVM_models(..., "rbf" | "poly", ...),
#   SVR_model, RF_model, BR_model (pass the LLE outputs for the LLE variants).
# Number pairs noted next to each option in earlier runs (meaning not stated in
# the notebook; presumably a train score and a test error -- TODO confirm):
#   PCA: Ridge .2, 1.2 | Lasso .2, 1.2 | Elastic .2, 1.2 | GBR 0, 1.5 | SVM rbf .1, 1.3
#        SVM poly .1, 1.2 | LinearSVR .2, 1.2 | RF .1, 1.2 | BR .1, 1.5
#   LLE: Elastic .2, 1.3
model = RLE_Model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, "Elastic", X_test_reduced_PCA, scalerY)

# **Run LOO-CV**

In [None]:
# Initialize LOO and Joblib Parallel
X = inputs
y = playoff_stats['oPPG_Playoffs'] #PPG_Playoffs, Points/Att_Playoffs
loo = LeaveOneOut()
# One process_fold call per held-out row, spread over all available cores.
results = Parallel(n_jobs=-1)(delayed(process_fold)(train_idx, test_idx, X, y)
                              for train_idx, test_idx in loo.split(X))
len_df = len(X)

# process_fold returns None when no input survives correlation filtering;
# indexing such a result used to crash the aggregation below.
fold_results = [fold for fold in results if fold is not None]
n_scored = len(fold_results)
if n_scored == 0:
  raise RuntimeError("No LOO-CV fold produced a result (no inputs met the correlation threshold).")
if n_scored < len_df:
  print(f'Skipped {len_df - n_scored} of {len_df} folds that had no correlated inputs')

# Tuple layout from Predict_Scores:
#   [0] normalized train score, [2] train error after inverse transform,
#   [3] test error after inverse transform (one-element Series), [4] test MAPE.
train_N_scores = sum(fold[0] for fold in fold_results)
train_avg_scores = sum(fold[2] for fold in fold_results)
test_avg_scores = sum(fold[3] for fold in fold_results)
test_MAPE_scores = sum(fold[4] for fold in fold_results)

# Averages are taken over the folds that actually produced a result.
print(f'AVG Normalized train acuraacy: {((train_N_scores/n_scored)):.3f}')
print(f'AVG inv transformed train accuracy: {((train_avg_scores.flatten()[0]/n_scored)):.3f}')
print(f'AVG inv transformed test accuracy: {((test_avg_scores[0]/n_scored)):.3f}')
print(f'AVG MAPE test accuracy: {((test_MAPE_scores/n_scored)):.4f}')

# **Final Model**

In [None]:
X = inputs#.iloc[:, 0:616]
y = playoff_stats['oPPG_Playoffs'] #Points/Att_Playoffs, PPG_Playoffs
len_df = len(X)
train_index = list(range(len_df))
test_index = list(range(1))

# Recompute the inputs on ALL rows ('yes' = final model, no test split). With this
# call commented out the cell silently reused X_train / correlations_df / Y_train /
# scalerY left over from the "Test 1 Fold" cell, i.e. data fitted without the last row.
X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], "", 0.1, 'yes')

# Keep the 50 inputs with the largest absolute correlation.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
X_train_with_corrs = X_train_with_corrs.transpose()
X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
X_train_with_corrs = X_train_with_corrs.head(50)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
X_train_reduced = X_train_reduced.transpose()
X_train_reduced.reset_index(drop = True, inplace = True)

# In final mode X_test is an empty DataFrame and reduce_df passes it through untouched.
X_test_reduced = X_test
X_train_reduced_PCA, X_test_reduced_PCA, pca_reducer = reduce_df(X_train_reduced, X_test_reduced, "PCA", 'yes')
#X_train_reduced_LLE, X_test_reduced_LLE, lle_reducer = reduce_df(X_train_reduced, X_test_reduced, "LLE", 'yes')

#model = RLE_Model(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, "Ridge", X_test_reduced_LLE, scalerY)
#model = RLE_Model(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, "Lasso", X_test_reduced_LLE, scalerY)
#model = RLE_Model(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, "Elastic", X_test_reduced_LLE, scalerY)
#model = GBR_model(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, X_test_reduced_LLE, scalerY)
#model = SVM_models(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, "rbf", X_test_reduced_LLE, scalerY)
#model = SVM_models(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, "poly", X_test_reduced_LLE, scalerY)
#model = SVR_model(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, X_test_reduced_LLE, scalerY)
#model = RF_model(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, X_test_reduced_LLE, scalerY)
#model = BR_model(X_train_reduced_LLE, X_test_reduced_LLE, Y_train, Y_test, X_test_reduced_LLE, scalerY)

#model = RLE_Model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, "Ridge", X_test_reduced_PCA, scalerY)
#model = RLE_Model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, "Lasso", X_test_reduced_PCA, scalerY)
model = RLE_Model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, "Elastic", X_test_reduced_PCA, scalerY)
#model = GBR_model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, X_test_reduced_PCA, scalerY)
#model = SVM_models(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, "rbf", X_test_reduced_PCA, scalerY)
#model = SVM_models(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, "poly", X_test_reduced_PCA, scalerY)
#model = SVR_model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, X_test_reduced_PCA, scalerY)
#model = RF_model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, X_test_reduced_PCA, scalerY)
#model = BR_model(X_train_reduced_PCA, X_test_reduced_PCA, Y_train, Y_test, X_test_reduced_PCA, scalerY)

print(f'AVG Normalized Score: {((model[0])):.3f}')
print(f'AVG Error: {((model[2])):.3f}')

# **oPPG**



**Ridge Folds:** PCA - 50, 2, 10
*   AVG Normalized train accuracy: 0.142
*   AVG inv transformed train accuracy: 2.049
*   AVG inv transformed test accuracy: 5.785
*   AVG MAPE test accuracy: 0.2373

**Ridge Final:**
*   AVG Normalized Score: 0.141
*   AVG Error: 2.055
*   Normalized RMSE: 0.2
*   Normalized MAE: 0.13

In [None]:
import joblib
import pandas as pd
import numpy as np
# Saved oPPG estimator. joblib.load unpickles the file, so only load a .pkl
# that comes from a trusted source.
oPPG_model = joblib.load('oPPG_Ridge.pkl')

# CSV tables, read in this order: the rows to predict on, the feature table,
# and the table holding the playoff target columns.
oPPG_inputs, inputs, playoff_stats = (
    pd.read_csv(csv_name)
    for csv_name in ('oPPG Inputs.csv', 'NFL oPPG inputs.csv', 'playoff_stats_NFL.csv')
)

In [None]:
# Rebuild the oPPG feature pipeline using every row of `inputs` as training data.
X = inputs
y = playoff_stats['oPPG_Playoffs'] #Points/Att_Playoffs, PPG_Playoffs
len_df = len(X)
train_index = [*range(len_df)]  # every row index
test_index = [0]                # index 0 is passed to get_inputs as the test index

X_train, X_test, scaler_oPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
    X, y, train_index, test_index[0], "", 0.1, 'yes'
)

# Stack the 'corrs' row under the training rows, flip so each feature is a row,
# then keep the 50 features with the largest absolute 'corrs' value.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
X_train_with_corrs = (
    pd.concat([X_train[common_columns], correlations_df[common_columns]])
    .transpose()
    .sort_values(by='corrs', ascending=False, key=abs)
    .head(50)
)

# Drop the helper 'corrs' column and flip back to one row per sample.
X_train_reduced = (
    X_train_with_corrs
    .drop('corrs', axis=1)
    .transpose()
    .reset_index(drop=True)
)

X_test_reduced = X_test
# LLE alternative, kept for reference:
#X_train_reduced_LLE, X_test_reduced_LLE, oPPG_lle_reducer = reduce_df(X_train_reduced, X_test_reduced, "LLE", 'yes')
X_train_reduced_PCA, X_test_reduced_PCA, oPPG_pca_reducer = reduce_df(
    X_train_reduced, X_test_reduced, "PCA", 'yes'
)

In [None]:
# Fit a fresh scaler on the selected feature columns of X, then push the new
# oPPG inputs through the same scaler -> PCA reducer -> model chain.
oPPG_scaler = StandardScaler()
trained_features_to_scale = X[X_train_reduced.columns]
trained_features_scaled = oPPG_scaler.fit_transform(trained_features_to_scale)

oPPG_inputs_scaled = oPPG_scaler.transform(oPPG_inputs)
oPPG_inputs_scaled_trans = oPPG_pca_reducer.transform(oPPG_inputs_scaled)
predictions = oPPG_model.predict(oPPG_inputs_scaled_trans)

# Map the predictions back through scalerY, exactly as the PPG prediction cell
# does; previously this cell stopped at the raw model output.
# NOTE(review): assumes oPPG_model was trained on the scalerY-transformed
# target, as the PPG cell implies for its model -- confirm.
predictions_inv_trans = scalerY.inverse_transform(predictions.reshape(-1, 1))
predictions_inv_trans = pd.Series(predictions_inv_trans.flatten())

# Raw model output, kept under its original name and type.
predictions = pd.Series(predictions)

# **PPG**

**Linear SVR Folds:**
* AVG Normalized train accuracy: 0.171
* AVG inv transformed train accuracy: 2.917
* AVG inv transformed test accuracy: 4.572

**Linear SVR Final:**
* Avg. Normalized Score: 0.2
* Normalized RMSE: 0.3
* Normalized MAE: 0.21
* Avg. Error: 0.8226



In [None]:
import joblib
import pandas as pd
import numpy as np
# Saved PPG estimator. joblib.load unpickles the file, so only load a .pkl
# that comes from a trusted source.
PPG_model = joblib.load('PPG_SVR.pkl')

# CSV tables, read in this order: the rows to predict on, the feature table,
# and the table holding the playoff target columns.
PPG_inputs, inputs, playoff_stats = (
    pd.read_csv(csv_name)
    for csv_name in ('PPG_Inputs.csv', 'NFL PPG inputs_Use.csv', 'playoff_stats_NFL.csv')
)

In [None]:
# Rebuild the PPG feature pipeline using every row of `inputs` as training data.
X = inputs
y = playoff_stats['PPG_Playoffs'] #Points/Att_Playoffs, PPG_Playoffs
len_df = len(X)
train_index = [*range(len_df)]  # every row index
test_index = [0]                # index 0 is passed to get_inputs as the test index

X_train, X_test, scaler_PPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
    X, y, train_index, test_index[0], "", 0.1, 'yes'
)

# Stack the 'corrs' row under the training rows, flip so each feature is a row,
# then keep the 50 features with the largest absolute 'corrs' value.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
X_train_with_corrs = (
    pd.concat([X_train[common_columns], correlations_df[common_columns]])
    .transpose()
    .sort_values(by='corrs', ascending=False, key=abs)
    .head(50)
)

# Drop the helper 'corrs' column and flip back to one row per sample.
X_train_reduced = (
    X_train_with_corrs
    .drop('corrs', axis=1)
    .transpose()
    .reset_index(drop=True)
)

X_test_reduced = X_test
# LLE alternative, kept for reference:
#X_train_reduced_LLE, X_test_reduced_LLE, PPG_lle_reducer = reduce_df(X_train_reduced, X_test_reduced, "LLE", 'yes')
X_train_reduced_PCA, X_test_reduced_PCA, PPG_pca_reducer = reduce_df(
    X_train_reduced, X_test_reduced, "PCA", 'yes'
)

In [None]:
# Score the loaded PPG model on the PCA-reduced training inputs.
y_pred = PPG_model.predict(X_train_reduced_PCA)
MSE = mean_squared_error(Y_train, y_pred)
MAE = mean_absolute_error(Y_train, y_pred)

# Normalize the error metrics by the spread (max - min) of the training target.
range_y = Y_train.max() - Y_train.min()
Normalized_RMSE = (np.sqrt(MSE)/abs(range_y))
Normalized_MAE = (MAE/abs(range_y))
Avg_Normalized_Score = (Normalized_RMSE + Normalized_MAE)/2
# Unwrap a single-element ndarray so the format specs below receive a scalar.
Avg_Normalized_Score = Avg_Normalized_Score.item() if isinstance(Avg_Normalized_Score, np.ndarray) else Avg_Normalized_Score
print(f'Avg. Normalized Score:{ Avg_Normalized_Score:.1f}')
print(f'Normalized RMSE:{ Normalized_RMSE:.1f}')
print(f'Normalized MAE:{ Normalized_MAE:.2f}')

# Mean absolute difference between predictions and targets.
# Both sides are flattened first: if Y_train is a column vector of shape (n, 1)
# and y_pred is 1-D of shape (n,), a plain `y_pred - Y_train` broadcasts to an
# (n, n) matrix and the mean would silently cover every pairwise difference.
avg_error = np.mean(np.abs(np.ravel(y_pred) - np.ravel(Y_train)))

print(f'MAE:{ MAE:.3f}')
print(f'RMSE:{ np.sqrt(MSE):.3f}')
print(f'Avg. Error:{avg_error:.4f}')

In [None]:
# Fit a fresh scaler on the selected feature columns of X, then push the new
# PPG inputs through the same scaler -> PCA reducer -> model chain.
PPG_scaler = StandardScaler()
trained_features_to_scale = X[X_train_reduced.columns]
trained_features_scaled = PPG_scaler.fit_transform(trained_features_to_scale)

PPG_inputs_scaled = PPG_scaler.transform(PPG_inputs)
PPG_inputs_scaled_trans = PPG_pca_reducer.transform(PPG_inputs_scaled)
predictions = PPG_model.predict(PPG_inputs_scaled_trans)

# Map the model output back through scalerY; inverse_transform is given a
# 2-D column, hence the reshape.
predictions_inv_trans = scalerY.inverse_transform(predictions.reshape(-1, 1))
predictions_inv_trans = pd.Series(predictions_inv_trans.flatten())

# Keep the rounded values. The earlier bare `round(predictions_inv_trans, 1)`
# discarded its result, so the CSV was written with unrounded predictions.
predictions_inv_trans = predictions_inv_trans.round(1)
predictions_inv_trans.to_csv('PPG_predictions.csv')