<a href="https://colab.research.google.com/github/ConorD28/March-Madness/blob/main/March_Madness_B4_Chip_2025_GitHub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import math
# Load the uploaded dataset, then split it: the last four columns become
# playoff_stats and every other column stays in inputs.
inputs = pd.read_csv('MM_b4_Ch_Upload.csv')
playoff_stats = inputs.iloc[:, -4:]
inputs = inputs.iloc[:, :-4]

# Total count of missing cells in each frame.
print(inputs.isnull().sum().sum()) #Check if there are NA values
print(playoff_stats.isnull().sum().sum())

0
0




# **Correlation/Scores**

In [None]:
import scipy.stats
def correlation(dataset, threshold, target):
  """Select the columns of ``dataset`` that correlate with ``target`` above a threshold.

  For every column both the Pearson and the Spearman correlation with
  ``target`` are computed. A column is kept when the absolute value of either
  one exceeds ``threshold``; the larger of the two absolute values is recorded
  as that column's score.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Candidate input columns.
  threshold : float
      Absolute correlation a column must exceed (strictly) to be kept.
  target : pandas.Series or numpy.ndarray
      Target values; an ndarray is wrapped in a Series first.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      ``(selected, scores)``. ``selected`` holds the kept columns ordered from
      the highest score to the lowest. ``scores`` is a single row labelled
      ``'corrs'`` holding each kept column's score. When no column passes the
      threshold two empty DataFrames are returned, so callers can always
      unpack two values.
  """
  if isinstance(target, np.ndarray):
    target = pd.Series(target)

  kept_columns = []
  kept_scores = []
  for col in dataset.columns:
      column = dataset.loc[:, col]
      corS = column.corr(target, method='spearman') # 'kendall'
      corP = column.corr(target)
      # NaN correlations compare as False here, so such columns are skipped.
      if (abs(corP) > threshold) or (abs(corS) > threshold):
        kept_columns.append(column) #make list of columns that meet the threshold
        kept_scores.append(max(abs(corP), abs(corS))) #and of their scores

  if len(kept_columns) == 0:
     # Same arity as the normal return: every caller unpacks two values.
     return pd.DataFrame(), pd.DataFrame()

  # One row per kept column; the scores ride along as an extra trailing
  # column so rows can be sorted by score before transposing back.
  df = pd.DataFrame(kept_columns)
  df.insert(len(df.columns), 'corrs', kept_scores)
  df = df.sort_values(by=df.columns[-1], ascending=False, key = abs)
  df = df.transpose()
  df_corrs = df.iloc[-1:, :]      # the 'corrs' row
  df = df.drop(df.tail(1).index)  # the data without the 'corrs' row
  return df, df_corrs

In [None]:
from sklearn.model_selection import train_test_split
from numpy.random.mtrand import random_sample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
def Scores(y, y_pred):
  """Return the range-normalized RMSE and MAE of ``y_pred`` against ``y``.

  Both errors are divided by the absolute spread of the true values
  (``y.max() - y.min()``).

  Returns
  -------
  tuple
      ``(normalized_rmse, normalized_mae)``.
  """
  squared_error = mean_squared_error(y, y_pred)
  absolute_error = mean_absolute_error(y, y_pred)

  # Spread of the observed target values, used as the normalizer.
  spread = abs(y.max() - y.min())

  normalized_rmse = np.sqrt(squared_error) / spread
  normalized_mae = absolute_error / spread
  return normalized_rmse, normalized_mae

In [None]:
def Predict_Scores(model, X_tr, X_te, y_tr, y_te, t_sc):
  """Predict with ``model`` and report training scores plus train/test errors.

  Parameters
  ----------
  model : object with a ``predict`` method
  X_tr, X_te : training and test inputs passed to ``model.predict``
  y_tr : pandas.Series of (scaled) training targets
  y_te : numpy.ndarray of (scaled) test targets; may be empty
  t_sc : fitted scaler whose ``inverse_transform`` maps targets back to
      their original scale

  Returns
  -------
  tuple of 8 items, in order:
      normalized training RMSE, normalized training MAE,
      scaled test error (``y_te - prediction``),
      absolute training error after the inverse transform (Series),
      absolute test error after the inverse transform (one row per test
      sample, shape ``(n_test, 1)``),
      ``y_te`` unchanged,
      inverse-transformed test predictions (flattened),
      inverse-transformed training predictions (Series).
      The test-related entries are ``0`` when ``y_te`` is empty.
  """
  y_train_pred = model.predict(X_tr)

  has_test_rows = len(y_te) != 0
  if has_test_rows:
    y_test_pred = model.predict(X_te)
  else:
    y_test_pred = pd.DataFrame()

  # Scores on the scaled training targets.
  NRMSE_tr, NMAE_tr = Scores(y_tr, y_train_pred)

  # Training error measured back on the original target scale.
  y_train_pred_transformed = pd.Series(t_sc.inverse_transform(y_train_pred.reshape(-1, 1)).flatten())
  y_tr_transformed = pd.Series(t_sc.inverse_transform(y_tr.values.reshape(-1, 1)).flatten())
  inv_error_tr_transformed = np.abs(y_tr_transformed - y_train_pred_transformed)

  error_test = 0
  inv_error_test_transformed = 0
  y_test_pred_transformed = 0
  #Test Predictions:
  if has_test_rows:
    error_test = y_te - y_test_pred
    y_te_transformed = t_sc.inverse_transform(y_te.reshape(-1, 1))
    y_test_pred_transformed = t_sc.inverse_transform(y_test_pred.reshape(-1, 1)).flatten()
    # Subtract column from column. Mixing the (n, 1) targets with the
    # flattened (n,) predictions would broadcast to an (n, n) matrix as soon
    # as there is more than one test row.
    inv_error_test_transformed = np.abs(y_te_transformed - y_test_pred_transformed.reshape(-1, 1))

  return NRMSE_tr, NMAE_tr, error_test, inv_error_tr_transformed, inv_error_test_transformed, y_te, y_test_pred_transformed, y_train_pred_transformed

# **Inputs/LOOCV Function**

In [None]:
from joblib import Parallel, delayed
def corr_matrix_reduce(x_train, x_test, threshold=0.9):
  """Drop the later column of every highly correlated pair of training columns.

  The pairwise Pearson correlation matrix of ``x_train`` is computed once. For
  every pair of columns ``(i, j)`` with ``i < j`` whose absolute correlation
  exceeds ``threshold``, column ``j`` is dropped. The same columns are removed
  from ``x_test`` unless it is empty.

  Parameters
  ----------
  x_train : pandas.DataFrame
  x_test : pandas.DataFrame
      May be empty, in which case it is returned untouched.
  threshold : float, default 0.9
      Absolute correlation above which the later column of a pair is dropped.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      The reduced training frame and the matching test frame.
  """
  # A single vectorized call instead of one Series.corr call per column pair.
  correlation_matrix = x_train.corr()

  # NaN correlations compare as False, so they never cause a drop.
  exceeds = np.abs(correlation_matrix.to_numpy(dtype=float)) > threshold
  # Keep only the strict upper triangle (i < j): column j is flagged when any
  # earlier column i is too strongly correlated with it.
  drop_mask = np.triu(exceeds, k=1).any(axis=0)
  features_to_drop = correlation_matrix.columns[drop_mask].unique()

  # Drop the features from the original dataset
  x_train_reduced = x_train.drop(columns=features_to_drop)
  if x_test.empty != True:
    x_test = x_test.drop(columns=features_to_drop)

  print("Original features:", x_train.shape[1])
  print("Features to drop:", len(features_to_drop))
  print("Reduced features:", x_train_reduced.shape[1])
  return x_train_reduced, x_test

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
def reduce_df(x_tr, x_te, y_tr, reduction_choice, if_final):
  """Project the inputs onto three components with PLS or PCA.

  ``reduction_choice == "PLS"``: fits a 3-component PLSRegression on
  ``(x_tr, y_tr)`` and returns the train and test scores as DataFrames with
  columns ``PLS1``..``PLS3`` (an empty frame for the test side when ``x_te``
  is empty), plus the fitted reducer.

  ``reduction_choice == "PCA"``: fits a 3-component PCA on ``x_tr`` and
  returns the transformed training data, the transformed test data
  (``x_te`` is passed through unchanged unless ``if_final == "no"``), plus
  the fitted reducer. The fitted mean is printed.

  Any other ``reduction_choice`` falls through and returns ``None``.
  """
  if reduction_choice == "PLS":
    component_names = ["PLS1", "PLS2", "PLS3"]
    pls = PLSRegression(n_components=3)
    # fit_transform returns a pair here; the first item holds the X scores.
    train_scores = pls.fit_transform(x_tr, y_tr)[0]
    test_scores = pd.DataFrame(columns=component_names) if x_te.empty else pls.transform(x_te)
    train_frame = pd.DataFrame(train_scores, columns=component_names)
    test_frame = pd.DataFrame(test_scores, columns=component_names)
    return train_frame, test_frame, pls

  if reduction_choice == "PCA":
    pca = PCA(n_components = 3, random_state=28)
    train_components = pca.fit_transform(x_tr)
    # Only a cross-validation fold ("no") has test rows to project.
    test_components = pca.transform(x_te) if if_final == "no" else x_te
    print("Mean:", pca.mean_)
    return train_components, test_components, pca

In [None]:
def _add_sos_interactions(scaled):
  """Return ``scaled`` plus every column multiplied by "SOS" and by "SOS_Opp_Ch".

  The products are appended as new columns named ``<column>*SOS`` and
  ``<column>*SOS_Opp_Ch``.
  """
  sos_products = scaled.mul(scaled["SOS"], axis=0).add_suffix('*SOS')
  sos_opp_products = scaled.mul(scaled["SOS_Opp_Ch"], axis=0).add_suffix('*SOS_Opp_Ch')
  return pd.concat([scaled, sos_products, sos_opp_products], axis=1)


def get_inputs(data_frame, y, tr_index, te_index, scaler_choice, thresh, if_final):
  """Scale the data, add SOS interaction features and keep the correlated inputs.

  Parameters
  ----------
  data_frame : pandas.DataFrame
      Raw inputs; must contain the columns "SOS" and "SOS_Opp_Ch".
  y : pandas.Series
      Raw target values.
  tr_index : positions of the training rows (ignored when ``if_final == 'yes'``)
  te_index : position of the single held-out row (ignored when ``if_final == 'yes'``)
  scaler_choice : "MMS" selects MinMaxScaler; anything else selects StandardScaler
  thresh : correlation threshold handed to ``correlation``
  if_final : 'yes' trains on every row with no test data; 'no' holds out
      the ``te_index`` row as test data

  Returns
  -------
  tuple of 7 items, in order:
      reduced training inputs, matching test inputs (empty frame for the
      final model), the fitted input scaler, the fitted target scaler, the
      scaled training target, the scaled test target, and the ``'corrs'``
      row returned by ``correlation``.
  """
  if scaler_choice == "MMS":
    scaler = MinMaxScaler()
    scaler2 = MinMaxScaler()
  else:
    scaler = StandardScaler()
    scaler2 = StandardScaler()

  if if_final == 'yes':
    raw_train, raw_test, y_train, y_test = data_frame, pd.DataFrame(), y, pd.DataFrame()
  else:
    raw_train, raw_test, y_train, y_test = data_frame.iloc[tr_index], data_frame.iloc[[te_index]], y.iloc[tr_index], y.iloc[te_index]

  # Both scalers are fitted on the training rows only.
  train_scaled = pd.DataFrame(scaler.fit_transform(raw_train), columns = data_frame.columns)
  train_scaled = _add_sos_interactions(train_scaled)
  y_train = pd.Series(scaler2.fit_transform(y_train.values.reshape(-1, 1)).flatten())

  train_scaled_correlated, correlations_df = correlation(train_scaled, thresh, y_train)
  train_scaled_correlated = pd.DataFrame(train_scaled_correlated)
  # The selection is run a second time on the already filtered columns; this
  # is kept so the resulting column order matches what the notebook produced.
  train_scaled_correlated, correlations_df = correlation(train_scaled_correlated, thresh, y_train)

  if if_final == 'no':
    # Scale the held-out row with the scalers fitted on the training rows.
    y_test = scaler2.transform(pd.Series(y_test).values.reshape(-1, 1)).flatten()
    test_scaled = pd.DataFrame(scaler.transform(raw_test), columns=data_frame.columns)
    test_scaled = _add_sos_interactions(test_scaled)
    test_scaled_correlated = test_scaled.loc[:, train_scaled_correlated.columns] #Test data with only correlated inputs
  else:
    test_scaled_correlated = raw_test

  train_scaled_correlated, test_scaled_correlated = corr_matrix_reduce(train_scaled_correlated, test_scaled_correlated)

  return train_scaled_correlated, test_scaled_correlated, scaler, scaler2, y_train, y_test, correlations_df

In [None]:
def reduce_and_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, reduction_choice, scaler_target, is_final_model, model_choice):
  """Optionally reduce the inputs with PLS/PCA, then dispatch to a model builder.

  Parameters
  ----------
  X_tr_reduced, X_te_reduced : training and test inputs
  Y_tr, Y_te : training and test targets
  reduction_choice : 'PLS' or 'PCA' runs ``reduce_df`` first; any other value
      leaves the inputs as they are
  scaler_target : target scaler forwarded to the model builder
  is_final_model : forwarded to ``reduce_df``
  model_choice : one of 'Ridge', 'Lasso', 'Elastic', 'GBR', 'BR', 'SVR',
      'rbf', 'poly', 'RF'

  Returns
  -------
  Whatever the selected model builder returns.

  Raises
  ------
  ValueError
      If ``model_choice`` is not one of the supported names.
  """
  supported_models = ('Ridge', 'Lasso', 'Elastic', 'GBR', 'BR', 'SVR', 'rbf', 'poly', 'RF')
  # Fail before any fitting work instead of hitting an unbound local later.
  if model_choice not in supported_models:
    raise ValueError(f"Unknown model_choice {model_choice!r}; expected one of {supported_models}")

  if reduction_choice in ('PLS', 'PCA'):
    X_tr_reduced, X_te_reduced, _reducer = reduce_df(X_tr_reduced, X_te_reduced, Y_tr, reduction_choice, is_final_model)

  # The builders are looked up only in the branch that runs, so only the
  # selected one has to be defined.
  if model_choice in ('Ridge', 'Lasso', 'Elastic'):
    return RLE_Model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, model_choice, X_te_reduced, scaler_target)
  if model_choice in ('rbf', 'poly'):
    return SVM_models(X_tr_reduced, X_te_reduced, Y_tr, Y_te, model_choice, X_te_reduced, scaler_target)
  if model_choice == 'GBR':
    return GBR_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, X_te_reduced, scaler_target)
  if model_choice == 'BR':
    return BR_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, X_te_reduced, scaler_target)
  if model_choice == 'SVR':
    return SVR_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, X_te_reduced, scaler_target)
  return RF_model(X_tr_reduced, X_te_reduced, Y_tr, Y_te, X_te_reduced, scaler_target)

In [None]:
from joblib import Parallel, delayed
from sklearn.model_selection import LeaveOneOut
# Define the function that processes each fold of LOO-CV and can make final model
def process_fold(train_index, test_index, X, y, reduce_choice, corr_thresh, scaling_choice, modeling_choice, top_n=53):
  """Run one leave-one-out fold: prepare the inputs, keep the top features, fit a model.

  Parameters
  ----------
  train_index : positions of the training rows
  test_index : positions of the held-out rows; only the first one is used
  X, y : full inputs and target
  reduce_choice : forwarded to ``reduce_and_model`` ('PLS', 'PCA' or other)
  corr_thresh : correlation threshold forwarded to ``get_inputs``
  scaling_choice : "MMS" for MinMaxScaler; any other value for StandardScaler
  modeling_choice : model name forwarded to ``reduce_and_model``
  top_n : int, default 53
      Maximum number of features kept, ranked by absolute correlation.

  Returns
  -------
  The result of ``reduce_and_model`` for this fold, or ``None`` when no
  correlated column is available.
  """
  # get_inputs only tests for "MMS", so the choice can be passed straight through.
  X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(
      X, y, train_index, test_index[0], scaling_choice, corr_thresh, 'no')

  # Keep X_train's column order. A set intersection would make the order, and
  # with it the ranking of tied correlations, depend on the string hash seed.
  common_columns = [col for col in X_train.columns if col in correlations_df.columns]
  if not common_columns:
      # Handle the case where there are no correlated columns
      return None

  # Stack the 'corrs' row under the data, then transpose so each feature is a
  # row that can be ranked by its correlation.
  X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
  X_train_with_corrs = X_train_with_corrs.transpose()
  X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
  X_train_with_corrs = X_train_with_corrs.head(top_n)
  X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
  X_train_reduced = X_train_reduced.transpose()
  X_train_reduced.reset_index(drop = True, inplace = True)
  X_test_reduced = pd.DataFrame(X_test, columns=X_train_reduced.columns)

  model = reduce_and_model(X_train_reduced, X_test_reduced, Y_train, Y_test, reduce_choice, scalerY, 'no', modeling_choice)

  return model #Return the model for each fold

# **PPG**

**Lasso Folds:**
*   AVG training Normalized RMSE: 0.03
*   AVG training Normalized MAE: 0.03
*   Test Normalized RMSE: 0.44
*   Test Normalized MAE: 0.05
*   AVG of avg inv transformed train error from folds: 0.7
*   AVG inv transformed test error: 1.5
*   Range of predictions (inv transformed): 25.2

**Lasso Final** (correlation threshold 0.49, MinMaxScaler, top 53 features):
*   Best trial:
  Params: {'alpha': 0.010000190200792278, 'max_iter': 2600, 'tol': 0.0022029157857933754}
*   Normalized RMSE: 0.064
*   Normalized MAE: 0.055
*   avg inv transformed accuracy: 1.5
*   Range of predictions (inv transformed): 21.7

In [None]:
# Final-model inputs for points per game: every row is used for training.
X = inputs
y = playoff_stats['Pts/GM_Playoffs']#Pts/GM_Playoffs, oPts/GM_Playoffs
len_df = len(X)
train_index = list(range(len_df))
test_index = list(range(1))

# if_final='yes' makes get_inputs train on the whole frame and return empty test data.
X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], "MMS", 0.49, 'yes')
# NOTE(review): a set intersection has no guaranteed order for strings, so the
# column order here can differ between kernel sessions; it is re-sorted by 'corrs' below.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
# Stack the 'corrs' row under the data and transpose so features become rows.
X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
X_train_with_corrs = X_train_with_corrs.transpose()
# Rank features by absolute correlation and keep the first 53.
X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
X_train_with_corrs = X_train_with_corrs.head(53)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
X_train_reduced = X_train_reduced.transpose()
X_train_reduced.reset_index(drop = True, inplace = True)

In [None]:
# Rows to predict for; their columns decide which training inputs get scaled.
PPG_predict = pd.read_csv('PPG_Inputs_b4_chip1.csv')

PPG_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()
# Fit the input scaler on the training columns that also appear in the prediction file.
trained_features_to_scale = X[PPG_predict.columns]
trained_features_scaled = pd.DataFrame(PPG_scaler.fit_transform(trained_features_to_scale), columns = trained_features_to_scale.columns)

# Apply the training-fitted scaling to the prediction rows.
PPG_predict_scaled = pd.DataFrame(PPG_scaler.transform(PPG_predict), columns = PPG_predict.columns)
# Fit target_scaler on the current y.
PPG_playoffs_scaled = pd.Series(target_scaler.fit_transform(y.values.reshape(-1, 1)).flatten())

In [None]:
# Rebuild the interaction features used in training for the prediction rows.
# get_inputs forms '<col>*SOS' as col * SOS and '<col>*SOS_Opp_Ch' as
# col * SOS_Opp_Ch, so each suffix has to be multiplied by its own column.

# Columns whose product with "SOS" is needed.
ppg_sos_bases = [
    'Pts/GM_bfour_Ch_Playoffs',
    'Seed_Playoffs',
    'Conference Leader PTS/GM - PTS/GM_Off',
    'oPts/GM_bfour_Ch_Playoffs',
    'PTS/GM*FG%_Off',
    'oPPG/SOS_Opp_1st',
    'Conference Leader FT% - FT%_Off_Opp_Ch',
    'Point Diff/GM_bfour_Ch_Playoffs',
    '2P%*AST/GM_Off_Opp_Ch',
]

# Columns whose product with "SOS_Opp_Ch" is needed.
ppg_sos_opp_bases = [
    'Seed_Playoffs',
    'Conference Leader PTS/GM - PTS/GM_Off',
    'National Leader Point Diff - Point Diff_Off',
    'Conference Leader AST/GM - AST/GM_Off',
    'Conference Leader AST/GM - AST/GM_Off_Opp_Ch',
    'PTS/GM Nat. Rank * FG% Nat. Rank_Off',
    'PTS/AST_Off_Opp_Ch',
    ' PTS/GM Conference Rank_Off',
    'AST/GM National Rank_Off_Opp_Ch',
    'Conference Leader FG% - FG%_Off',
    'Pts/GM_bfour_Ch_Playoffs',
    'PTS/GM Nat. Rank * 3PM/GM Nat. Rank_Off',
    'AST/GM National Rank_Off',
    ' AST/GM Conference Rank_Off',
    'PTS/GM National Rank_Off',
    'Point Diff/GM_bfour_Ch_Playoffs',
    'FG% National Rank_Off',
]

ppg_sos_scaled = PPG_predict_scaled['SOS']
if 'SOS_Opp_Ch' in PPG_predict_scaled.columns:
  ppg_sos_opp_scaled = PPG_predict_scaled['SOS_Opp_Ch']
else:
  # Without the column the training-consistent product cannot be formed;
  # keep the previous behaviour (multiply by SOS) but say so.
  print("Warning: 'SOS_Opp_Ch' is missing from PPG_predict_scaled; '*SOS_Opp_Ch' features fall back to 'SOS'.")
  ppg_sos_opp_scaled = ppg_sos_scaled

for base in ppg_sos_bases:
  PPG_predict_scaled[base + '*SOS'] = PPG_predict_scaled[base] * ppg_sos_scaled

for base in ppg_sos_opp_bases:
  PPG_predict_scaled[base + '*SOS_Opp_Ch'] = PPG_predict_scaled[base] * ppg_sos_opp_scaled

In [None]:
# Only Parallel and delayed are imported from joblib elsewhere in the
# notebook, so the module itself has to be imported before joblib.load.
import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
PPG_model = joblib.load('PPG_Lasso_.pkl')

# Keep exactly the features the model was fitted on, in the fitted order.
trained_columns = PPG_model.feature_names_in_
PPG_predict_scaled = PPG_predict_scaled[trained_columns]
PPG_predictions = PPG_model.predict(PPG_predict_scaled)

In [None]:
# Map the scaled predictions back to the target's original scale.
# NOTE(review): PPG_predictions is overwritten here, so re-run the prediction
# cell before re-running this one.
PPG_predictions = target_scaler.inverse_transform(PPG_predictions.reshape(-1, 1))
PPG_predictions = pd.Series(PPG_predictions.flatten())
PPG_predictions.values

array([81.80213449, 71.77994519])

# **oPPG**

**Lasso Folds:**
*   AVG training Normalized RMSE: 0.09
*   AVG training Normalized MAE: 0.07
*   Test Normalized RMSE: 0.34
*   Test Normalized MAE: 0.08
*   AVG of avg inv transformed train error from folds: 1.2
*   AVG inv transformed test error: 1.4
*   Range of predictions (inv transformed): 13.0

**Lasso Final:**
*   Best trial:
  Params: {'alpha': 0.010001255702060242, 'max_iter': 2700, 'tol': 0.00044933505675711976}
*   Normalized RMSE: 0.091
*   Normalized MAE: 0.070
*   avg inv transformed accuracy: 1.2
*   Range of predictions (inv transformed): 13.2

In [None]:
# Final-model inputs for opponent points per game: every row is used for training.
X = inputs
y = playoff_stats['oPts/GM_Playoffs']#Pts/GM_Playoffs, oPts/GM_Playoffs
len_df = len(X)
train_index = list(range(len_df))
test_index = list(range(1))

# if_final='yes' makes get_inputs train on the whole frame and return empty test data.
X_train, X_test, scalerPPG, scalerY, Y_train, Y_test, correlations_df = get_inputs(X, y, train_index, test_index[0], "MMS", 0.4, 'yes')
# NOTE(review): a set intersection has no guaranteed order for strings, so the
# column order here can differ between kernel sessions; it is re-sorted by 'corrs' below.
common_columns = list(set(X_train.columns).intersection(correlations_df.columns))
# Stack the 'corrs' row under the data and transpose so features become rows.
X_train_with_corrs = pd.concat([X_train[common_columns], correlations_df[common_columns]])
X_train_with_corrs = X_train_with_corrs.transpose()
# Rank features by absolute correlation and keep the first 53.
X_train_with_corrs = X_train_with_corrs.sort_values(by='corrs', ascending = False, key=abs)
X_train_with_corrs = X_train_with_corrs.head(53)
X_train_reduced = X_train_with_corrs.drop('corrs', axis = 1) #drop corrs column
X_train_reduced = X_train_reduced.transpose()
X_train_reduced.reset_index(drop = True, inplace = True)

In [None]:
# Rows to predict for; their columns decide which training inputs get scaled.
oPPG_predict = pd.read_csv('oPPG_Inputs_b4_chip.csv')

oPPG_scaler = MinMaxScaler()
# NOTE(review): this rebinds target_scaler, replacing the scaler fitted for the PPG target.
target_scaler = MinMaxScaler()
# Fit the input scaler on the training columns that also appear in the prediction file.
trained_features_to_scale = X[oPPG_predict.columns]
trained_features_scaled = pd.DataFrame(oPPG_scaler.fit_transform(trained_features_to_scale), columns = trained_features_to_scale.columns)

# Apply the training-fitted scaling to the prediction rows.
oPPG_predict_scaled = pd.DataFrame(oPPG_scaler.transform(oPPG_predict), columns = oPPG_predict.columns)
# Fit target_scaler on the current y.
oPPG_playoffs_scaled = pd.Series(target_scaler.fit_transform(y.values.reshape(-1, 1)).flatten())

In [None]:
# Rebuild the interaction features used in training for the prediction rows.
# get_inputs forms '<col>*SOS' as col * SOS and '<col>*SOS_Opp_Ch' as
# col * SOS_Opp_Ch, so each suffix has to be multiplied by its own column.

# Columns whose product with "SOS" is needed.
oppg_sos_bases = [
    'oPts/GM_bfour_Ch_Playoffs',
    'National Leader FG% - FG%_Off_Opp_Ch',
    'Pts/GM_bfour_Ch_Playoffs',
    'AST/GM National Rank_Off_Opp_Ch',
    'Conference Leader PTS/GM - PTS/GM_Off',
    'FGA/AST_Off_Opp_Ch',
    '2PM/GM*2P%_Off_Opp_Ch',
    'STL/GM Conference Rank_Def_Opp_Ch',
    '3P%*AST/3PA_Off_Opp_Ch',
    'Conference Leader AST/GM - AST/GM_Off_Opp_Ch',
    ' PTS/GM Conference Rank_Off',
    'PORPAG_BP/Team GM_Opp_Ch',
    'FG% National Rank_Off_Opp_Ch',
    'oPTS/TO_Def_Opp_Ch',
    'ThreePointFieldGoals Attempted/Team GM_BP/Team GM',
    'ThreePointFieldGoals Attempted/Team GM_BP/Team GM_Opp_Ch',
    'FreeThrows Made/Team GM_BP/Team GM_Opp_Ch',
    'FieldGoals Attempted/Team GM_BP/Team GM_Opp_Ch',
    'FieldGoals Pct/MP_BP/MP',
]

# Columns whose product with "SOS_Opp_Ch" is needed.
oppg_sos_opp_bases = [
    'oPts/GM_bfour_Ch_Playoffs',
    'National Leader FG% - FG%_Off_Opp_Ch',
    'Pts/GM_bfour_Ch_Playoffs',
    'AST/GM National Rank_Off_Opp_Ch',
    ' FG% Conference Rank_Off',
    '2PM/GM*2P%_Off_Opp_Ch',
    'STL/GM Conference Rank_Def_Opp_Ch',
    'oPTS/TO_Def_Opp_Ch',
    'ThreePointFieldGoals Attempted/Team GM_BP/Team GM',
    'ThreePointFieldGoals Attempted/Team GM_BP/Team GM_Opp_Ch',
    'AST/FGA * FG%_Off_Opp_Ch',
    'oPTS/oFTM_Def',
]

oppg_sos_scaled = oPPG_predict_scaled['SOS']
if 'SOS_Opp_Ch' in oPPG_predict_scaled.columns:
  oppg_sos_opp_scaled = oPPG_predict_scaled['SOS_Opp_Ch']
else:
  # Without the column the training-consistent product cannot be formed;
  # keep the previous behaviour (multiply by SOS) but say so.
  print("Warning: 'SOS_Opp_Ch' is missing from oPPG_predict_scaled; '*SOS_Opp_Ch' features fall back to 'SOS'.")
  oppg_sos_opp_scaled = oppg_sos_scaled

for base in oppg_sos_bases:
  oPPG_predict_scaled[base + '*SOS'] = oPPG_predict_scaled[base] * oppg_sos_scaled

for base in oppg_sos_opp_bases:
  oPPG_predict_scaled[base + '*SOS_Opp_Ch'] = oPPG_predict_scaled[base] * oppg_sos_opp_scaled

In [None]:
# Only Parallel and delayed are imported from joblib elsewhere in the
# notebook, so the module itself has to be imported before joblib.load.
import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
oPPG_model = joblib.load('oPPG_Lasso_.pkl')

# Keep exactly the features the model was fitted on, in the fitted order.
trained_columns = oPPG_model.feature_names_in_
oPPG_predict_scaled = oPPG_predict_scaled[trained_columns]
oPPG_predictions = oPPG_model.predict(oPPG_predict_scaled)

In [None]:
# Map the scaled predictions back to the target's original scale.
# NOTE(review): oPPG_predictions is overwritten here, so re-run the prediction
# cell before re-running this one.
oPPG_predictions = target_scaler.inverse_transform(oPPG_predictions.reshape(-1, 1))
oPPG_predictions = pd.Series(oPPG_predictions.flatten())
oPPG_predictions.values

array([69.47963684, 61.02079017])