<a href="https://colab.research.google.com/github/ConorD28/Baseball/blob/main/MLB_Playoffs_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.decomposition import PCA
%matplotlib inline
# Load the two CSV inputs. `df_NP` is later passed to get_inputs() as the
# candidate feature frame; `df` supplies the target column.
df_NP = pd.read_csv('inputs.csv')
df = pd.read_csv('playoff_stats.csv')

# Total number of missing values in each frame. Only the last expression of a
# cell is echoed, so the df_NP count below is computed but not displayed.
df_NP.isnull().sum().sum() #Check if there are NA values
df.isnull().sum().sum()

# **Correlation/Scores**

In [None]:
import scipy.stats
def correlation(dataset, threshold, target): #Function to get Pearson's correlation between input and target
  """Return the columns of `dataset` whose |Pearson r| with `target` exceeds `threshold`.

  The name of every qualifying column and its correlation (rounded to three
  decimals) are printed as a side effect.

  Args:
    dataset: DataFrame of candidate input columns.
    threshold: Absolute correlation a column must strictly exceed to be kept.
    target: Series every column is correlated against.

  Returns:
    A list of pandas Series (one per qualifying column, in column order).

  Note:
    `Series.corr` aligns both Series on their index labels before computing,
    so `dataset` and `target` must carry the same index; rows whose labels do
    not match are silently left out of the calculation.
  """
  data = []
  # .items() walks the columns by position, so duplicate column names are
  # still visited one at a time (same as the former iloc[:, i] loop).
  for col, series in dataset.items():
      cor2 = series.corr(target) #scipy.stats.spearmanr(x, y)[0] and scipy.stats.kendalltau(x, y)[0]
      # A NaN correlation compares False here and is therefore skipped.
      if abs(cor2) > threshold:
        data.append(series) #make list of columns that meet the threshold
        print(col)
        print(round(cor2,3))
  return data

In [None]:
from sklearn.model_selection import train_test_split
from numpy.random.mtrand import random_sample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
def Scores(y, y_pred, y_full):
  """Print and return the average normalized error of `y_pred` against `y`.

  RMSE and MAE are each expressed as a percentage of the mean of `y_full`;
  the score is the mean of those two percentages. The plain mean absolute
  difference between `y_pred` and `y` is printed as "Avg. Error".

  Args:
    y: True target values for the rows being scored.
    y_pred: Predictions for the same rows.
    y_full: Target values whose mean is used as the normalizer.

  Returns:
    The average normalized score, in percent.
  """
  squared_err = mean_squared_error(y, y_pred)
  abs_err = mean_absolute_error(y, y_pred)
  # Both error measures are scaled by the mean of the full target.
  nrmse_pct = (np.sqrt(squared_err)/np.mean(y_full))*100
  nmae_pct = (abs_err/np.mean(y_full))*100
  score = (nrmse_pct + nmae_pct)/2
  mean_gap = (sum(abs(y_pred-y)))/len(y_pred)
  print(f'Avg. Normalized Score:{ score:.1f}%')
  print(f'Avg. Error:{mean_gap:.4f}')
  return score

# **ML Algorithms**

In [None]:
def RLE_Model(X, y, choice, predict_df): #Function to run Ridge, Lasso, or ElasticNet model
  """Fit a cross-validated Ridge, Lasso or ElasticNet model and print its scores.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    choice: "Ridge", "Lasso" or "Elastic".
    predict_df: Feature rows to predict once the model is fitted.

  Returns:
    Tuple (y_test_pred, y_train_pred, predictions), where `predictions` are
    the model outputs for `predict_df`.

  Raises:
    ValueError: If `choice` is not one of the three supported names.
  """
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]

  # One regularisation grid shared by all three estimators.
  alphas = np.geomspace(1e-10, 1e10, num=100)
  if choice == "Ridge":
    estimator = RidgeCV(alphas=alphas)
  elif choice == "Lasso":
    estimator = LassoCV(alphas=alphas)
  elif choice == "Elastic":
    l1_ratio = [0, 0.3, 0.5, 0.7, 0.9, 1]
    estimator = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratio, max_iter=100000)
  else:
    # An unknown choice used to fall through every branch and crash later on
    # an unbound `pipeline`; fail early with a readable message instead.
    raise ValueError(f'choice must be "Ridge", "Lasso" or "Elastic", got {choice!r}')

  pipeline = make_pipeline(estimator)
  pipeline.fit(X_train, y_train)
  # The fitted estimator is pipeline.steps[0][1] (alpha_, intercept_, coef_).

  #Calculate the predicted values:
  y_train_pred = pipeline.predict(X_train)
  print()
  y_test_pred = pipeline.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)

  #Test Predictions:
  print("Test predictions:")
  print(y_test_pred)

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.2f}%')
  print()

  #Predict:
  predictions = pipeline.predict(predict_df)

  return y_test_pred, y_train_pred, predictions

In [None]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

def GBR_model(X,y, t, l, n, predict_df):
  """Fit a GradientBoostingRegressor, print train/test scores and predict `predict_df`.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    t: Value passed as `tol`.
    l: Value passed as `learning_rate`.
    n: Value passed as `n_estimators`.
    predict_df: Feature rows to predict once the model is fitted.

  Returns:
    Tuple (y_test_pred, y_train_pred, predictions).
  """
  train_rows, test_rows = slice(10, 30), slice(0, 10)
  X_train, y_train = X[train_rows], y[train_rows]
  X_test, y_test = X[test_rows], y[test_rows]

  # random_state is pinned so repeated runs give the same ensemble.
  booster = GradientBoostingRegressor(tol=t, learning_rate=l, n_estimators=n, random_state=0)
  booster.fit(X_train, y_train)
  y_train_pred = booster.predict(X_train)
  y_test_pred = booster.predict(X_test)

  train_score = Scores(y_train, y_train_pred, y)

  print("Test predictions:")
  print(y_test_pred)

  test_score = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ test_score - train_score:.2f}%')
  print()

  return y_test_pred, y_train_pred, booster.predict(predict_df)

In [None]:
from sklearn.ensemble import BaggingRegressor
import xgboost as xgb
import pickle

def BR_model(X,y, choice, model_path="BR_model RA_Batter2"):
  """Fit a BaggingRegressor, print train/test scores and pickle the fitted model.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.
  No random_state is set, so results differ from run to run.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    choice: "Boost" bags xgb.XGBRegressor estimators; any other value uses
      BaggingRegressor's default base estimator.
    model_path: File the fitted model is pickled to. Defaults to the file
      name that used to be hard-coded, so existing calls behave as before.

  Returns:
    Tuple (y_test_pred, y_train_pred).
  """
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]
  if choice == "Boost":
    reg = BaggingRegressor(estimator=xgb.XGBRegressor()) #
  else:
    reg = BaggingRegressor()
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)

  #Predictions:
  print("Test predictions:")
  print(y_test_pred)

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.2f}%')
  print()

  # The context manager closes the file even if pickling fails; the previous
  # bare open() call never closed its handle.
  with open(model_path, "wb") as model_file:
    pickle.dump(reg, model_file)
  return y_test_pred, y_train_pred

In [None]:
from sklearn.ensemble import RandomForestRegressor

def RF_model(X,y, criterion, trees, model_path="RF_model RA_Batter"):
  """Fit a RandomForestRegressor, print train/test scores and pickle the fitted model.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.
  No random_state is set, so results differ from run to run.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    criterion: "absolute", "friedman" or "poisson" select the matching
      scikit-learn split criterion; any other value keeps the default.
    trees: Number of trees (`n_estimators`).
    model_path: File the fitted model is pickled to. Defaults to the file
      name that used to be hard-coded, so existing calls behave as before.

  Returns:
    Tuple (y_test_pred, y_train_pred).
  """
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]

  # The original branch tested the misspelling "aboslute", so callers passing
  # "absolute" silently got the default criterion. Both spellings are accepted
  # so any caller relying on the old one keeps working.
  criterion_by_name = {
    "absolute": "absolute_error",
    "aboslute": "absolute_error",
    "friedman": "friedman_mse",
    "poisson": "poisson",
  }
  forest_kwargs = {"n_estimators": trees}
  if criterion in criterion_by_name:
    forest_kwargs["criterion"] = criterion_by_name[criterion]
  reg = RandomForestRegressor(**forest_kwargs)

  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)

  #Predictions:
  print("Test predictions:")
  print(y_test_pred)

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.2f}%')
  print()

  # The context manager closes the file even if pickling fails; the previous
  # bare open() call never closed its handle.
  with open(model_path, "wb") as model_file:
    pickle.dump(reg, model_file)
  return y_test_pred, y_train_pred

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#SGD Regressor:
def SGD_model(X,y, t, ep, model_path="SGD_model RA_Batter"):
  """Fit an SGDRegressor, print train/test scores and pickle the fitted pipeline.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    t: Value passed as `tol`.
    ep: Value passed as `epsilon`.
    model_path: File the fitted pipeline is pickled to. Defaults to the file
      name that used to be hard-coded, so existing calls behave as before.

  Returns:
    Tuple (y_test_pred, y_train_pred).
  """
  # NOTE(review): the loss is left at its default; scikit-learn documents
  # `epsilon` as applying only to the huber / epsilon-insensitive losses, so
  # `ep` presumably has no effect here - confirm.
  reg = make_pipeline(SGDRegressor(max_iter=1000, tol=t, epsilon = ep)) #tol = 0.001, epsilon=0.1
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)

  #Predictions:
  print("Test predictions:")
  print(y_test_pred)

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.2f}%')
  print()

  # The context manager closes the file even if pickling fails; the previous
  # bare open() call never closed its handle.
  with open(model_path, "wb") as model_file:
    pickle.dump(reg, model_file)
  return y_test_pred, y_train_pred

In [None]:
#Keras Sequential Neural Net
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
# Shared callback: stop training once the training loss has not improved for
# two consecutive epochs.
early_stop = EarlyStopping(monitor='loss', patience=2, mode='min', verbose=1)

def Keras_model(X,y,e, u, u2, u3, u4, u5, choice):
  """Train a dense feed-forward regressor, print train/test scores and save it.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.
  The network has six ReLU hidden layers (u, u2, u3, u4, u5 and 15 units)
  followed by a single linear output unit, and is trained on mean squared
  error with the module-level `early_stop` callback.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    e: Maximum number of epochs.
    u, u2, u3, u4, u5: Unit counts of the first five hidden layers.
    choice: Optimizer passed to `model.compile`.

  Returns:
    None. The trained model is written to a fixed .h5 path on every call.
  """
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]

  model = Sequential()
  # The first hidden layer also fixes the input width.
  model.add(Dense(units=u, input_dim=X_train.shape[1], activation='relu'))
  for width in (u2, u3, u4, u5, 15):
    model.add(Dense(units=width, activation='relu'))
  model.add(Dense(units=1))  # linear output
  model.compile(loss='mean_squared_error', optimizer=choice)
  model.fit(X_train, y_train, verbose=0, epochs=e, callbacks=[early_stop])

  # predict() returns a column vector; flatten to 1-D for scoring.
  y_train_pred = model.predict(X_train).flatten()
  y_test_pred = model.predict(X_test).flatten()

  train_score = Scores(y_train, y_train_pred, y)

  print("Test predictions:")
  print(y_test_pred)

  test_score = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ test_score - train_score:.2f}%')
  print()

  model.save('/content/drive/MyDrive/Models/Keras_Model.h5')

In [None]:
from sklearn.tree import DecisionTreeRegressor

def DTR_model(X,y,leafs):
  """Fit a DecisionTreeRegressor and print its train/test scores.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    leafs: Value passed as `min_samples_leaf`; larger values regularize the
      tree more strongly.

  Returns:
    None; results are only printed.
  """
  train_rows, test_rows = slice(10, 30), slice(0, 10)
  X_train, y_train = X[train_rows], y[train_rows]
  X_test, y_test = X[test_rows], y[test_rows]

  tree = DecisionTreeRegressor(min_samples_leaf=leafs, random_state=42)
  tree.fit(X_train, y_train)
  y_train_pred = tree.predict(X_train)
  y_test_pred = tree.predict(X_test)

  train_score = Scores(y_train, y_train_pred, y)

  print("Test predictions:")
  print(y_test_pred)

  test_score = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ test_score - train_score:.2f}%')
  print()

In [None]:
from sklearn.svm import LinearSVR

def SVM_model(X,y,ep, predict_df):
  """Fit a LinearSVR, print train/test scores and predict `predict_df`.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    ep: Value passed as `epsilon`.
    predict_df: Feature rows to predict once the model is fitted.

  Returns:
    Tuple (y_test_pred, y_train_pred, final_preds).
  """
  train_rows, test_rows = slice(10, 30), slice(0, 10)
  X_train, y_train = X[train_rows], y[train_rows]
  X_test, y_test = X[test_rows], y[test_rows]

  linear_svr = LinearSVR(random_state=42, epsilon=ep)
  linear_svr.fit(X_train, y_train)

  # Score the training rows first, then the held-out rows.
  y_train_pred = linear_svr.predict(X_train)
  train_score = Scores(y_train, y_train_pred, y)

  y_test_pred = linear_svr.predict(X_test)
  print("Test predictions:")
  print(y_test_pred)

  test_score = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ test_score - train_score:.2f}%')
  print()

  return y_test_pred, y_train_pred, linear_svr.predict(predict_df)

In [None]:
from sklearn.svm import SVR

def SVM_models(X,y, choice, ep, C_value, predict_df):
  """Fit a kernel SVR, print train/test scores and predict `predict_df`.

  Rows 10:30 of `X`/`y` are used for training and rows 0:10 for testing.

  Args:
    X: Feature table, sliceable by row position.
    y: Target values, row-aligned with `X`.
    choice: Kernel name: "rbf", "poly" or "linear".
    ep: Value passed as `epsilon`.
    C_value: Value passed as `C`.
    predict_df: Feature rows to predict once the model is fitted.

  Returns:
    Tuple (y_test_pred, y_train_pred, final_preds).

  Raises:
    ValueError: If `choice` is not one of the three supported kernels.
  """
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]

  if choice == "rbf":
    model = SVR(kernel="rbf", C=C_value, gamma=0.1, epsilon=ep)
  elif choice in ("poly", "linear"):
    # Both kernels were configured identically apart from the kernel name.
    model = SVR(kernel=choice, C=C_value, gamma="auto", degree=3, epsilon=ep, coef0=1)
  else:
    # An unknown choice used to fall through every branch and crash later on
    # an unbound `model`; fail early with a readable message instead.
    raise ValueError(f'choice must be "rbf", "poly" or "linear", got {choice!r}')
  model.fit(X_train, y_train)

  #Train Predictions:
  y_train_pred = model.predict(X_train)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)

  #Test Predictions:
  y_test_pred = model.predict(X_test)
  print("Test predictions:")
  print(y_test_pred)

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training score, used to judge the bias-variance tradeoff.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.2f}%')
  print()

  final_preds = model.predict(predict_df)

  return y_test_pred, y_train_pred, final_preds

# **Inputs**

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

In [None]:
def get_inputs(data_frame, full_data_frame, y_col, scaler_choice, thresh):
  """Select the features most correlated with the target and return them scaled.

  Feature selection uses the training rows only (positions 10:30). The chosen
  columns are then scaled with a scaler fitted on the training rows and
  applied to the test rows (positions 0:10).

  Args:
    data_frame: Candidate input columns, one row per observation.
    full_data_frame: Frame holding the target column, row-aligned with
      `data_frame`.
    y_col: Name of the target column in `full_data_frame`.
    scaler_choice: "MinMax" selects MinMaxScaler; anything else StandardScaler.
    thresh: Absolute correlation a column must exceed to be kept.

  Returns:
    Tuple (correlated_scaled_data, scaler). The frame holds the 10 scaled
    test rows followed by the 20 scaled training rows; `scaler` is fitted on
    the selected columns of the training rows.

  Raises:
    ValueError: If no column exceeds `thresh`.
  """
  #Feature Importance:
  if scaler_choice == "MinMax":
    scaler = MinMaxScaler()
  else:
    scaler = StandardScaler()

  data_scaled = pd.DataFrame(scaler.fit_transform(data_frame[10:30]), columns = data_frame.columns)
  # Series.corr aligns on index labels. The scaled frame is labelled 0..19
  # while the raw target slice keeps its original labels, so without the
  # reset only part of the rows overlapped and they were paired with the
  # wrong targets. Resetting makes the pairing positional.
  train_target = full_data_frame[10:30][y_col].reset_index(drop=True)
  data_correlated = correlation(data_scaled, thresh, train_target) #.115 lowest for All csv
  selected_columns = [column.name for column in data_correlated]
  if not selected_columns:
    raise ValueError(f'No column has |correlation| above {thresh} with {y_col!r}')

  #Train test split and scale:
  X = data_frame.loc[:, selected_columns] #get non scaled data with important features
  X_train, X_test = X[10:30], X[0:10]
  X_train_processed = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
  X_test_processed = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
  # Stack test rows on top of training rows. An outer merge on every column
  # is a join, not a concatenation: it may reorder rows by key and collapse a
  # row present in both frames, which would break the positional 0:10 / 10:30
  # split every model function relies on.
  correlated_scaled_data = pd.concat([X_test_processed, X_train_processed], ignore_index=True)

  print(len(X.columns))
  return correlated_scaled_data, scaler

In [None]:
def reduce_df(data, reduction_choice):
  """Reduce `data` to two components with PCA or Locally Linear Embedding.

  The reducer is fitted on the training rows (positions 10:30) and then
  applied to the test rows (positions 0:10).

  Args:
    data: Scaled feature frame with the test rows first, then the training rows.
    reduction_choice: "PCA" selects PCA; any other value selects LLE.

  Returns:
    Tuple (X, pca, lle). `X` holds the reduced test rows followed by the
    reduced training rows. Only the reducer that was selected is fitted; the
    other one is returned unfitted.
  """
  X_train, X_test = data[10:30], data[0:10]

  pca=PCA(n_components = 2, random_state=0)
  lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5, random_state=42)

  use_pca = reduction_choice == "PCA"
  reducer = pca if use_pca else lle

  train_reduced = pd.DataFrame(reducer.fit_transform(X_train))
  test_reduced = pd.DataFrame(reducer.transform(X_test))
  # Stack test rows on top of training rows. An outer merge on every column
  # is a join, not a concatenation: it may reorder rows by key and collapse
  # duplicate rows, which would break the positional 0:10 / 10:30 split.
  X = pd.concat([test_reduced, train_reduced], ignore_index=True)

  if use_pca:
    print("Principal axes:\n", pca.components_.tolist())
    print("Explained variance:\n", pca.explained_variance_.tolist())
    print("Mean:", pca.mean_)

  return X, pca, lle

In [None]:
import warnings
# Silence RuntimeWarning messages for every cell run after this one.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# Convert every column to a numeric dtype; values that cannot be parsed become
# NaN (errors='coerce'). This rebinds df_NP, so later cells see the coerced frame.
df_NP = df_NP.apply(pd.to_numeric, errors='coerce')

In [None]:
# MinMax-scaled feature set for the RA/Batter target: keeps the columns whose
# absolute correlation with the target exceeds .855 and returns the fitted scaler.
inputs_MMS, scaler_RA_Batter_MMS = get_inputs(df_NP, df, "RA/BatterPitching/More In Playoffs", "MinMax", .855) #MinMax,

In [None]:
# Same selection with the other scaler: any scaler_choice other than "MinMax"
# (here the empty string) makes get_inputs use StandardScaler.
inputs, scaler_RA_Batter = get_inputs(df_NP, df, "RA/BatterPitching/More In Playoffs", "", .855)

In [None]:
# Two-component PCA of the MinMax-scaled features. [0] keeps only the reduced
# frame; the reducer objects reduce_df also returns are discarded here.
inputs_MMS_PCA = reduce_df(inputs_MMS, "PCA")[0]

In [None]:
# Two-component PCA of the standard-scaled features (reduced frame only).
inputs_PCA = reduce_df(inputs, "PCA")[0]

In [None]:
# Any reduction_choice other than "PCA" (here the empty string) selects
# Locally Linear Embedding; keep only the reduced frame.
inputs_LLE = reduce_df(inputs, "")[0]

In [None]:
# Locally Linear Embedding of the MinMax-scaled features (reduced frame only).
inputs_MMS_LLE = reduce_df(inputs_MMS, "")[0]

In [None]:
# Target series for every model below; this is the same column that was
# passed to get_inputs for feature selection.
Y = df["RA/BatterPitching/More In Playoffs"] #RA/GMPitching/More In Playoffs

# **Models**

In [None]:
# Baseline sweep on the standard-scaled features (no dimensionality reduction).
# Every call prints its own train/test scores; the return values are discarded.
# Passing `inputs` again as predict_df just re-predicts the same rows.
RLE_Model(inputs, Y, "Ridge", inputs)
RLE_Model(inputs, Y, "Lasso", inputs)
SGD_model(inputs, Y, 1e-3, 0.1) #
print("GBR and DTR:")
GBR_model(inputs, Y, .0001, 0.01, 100, inputs) #
# NOTE(review): leafs=100 exceeds the 20 training rows (10:30), so the tree
# presumably cannot split at all - confirm this is intended.
DTR_model(inputs, Y, 100) #
print("Random Forest:")
# Each RF_model call pickles to the same file name, so only the last fitted
# forest remains on disk after this cell.
RF_model(inputs, Y, "absolute", 100) #
RF_model(inputs, Y, "friedman", 100) #
RF_model(inputs, Y, "poisson", 100) #
RF_model(inputs, Y, "", 100) #

In [None]:
# Kernel SVR with rbf, poly and linear kernels (epsilon .1, C 100), followed
# by LinearSVR with epsilon 0.0, all on the standard-scaled features.
SVM_models(inputs, Y, "rbf", .1, 100, inputs) #
SVM_models(inputs, Y, "poly", .1, 100, inputs) #
SVM_models(inputs, Y, "linear", .1, 100, inputs)
SVM_model(inputs,Y, 0.0, inputs) #

In [None]:
# ElasticNetCV on the standard-scaled features, kept in its own cell apart
# from the Ridge/Lasso sweep.
RLE_Model(inputs, Y, "Elastic", inputs) #

In [None]:
# Bagging (XGBoost base estimator, then the default). Both calls pickle to the
# same file name, so the second model overwrites the first on disk.
BR_model(inputs,Y, "Boost") #
BR_model(inputs,Y, "") #

# float32 copies of the features and target for Keras. `inputs2` is rebound
# again in later cells, so its contents depend on which cell ran last.
inputs2 = inputs.astype(np.float32)
Y2 = Y.astype(np.float32)
# Same architecture with three optimizers; every call saves to the same .h5 path.
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "adamax") #
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "adam") #
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "nadam") #

**PCA:**

In [None]:
# Same sweep as for the plain features, run on the two PCA components.
# Every call prints its own train/test scores; the return values are discarded.
RLE_Model(inputs_PCA, Y, "Ridge", inputs_PCA) #
RLE_Model(inputs_PCA, Y, "Lasso", inputs_PCA) #
SGD_model(inputs_PCA, Y, 1e-3, 0.1) #
print("GBR and DTR:")
GBR_model(inputs_PCA, Y, .0001, 0.01, 100, inputs_PCA) #
# NOTE(review): leafs=100 exceeds the 20 training rows (10:30), so the tree
# presumably cannot split at all - confirm this is intended.
DTR_model(inputs_PCA, Y, 100) #
print("Random Forest:")
# Each RF_model call pickles to the same file name; the last one wins.
RF_model(inputs_PCA, Y, "absolute", 100) #
RF_model(inputs_PCA, Y, "friedman", 100) #
RF_model(inputs_PCA, Y, "poisson", 100) #
RF_model(inputs_PCA, Y, "", 100) #

In [None]:
# rbf SVR on the PCA components with a smaller epsilon (0.01); the same
# configuration is used as `preds2` in the RA/Batter blender cell.
SVM_models(inputs_PCA, Y, "rbf", 0.01, 100, inputs_PCA)

In [None]:
# Kernel SVR with rbf, poly and linear kernels (epsilon .1, C 100), followed
# by LinearSVR with epsilon 0.0, all on the PCA components.
SVM_models(inputs_PCA, Y, "rbf", .1, 100, inputs_PCA) #
SVM_models(inputs_PCA, Y, "poly", .1, 100, inputs_PCA) #
SVM_models(inputs_PCA, Y, "linear", .1, 100, inputs_PCA) #
SVM_model(inputs_PCA,Y, 0.0, inputs_PCA) #

In [None]:
# ElasticNetCV on the PCA components.
RLE_Model(inputs_PCA, Y, "Elastic", inputs_PCA) #

In [None]:
# Bagging on the PCA components. Both calls pickle to the same file name, so
# the second model overwrites the first on disk.
BR_model(inputs_PCA,Y, "Boost")
BR_model(inputs_PCA,Y, "")

# Rebinds inputs2/Y2 to float32 copies of the PCA features and target for Keras.
inputs2 = inputs_PCA.astype(np.float32)
Y2 = Y.astype(np.float32)
# Same architecture with three optimizers; every call saves to the same .h5 path.
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "adamax") #
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "adam") #
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "nadam") #

**LLE:**

In [None]:
# Same sweep as for the plain features, run on the two LLE components.
# Every call prints its own train/test scores; the return values are discarded.
RLE_Model(inputs_LLE, Y, "Ridge", inputs_LLE) #
RLE_Model(inputs_LLE, Y, "Lasso", inputs_LLE) #
SGD_model(inputs_LLE, Y, 1e-3, 0.1) #
print("GBR and DTR:")
GBR_model(inputs_LLE, Y, .0001, 0.01, 100, inputs_LLE) #
# NOTE(review): leafs=100 exceeds the 20 training rows (10:30), so the tree
# presumably cannot split at all - confirm this is intended.
DTR_model(inputs_LLE, Y, 100) #
print("Random Forest:")
# Each RF_model call pickles to the same file name; the last one wins.
RF_model(inputs_LLE, Y, "absolute", 100) #
RF_model(inputs_LLE, Y, "friedman", 100) #
RF_model(inputs_LLE, Y, "poisson", 100) #
RF_model(inputs_LLE, Y, "", 100) #

In [None]:
# Kernel SVR and LinearSVR on the LLE components. Unlike the plain and PCA
# sweeps, epsilon differs per model here (.01, .04, .01 and 0.01).
SVM_models(inputs_LLE, Y, "rbf", .01, 100, inputs_LLE) #
SVM_models(inputs_LLE, Y, "poly", .04, 100, inputs_LLE) #
SVM_models(inputs_LLE, Y, "linear", .01, 100, inputs_LLE) #
SVM_model(inputs_LLE,Y, 0.01, inputs_LLE) #

In [None]:
# ElasticNetCV on the LLE components.
RLE_Model(inputs_LLE, Y, "Elastic", inputs_LLE) #

In [None]:
# Bagging on the LLE components. Both calls pickle to the same file name, so
# the second model overwrites the first on disk.
BR_model(inputs_LLE,Y, "Boost") #
BR_model(inputs_LLE,Y, "") #

# Rebinds inputs2/Y2 to float32 copies of the LLE features and target for Keras.
inputs2 = inputs_LLE.astype(np.float32)
Y2 = Y.astype(np.float32)
# Same architecture with three optimizers; every call saves to the same .h5 path.
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "adamax") #
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "adam") #
Keras_model(inputs2, Y2, 200, 120, 60, 30, 20, 15, "nadam") #

# **Blenders/Predictions:**

In [None]:
#RA/Batter Blender: #11.5, 4.8 - 13.9, .097 avg error, .855 to get 59 cols
# Averages three models (two pickled bagging models and an rbf SVR) into one
# blended prediction, scores the blend and writes the 2024 predictions to Excel.
#
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load model files you created yourself.
# NOTE(review): "BR_model RA_Batter" is not written by any cell visible in
# this notebook - presumably it was saved in an earlier session; confirm.
# The files are opened in `with` blocks so the handles are closed again (the
# bare open() calls never closed them).
with open("BR_model RA_Batter", "rb") as model_file:
  loaded_BR_model = pickle.load(model_file) #
with open("BR_model RA_Batter2", "rb") as model_file:
  loaded_BR_model2 = pickle.load(model_file) #

# New rows to predict, pushed through the same scaler -> PCA steps as the
# training data. reduce_df refits (and prints) the PCA each time it is called;
# [1] picks the fitted PCA object.
RA_Batter = pd.read_csv('RA_Batter.csv')
RA_Batter_scaled = scaler_RA_Batter.transform(RA_Batter)
RA_Batter_scaled_MMS = scaler_RA_Batter_MMS.transform(RA_Batter)
RA_Batter_PCA = (reduce_df(inputs, "PCA")[1]).transform(RA_Batter_scaled)
RA_Batter_PCA_MMS = (reduce_df(inputs_MMS, "PCA")[1]).transform(RA_Batter_scaled_MMS)

# Each preds* tuple is (test predictions, train predictions, 2024 predictions).
preds = (loaded_BR_model2.predict(inputs_MMS_PCA[0:10]), loaded_BR_model2.predict(inputs_MMS_PCA[10:30]),
         loaded_BR_model2.predict(RA_Batter_PCA_MMS)) #11.4, 4.7, MMS - 13.8
preds2 = SVM_models(inputs_PCA, Y, "rbf", 0.01, 100, RA_Batter_PCA) #12.3, 3.7 - 14.2
preds3 = (loaded_BR_model.predict(inputs_MMS_PCA[0:10]), loaded_BR_model.predict(inputs_MMS_PCA[10:30]),
          loaded_BR_model.predict(RA_Batter_PCA_MMS)) #11.4, 5.8, MMS - 14.3
#preds4 = RLE_Model(inputs, Y, "Ridge", RA_Batter_scaled) #14.1, .3  -14.3
#preds5 = RLE_Model(inputs_MMS_PCA, Y, "Lasso", RA_Batter_PCA_MMS)#14.5, .3, MMS - 14.7
#preds6 = RLE_Model(inputs_PCA, Y, "Lasso", inputs_PCA) #14.6, .5 - 14.9
#preds7 = RLE_Model(inputs_MMS, Y, "Lasso", inputs_MMS) #15, 2.3, MMS - 16.2

# Unweighted mean of the three models.
train_preds = (preds[1] + preds2[1] + preds3[1])/3

test_preds = (preds[0] + preds2[0] + preds3[0])/3

RA_Batter_preds_2024 = (preds[2] + preds2[2] + preds3[2])/3

print("Blender Train Scores then Test Scores:")
Scores(Y[10:30], train_preds, Y)
print()
Scores(Y[0:10], test_preds, Y)
print()
print("2024 RA/Batter Predictions:")
print(RA_Batter_preds_2024)
RA_Batter_preds_2024_df = pd.DataFrame(RA_Batter_preds_2024)
RA_Batter_preds_2024_df.to_excel("2024 RA_Batter Predictions.xlsx")

Principal axes:
 [[0.14375503761905387, 0.08900768957737851, 0.14381144218535968, 0.13053624430582864, 0.08478513952922558, 0.14184531236925382, 0.10628508626927799, 0.1280643235682248, 0.08987502027116795, 0.1442246016261102, 0.13098085378781363, 0.08568499048682596, 0.14226020579654464, 0.15013431960709492, 0.1377849733866141, 0.14850430576183685, 0.1345110829602761, 0.1070008951007915, 0.1294140707621743, 0.12857734089733824, 0.13703444753445881, 0.13441092503411728, 0.10928606964996353, 0.10866254914064576, 0.14005091988237145, 0.11764024460166357, 0.1292233049820064, 0.13656376488872463, 0.15272309226679467, 0.12746467749121693, 0.11418440901862337, 0.13370134742178522, 0.15105083939882963, 0.12821947417266188, 0.14065957214196073, 0.12293492362081027, 0.13241903449632172, 0.12772698983938433, 0.1565853035400834, 0.12008925912160485, 0.13017706456626016, 0.1270503480544868, 0.15513916612329431, 0.12725355387047588, 0.13574738515604978, 0.13300739705258913, 0.10866254919795368, 0.1



In [None]:
#R/PA Blender: 8.4, 2, avg. error: 0.0090, top 6, .845 to get 60 cols - 9.4
# Averages six models into one blended R/PA prediction, scores the blend and
# writes the 2024 predictions to Excel.
#
# NOTE(review): scaler_R_PA and scaler_R_PA_MMS are not defined in any cell
# visible in this notebook, and `inputs*` / `Y` are presumably expected to
# have been rebuilt for the R/PA target first - confirm before Run All.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load model files you created yourself.
# The files are opened in `with` blocks so the handles are closed again (the
# bare open() calls never closed them).
#loaded_RF_model = pickle.load(open("RF_model R_PA", "rb")) #
#loaded_RF_model2 = pickle.load(open("RF_model R_PA2", "rb")) #
with open("RF_model R_PA3", "rb") as model_file:
  loaded_RF_model3 = pickle.load(model_file) #
with open("RF_model R_PA4", "rb") as model_file:
  loaded_RF_model4 = pickle.load(model_file) #
with open("BR_model R_PA", "rb") as model_file:
  loaded_BR_model = pickle.load(model_file) #
with open("BR_model R_PA2", "rb") as model_file:
  loaded_BR_model2 = pickle.load(model_file) #

# New rows to predict, pushed through the same scaler -> PCA / LLE steps as
# the training data. [1] is the fitted PCA, [2] the fitted LLE.
R_PA = pd.read_csv('R_PA_inputs.csv')
R_PA_scaled = scaler_R_PA.transform(R_PA)
R_PA_scaled_MMS = scaler_R_PA_MMS.transform(R_PA)
R_PA_PCA = (reduce_df(inputs, "PCA")[1]).transform(R_PA_scaled)
R_PA_LLE = (reduce_df(inputs, "")[2]).transform(R_PA_scaled)
R_PA_LLE_MMS = (reduce_df(inputs_MMS, "")[2]).transform(R_PA_scaled_MMS)

# Each preds* tuple is (test predictions, train predictions, 2024 predictions).
preds = (loaded_BR_model.predict(inputs_MMS[0:10]), loaded_BR_model.predict(inputs_MMS[10:30]),
         loaded_BR_model.predict(R_PA_scaled_MMS)) #9.2, 1.9 - 10.2 (plain)
preds2 = GBR_model(inputs_LLE, Y, .0001, 0.01, 100, R_PA_LLE) #9.7, 2.1 - 10.8
preds3 = (loaded_RF_model3.predict(inputs_LLE[0:10]), loaded_RF_model3.predict(inputs_LLE[10:30]),
          loaded_RF_model3.predict(R_PA_LLE)) #9.3, 3.3 - 11 (plain)
preds4 = (loaded_RF_model4.predict(inputs_LLE[0:10]), loaded_RF_model4.predict(inputs_LLE[10:30]),
          loaded_RF_model4.predict(R_PA_LLE)) #9.5, 3.6 - 11.3 (poisson)
preds5 = (loaded_BR_model2.predict(inputs[0:10]), loaded_BR_model2.predict(inputs[10:30]),
          loaded_BR_model2.predict(R_PA_scaled)) #10.1, 2.8 - 11.5 (Boost)
preds6 = GBR_model(inputs_PCA, Y, .0001, 0.01, 100, R_PA_PCA) #10.2, 2.9 - 11.7
#preds7 = (loaded_RF_model.predict(inputs_MMS[0:10]), loaded_RF_model.predict(inputs_MMS[10:30])) #10.2, 3.9, MMS - 12.2 (abs or poisson)#
#preds8 = (loaded_RF_model2.predict(inputs_MMS[0:10]), loaded_RF_model2.predict(inputs_MMS[10:30])) #10.2, 4.1, MMS - 12.3 (abs or poisson)
#preds9 = RLE_Model(inputs_MMS, Y, "Elastic", inputs_MMS) #11.2, 2.6, MMS - 12.5
#preds10 = RLE_Model(inputs_MMS_PCA, Y, "Elastic", inputs_MMS_PCA) #11.3, 2.5, MMS - 12.6

# Unweighted mean of the six models.
train_preds = (preds[1] + preds2[1] + preds3[1] + preds4[1] + preds5[1] + preds6[1])/6

test_preds = (preds[0] + preds2[0] + preds3[0] + preds4[0] + preds5[0] + preds6[0])/6

R_PA_preds_2024 = (preds[2] + preds2[2] + preds3[2] + preds4[2] + preds5[2] + preds6[2])/6

print("Blender Train Scores then Test Scores:")
Scores(Y[10:30], train_preds, Y)
print()
Scores(Y[0:10], test_preds, Y)
print()
# The label previously read "RA/GM" although this cell prints the R/PA blend.
print("2024 R/PA Predictions:")
print(R_PA_preds_2024)
R_PA_preds_2024_df = pd.DataFrame(R_PA_preds_2024)
R_PA_preds_2024_df.to_excel("2024 R_PA Predictions.xlsx")

Principal axes:
 [[0.07237482520113063, 0.06514312738305279, 0.07381719597320083, 0.07432620564417702, 0.0672207841239294, 0.07952315767037843, 0.11200479552159832, 0.13829796327052316, 0.14061597360048073, 0.13588421644327958, 0.12355302305804197, 0.13912302603588578, 0.13411430368859897, 0.1339751814328834, 0.14583931530258784, 0.1251094654585108, 0.13112464369879973, 0.08816204286497996, 0.13020262822110037, 0.11819463631884258, 0.13111618253250967, 0.1345457746026534, 0.1322313756971539, 0.1472380640477702, 0.1270081997725471, 0.1249736730592046, 0.14668570703234063, 0.15011702489974463, 0.14900004922727958, 0.14510595259660664, 0.15029323644850717, 0.1338563709769622, 0.1379376437110802, 0.12450167168804427, 0.11534906941013527, 0.14728792264434443, 0.1454804075737005, 0.14187096115391135, 0.14649047896439035, 0.13498973932349095, 0.13881546960697794, 0.1151047347971632, 0.1263345480912702, 0.13658730363633104, 0.138933002521503, 0.134826042812561, 0.12294367130433052, 0.137448073



Avg. Normalized Score:7.6%
Avg. Error:0.0078
Test predictions:
[0.12723086 0.12326176 0.12723086 0.11645235 0.11394414 0.14629652
 0.12723086 0.12353472 0.13751158 0.11394414]
Avg. Normalized Score:9.7%
Avg. Error:0.0108
Difference of avg scores:2.08%





Avg. Normalized Score:7.2%
Avg. Error:0.0076
Test predictions:
[0.13577919 0.11133737 0.13577919 0.11133737 0.13577919 0.13577919
 0.11613258 0.11613258 0.1456435  0.1147916 ]
Avg. Normalized Score:10.2%
Avg. Error:0.0103
Difference of avg scores:2.92%

Blender Train Scores then Test Scores:
Avg. Normalized Score:6.4%
Avg. Error:0.0064

Avg. Normalized Score:8.4%
Avg. Error:0.0090

2024 RA/GM Predictions:
[0.13326052 0.11945574 0.11729997 0.11686141 0.11271409 0.11774292
 0.13344949 0.12227888 0.11648336 0.13031987 0.12963305 0.11882093]


In [None]:
#RA/GM Blender: 13.6, .7, avg. error: .4, .855 to get 60 cols for RA/GM for standard - 14.2
# Averages two models into one blended RA/GM prediction, scores the blend and
# writes the 2024 predictions to Excel.
#
# NOTE(review): scaler_RA_GM and scaler_RA_GM_MMS are not defined in any cell
# visible in this notebook, and `inputs*` / `Y` are presumably expected to
# have been rebuilt for the RA/GM target first - confirm before Run All.
# These are aliases of the existing frames, not copies.
inputs2 = inputs
inputs2_PCA = inputs_PCA
inputs2_LLE = inputs_LLE

# New rows to predict, pushed through the same scaler -> PCA / LLE steps as
# the training data. [1] is the fitted PCA, [2] the fitted LLE.
RA_GM = pd.read_csv('RA_GM.csv')
RA_GM_scaled = scaler_RA_GM.transform(RA_GM)
RA_GM_scaled_MMS = scaler_RA_GM_MMS.transform(RA_GM)
RA_GM_PCA = (reduce_df(inputs2, "PCA")[1]).transform(RA_GM_scaled)
RA_GM_LLE_MMS = (reduce_df(inputs_MMS, "")[2]).transform(RA_GM_scaled_MMS)

#loaded_SGD_model = pickle.load(open("SGD_model RA_GM", "rb")) #
#loaded_SGD_model2 = pickle.load(open("SGD_model RA_GM2", "rb"))

# Each preds* tuple is (test predictions, train predictions, 2024 predictions).
preds = SVM_models(inputs2_PCA, Y, "rbf", .6, 100, RA_GM_PCA) #13.7, .4, .6 - 13.9
preds2 = RLE_Model(inputs_MMS, Y, "Lasso", RA_GM_scaled_MMS) #13.6, 1.9, MMS - 14.6
# NOTE(review): preds3 is fitted and printed but left out of the averages
# below, which use only preds and preds2 - confirm this is intended.
preds3 = SVM_models(inputs_MMS_LLE, Y, "linear", .9, 100, RA_GM_LLE_MMS) #14.5, .5, .9, MMS - 14.8
#preds4 = SVM_models(inputs_MMS_LLE, Y, "rbf", .9, 100, RA_GM_LLE_MMS) #14.5, .6, .9, MMS - 14.8
#preds5 = RLE_Model(inputs2, Y, "Lasso", RA_GM_scaled) #14.8, .8 - 15.2
#preds6 = RLE_Model(inputs2_PCA, Y, "Lasso", RA_GM_PCA) #15, .9 - 15.5
#preds7 = RLE_Model(inputs_PCA, Y, "Lasso", inputs_PCA) #14.9, 1.1, MMS - 15.5
#preds8 = (loaded_SGD_model2.predict(inputs_LLE_10_15[0:10]), loaded_SGD_model2.predict(inputs_LLE_10_15[10:30])) #15.5, .2, MMS, adjs: 10, 15, - 15.6
#preds9 = (loaded_SGD_model.predict(inputs2_LLE_10_15[0:10]), loaded_SGD_model.predict(inputs2_LLE_10_15[10:30])) #15.7, .4, adjs: 10, 15 - 15.9
#preds10 = RLE_Model(inputs2_LLE, Y, "Elastic", inputs_LLE) #15.4, 1.2 - 16

# Unweighted mean of the two blended models.
train_preds = (preds[1] + preds2[1])/2

test_preds = (preds[0] + preds2[0])/2

RA_GM_preds_2024 = (preds[2] + preds2[2])/2

print("Blender Train Scores then Test Scores:")
Scores(Y[10:30], train_preds, Y)
print()
Scores(Y[0:10], test_preds, Y)
print()
print("2024 RA/GM Predictions:")
print(RA_GM_preds_2024)
RA_GM_preds_2024_df = pd.DataFrame(RA_GM_preds_2024)
RA_GM_preds_2024_df.to_excel("2024 RA_GM Predictions.xlsx")

Principal axes:
 [[-0.0749310197257101, -0.07348300991999297, 0.13655845782264062, 0.14120154339359087, 0.1386949819012305, 0.13728544524917666, 0.14012530875872137, 0.14009458460531993, 0.1361369728256285, 0.1406111088563674, 0.13925669879342803, 0.07003426006384814, 0.13739578027406693, 0.14033504177640152, 0.14024862628322665, 0.13624339271285218, 0.14080961581298196, 0.13940250410837002, 0.13151702135010884, 0.13106028460846156, 0.13900527562295922, 0.1410108670092537, 0.1410342706609554, 0.13802022008376005, 0.1417001518160689, 0.14053681830397952, 0.0712359058416632, 0.10720152969161156, 0.09842559539243212, 0.11234589784014794, 0.14221969588998934, 0.07810317957254306, 0.1408511491302939, 0.14300878039899448, 0.1412567974196842, 0.07949646642882548, 0.14119893034828243, 0.14235625351040393, 0.13565101840680868, 0.0942182931154841, 0.13860043225534371, 0.14261719694912917, 0.14023049070580143, 0.14203369195096063, 0.13555423179802653, 0.13814817125439163, 0.14179395405649625, 0.1

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c


Avg. Normalized Score:15.6%
Avg. Error:0.5105
Test predictions:
[3.38540024 3.42431679 3.41404403 3.47201129 3.40907784 3.71134451
 3.54003174 3.49538779 3.49942655 3.44941731]
Avg. Normalized Score:13.6%
Avg. Error:0.4504
Difference of avg scores:-1.94%

Avg. Normalized Score:15.1%
Avg. Error:0.4940
Test predictions:
[3.53039111 3.51097978 3.49051738 3.52981129 3.47307496 3.47649143
 3.63050779 3.49849966 3.49301577 3.47470326]
Avg. Normalized Score:14.5%
Avg. Error:0.4780
Difference of avg scores:-0.54%

Blender Train Scores then Test Scores:
Avg. Normalized Score:14.3%
Avg. Error:0.4778

Avg. Normalized Score:13.6%
Avg. Error:0.4441

2024 RA/GM Predictions:
[3.50679763 3.52230502 3.51173108 3.49155934 3.50436812 3.42158393
 3.5010521  3.47324401 3.55236621 3.42177794 3.3233635  3.49418503]




In [None]:
#R/GM Blender: 11, .8, avg error: .4, .83 thresh to get 61 cols - 11.4
# Averages three models into one blended R/GM prediction, scores the blend
# and writes the 2024 predictions to Excel.
#
# NOTE(review): scaler_R_GM is not defined in any cell visible in this
# notebook, and `inputs*` / `Y` are presumably expected to have been rebuilt
# for the R/GM target first - confirm before Run All.
# New rows to predict, pushed through the same scaler -> PCA steps as the
# training data; [1] is the fitted PCA.
R_GM = pd.read_csv('R_GM.csv')
R_GM_scaled = scaler_R_GM.transform(R_GM)
R_GM_PCA = (reduce_df(inputs, "PCA")[1]).transform(R_GM_scaled)

# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load model files you created yourself.
# The file is opened in a `with` block so the handle is closed again (the
# bare open() call never closed it).
with open("RF_model R_GM", "rb") as model_file:
  loaded_RF_model = pickle.load(model_file) #
#loaded_RF_model2 = pickle.load(open("RF_model R_GM2", "rb")) #
#loaded_SGD_model = pickle.load(open("SGD_model R_GM", "rb")) #12.4, 3.2 - 14
#loaded_SGD_model2 = pickle.load(open("SGD_model R_GM2", "rb")) #13.3, 2.6 - 14.6

# Each preds* tuple is (test predictions, train predictions, 2024 predictions).
preds = GBR_model(inputs_PCA, Y, .0001, 0.0055, 97, R_GM_PCA) #11.2, .1, .0055, 97 - 11.3
preds2 = (loaded_RF_model.predict(inputs[0:10]), loaded_RF_model.predict(inputs[10:30]), loaded_RF_model.predict(R_GM_scaled)) #10.7, 4.4, 98 - 12.9 (abs RF)
#preds3 = (loaded_RF_model2.predict(inputs[0:10]), loaded_RF_model2.predict(inputs[10:30])) #11.2, 4.6, 80 - 13.5 (standard RF)
#preds4 = SVM_models(inputs_LLE, Y, "poly", .5, 100, inputs_LLE) #13.5, .3, .5 - 13.7
preds5 = SVM_models(inputs, Y, "poly", 1.2, 100, R_GM_scaled) #13.3, 1.2, 1.2 - 13.9
#preds6 = (loaded_SGD_model.predict(inputs_PCA[0:10]), loaded_SGD_model.predict(inputs_PCA[10:30])) #12.4, 3.2 - 14, PCA
#preds7 = GBR_model(inputs, Y, .000, 0.0008, 200, inputs) #14, 0, 0.0008, 200 - 14
#preds8 = RLE_Model(inputs, Y, "Elastic", inputs) #12.9, 2.4 - 14.1
#preds9 = RLE_Model(inputs_MMS, Y, "Ridge", inputs_MMS) #13.2, 2.3, MMS - 14.4
#preds10 = SVM_models(inputs_LLE, Y, "rbf", .1, 100, inputs_LLE) #13.6, 1.5, .3 - 14.4

# Unweighted mean of the three models.
train_preds = (preds[1] + preds2[1] + preds5[1])/3

test_preds = (preds[0] + preds2[0] + preds5[0])/3

R_GM_preds_2024 = (preds[2] + preds2[2] + preds5[2])/3

print("Blender Train Scores then Test Scores:")
Scores(Y[10:30], train_preds, Y)
print()
Scores(Y[0:10], test_preds, Y)
print()
print("2024 R/GM Predictions:")
print(R_GM_preds_2024)
R_GM_preds_2024_df = pd.DataFrame(R_GM_preds_2024)
R_GM_preds_2024_df.to_excel("2024 R_GM Predictions.xlsx")

Principal axes:
 [[0.06897292284290153, -0.059744085603228025, 0.07577551361839686, 0.10047164700424138, 0.0814169430227213, 0.10372637886386546, -0.03670187278073265, 0.12234300642038247, 0.13271501630945506, 0.12182225207005858, 0.12320277617489492, 0.14669787531192746, 0.143929052073579, 0.09157284933570577, 0.15235718925000852, 0.13999499534882945, 0.11276414292530366, 0.12488786905684617, 0.1144141340995752, 0.11298192854890482, 0.1515290944539778, 0.14650754079090028, 0.1414453303018706, 0.15244483977240111, 0.1379996550349826, 0.11359359378645047, 0.12926786212709687, 0.134505481650397, 0.15152538786754105, 0.1421067894838163, 0.13260358820008264, 0.13660645136429164, 0.14440291860832646, 0.14687614434007418, 0.09644179442982644, 0.14001350477775412, 0.12896234182367353, 0.12770310272066052, 0.14485681127318747, 0.14678715951064666, 0.13935051516265623, 0.1282466318260486, 0.13193620486304172, 0.12037113567761011, 0.13095850876621282, 0.12056479767633299, 0.12127458399859299, 0.



Avg. Normalized Score:14.6%
Avg. Error:0.6069
Test predictions:
[5.37225343 4.24364327 5.40121143 4.48125383 4.75303746 5.14641749
 4.81631746 4.85922489 4.88337659 4.72117842]
Avg. Normalized Score:13.4%
Avg. Error:0.5458
Difference of avg scores:-1.22%

Blender Train Scores then Test Scores:
Avg. Normalized Score:10.2%
Avg. Error:0.4184

Avg. Normalized Score:11.0%
Avg. Error:0.4463

2024 R/GM Predictions:
[4.34414049 4.66383183 4.58015293 4.30554959 4.25560897 4.36764829
 4.87469559 4.18238712 4.31643173 4.51984075 4.37401902 4.26834602]




# **Other:**

In [None]:
#Check for collinearity
# Prints the pairwise correlation matrix of the correlated input columns.
# NOTE: data_correlated_df2 is created in a later cell of this section, so that
# cell has to be run before this one on a fresh kernel.
import seaborn as sns
#sns.pairplot(data_correlated_df2)

corr = data_correlated_df2.corr()
print(corr)

In [None]:
#Get Pearson's correlation between 2 variables
# Compares the third-from-last column of df_16 with the second-from-last column
# of df_Playoffs_16; the value is displayed as the cell's last expression.
df_16.iloc[:,-3].corr(df_Playoffs_16.iloc[:,-2])

In [None]:
# Scale every column of df_NP (this re-fits `scaler`) and keep the column labels.
data_scaled_full = pd.DataFrame(
    scaler.fit_transform(df_NP),
    columns=df_NP.columns,
)
# Columns whose absolute correlation with the playoff target exceeds 0.3.
data_correlated = correlation(data_scaled_full, .3, df['P%_Playoffs']) #top 11 stats
# The list of column Series becomes one row per input, so flip it back to
# one column per input.
data_correlated_df = pd.DataFrame(data_correlated)
data_correlated_df2 = data_correlated_df.T #Correlated inputs
data_correlated_df2.columns

In [None]:
# Display the scaled input table built from df_NP (last expression of the cell).
data_scaled_full

In [None]:
# Unweighted blend of all 15 model outputs. The members are listed once and the
# divisor is taken from len(), so adding or removing a model cannot leave a
# stale hard-coded count behind.
# Index [1] of each member feeds train_preds and index [0] feeds test_preds.
blend_members = (preds, preds2, preds3, preds4, preds5, preds6, preds7, preds8,
                 preds9, preds10, preds11, preds12, preds13, preds14, preds15)

train_preds = sum(member[1] for member in blend_members)/len(blend_members)

test_preds = sum(member[0] for member in blend_members)/len(blend_members)