In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.decomposition import PCA
%matplotlib inline
df = pd.read_csv('NHL Upload.csv')
df.drop(df.tail(1).index,
        inplace = True)
df_NP = df.drop(columns=df.columns[-12:], axis=1)

In [None]:
df_NP

In [None]:
df_NP.isnull().sum().sum() #Check if there are NA values

0

In [None]:
import scipy.stats
def correlation(dataset, threshold, target): #Function to get Pearson's correlation between input and target
  data = []
  for i in range(len(dataset.columns)):
      cor2 = dataset.iloc[:,i].corr(target) #scipy.stats.spearmanr(x, y)[0] and scipy.stats.kendalltau(x, y)[0]
      column_headers = list(dataset.columns.values)
      if(abs(cor2) > threshold):
        data.append(dataset.iloc[:,i]) #make list of columns that meet the threshold
      i = i + 1
  return data

In [None]:
from sklearn.model_selection import train_test_split
from numpy.random.mtrand import random_sample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
def Scores(y, y_pred, y_full):
  MSE = mean_squared_error(y, y_pred)
  MAE = mean_absolute_error(y, y_pred)
  Normalized_RMSE = (np.sqrt(MSE)/np.mean(y_full))*100
  Normalized_MAE = (MAE/np.mean(y_full))*100
  Avg_Normalized_Score = (Normalized_RMSE + Normalized_MAE)/2
  print(f'Avg. Normalized Score:{ Avg_Normalized_Score:.1f}%')
  #print(f'Normalized RMSE:{ Normalized_RMSE:.1f}%')
  #print(f'Normalized MAE:{ Normalized_MAE:.2f}%')
  #print(f'MAE:{ MAE:.3f}')
  #print(f'RMSE:{ np.sqrt(MSE):.3f}')
  return Avg_Normalized_Score

In [None]:
def RLE_Model(X, y, choice, predict_df): #Function to run Ridge, Lasso, or ElasticNet model
  #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0) #Train/Test
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]

  if(choice=="Ridge"):
    alphas = np.geomspace(1e-10, 1e10, num=100)
    pipeline = make_pipeline(RidgeCV(alphas=alphas))
    pipeline.fit(X_train, y_train)

  if(choice=="Lasso"):
    alphas = np.geomspace(1e-10, 1e10, num=100)
    pipeline = make_pipeline(LassoCV(alphas=alphas))
    pipeline.fit(X_train, y_train)

  if(choice=="Elastic"):
    l1_ratio = [0, 0.3, 0.5, 0.7, 0.9, 1]
    alphas = np.geomspace(1e-10, 1e10, num=100)
    pipeline = make_pipeline(ElasticNetCV(alphas=alphas, l1_ratio=l1_ratio, max_iter=100000))
    pipeline.fit(X_train, y_train)

  #print(f'Chosen alpha  {pipeline.steps[0][1].alpha_:.6f}')
  #print(f'Intercept (b) {pipeline.steps[0][1].intercept_:.6f}')
  #print(pd.Series(pipeline.steps[0][1].coef_, index=X.columns),'\n')

  #Calculate the predicted values:
  y_train_pred = pipeline.predict(X_train)
  #print(y_train_pred)
  print()

  y_test_pred = pipeline.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  #print()

  #Test Predictions:
  print("Test predictions:")
  print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  #Predict:
  predictions = pipeline.predict(predict_df)

  return y_test_pred, y_train_pred, predictions

In [None]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

def GBR_model(X,y, t, l, n, predict_df):
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]
  reg = GradientBoostingRegressor(tol = t, learning_rate = l, n_estimators=n, random_state=0) #default: tol = 0.0001, learning rate - 0.1, 100, friedman_mse
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  #print(y_train_pred)

  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  #print()

  #Predictions:
  print("Test predictions:")
  #print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  #Predict:
  predictions = reg.predict(predict_df)
  return y_test_pred, y_train_pred, predictions

In [None]:
from sklearn.ensemble import BaggingRegressor
import xgboost as xgb
import pickle

def BR_model(X,y):
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]
  reg = BaggingRegressor(base_estimator=xgb.XGBRegressor())
  #reg = pickle.load(open("BR_model Per 100", "rb"))
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  #print(y_train_pred)

  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  #print()

  #Predictions:
  print("Test predictions:")
  #print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  #Predict:
  #predictions = reg.predict(predict_df)
  #print(predictions)

  pickle.dump(reg, open("BR_model P%", "wb"))
  return y_test_pred, y_train_pred

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#SGD Regressor:
def SGD_model(X,y, t, ep):

  reg = make_pipeline(SGDRegressor(max_iter=1000, tol=t, epsilon = ep)) #tol = 0.001, epsilon=0.1
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  #print(y_train_pred)

  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  #print()

  #Predictions:
  print("Test predictions:")
  #print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  #predictions = reg.predict(predict_df)
  #print(predictions)

In [None]:
#Keras Sequential Neural Net
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=2)

def Keras_model(X,y,e, u, u2, u3, u4, u5):
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]
  model = Sequential()
  model.add(Dense(u, input_dim=X_train.shape[1], activation='relu')) # Hidden 1, 60
  model.add(Dense(units=u2,activation='relu')) # Hidden 2, 30
  model.add(Dense(units=u3,activation='relu'))
  model.add(Dense(units=u4,activation='relu'))
  model.add(Dense(units=u5,activation='relu'))
  model.add(Dense(units=15,activation='relu')) #15
  model.add(Dense(units=1)) #,activation='relu'
  model.compile(loss='mean_squared_error', optimizer='nadam') #adam, nadam; adamax
  m1 = model.fit(X_train, y_train, verbose=0, epochs=e, callbacks=[early_stop]); #callbacks=[early_stop]

  y_train_pred = model.predict(X_train)
  #print(y_train_pred)

  y_test_pred = model.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  #print()

  #Predictions:
  print("Test predictions:")
  #print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  #model.save('/content/drive/MyDrive/Models/Keras_Model', save_format="h5")

In [None]:
from sklearn.tree import DecisionTreeRegressor

def DTR_model(X,y,leafs):
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]
  # We introduce regularization by increasing the value of min_samples_leaf
  tree_reg_regularized = DecisionTreeRegressor(random_state=42, min_samples_leaf=leafs)
  tree_reg_regularized.fit(X_train, y_train)
  y_train_pred = tree_reg_regularized.predict(X_train) #_regularized
  #print(y_train_pred)

  y_test_pred = tree_reg_regularized.predict(X_test) #_regularized

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  #print()

  #Predictions:
  print("Test predictions:")
  print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

In [None]:
from sklearn.svm import LinearSVR

def SVM_model(X,y,ep):
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]
  svm_reg = LinearSVR(epsilon=ep, random_state=42) #default: epsilon = 0 tol=0.0001, C=1.0
  svm_reg.fit(X_train, y_train)

  #Train Predictions:
  y_train_pred = svm_reg.predict(X_train)
  #print(y_train_pred)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  #print()

  #Test Predictions:
  y_test_pred = svm_reg.predict(X_test)
  print("Test predictions:")
  print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

In [None]:
from sklearn.svm import SVR

def SVM_models(X,y, choice, ep, C_value, predict_df):
  X_train, X_test, y_train, y_test = X[10:30], X[0:10], y[10:30], y[0:10]

  if(choice=="rbf"):
    model = SVR(kernel="rbf", C=C_value, gamma=0.1, epsilon=ep) #0.1 default ep; 100 default C, 0.1 default gamma
    model.fit(X_train, y_train)

  if(choice=="poly"):
    model = SVR(kernel="poly", C=C_value, gamma="auto", degree=3, epsilon=ep, coef0=1) #0.1 default ep; 100 default C
    model.fit(X_train, y_train)

  if(choice == "linear"):
    model = SVR(kernel="linear", C=C_value, gamma="auto", degree=3, epsilon=ep, coef0=1) #0.1 default ep; 100 default C
    model.fit(X_train, y_train)

  #Train Predictions:
  y_train_pred = model.predict(X_train)
  #print(y_train_pred)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  #print()

  #Test Predictions:
  y_test_pred = model.predict(X_test)
  print("Test predictions:")
  print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  final_preds = model.predict(predict_df)

  return y_test_pred, y_train_pred, final_preds

In [None]:
#In case I wanted to add these values to data frame of inputs
scaler = StandardScaler()
S = df_22["Avg/SOS_ScoringD"].values
S3 = S.reshape(-1, 1)
data_scaledOPPG = scaler.fit_transform(S3)

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

In [None]:
df.iloc[:, 203:]

In [None]:
df.columns

In [141]:
#Feature Importance:
scaler = StandardScaler() #MinMaxScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(df_NP[10:30]), columns = df_NP.columns)
data_correlated = correlation(data_scaled, .26, df['GF/GA_Playoffs']) #.115 lowest for All csv
data_correlated_df = pd.DataFrame(data_correlated)
data_correlated_df2 = data_correlated_df.transpose() #Correlated inputs
X = df_NP.loc[:, data_correlated_df2.columns] #get non scaled data with important features

#Train test split and scale:
X_train, X_test = X[10:30], X[0:10]
X_train_processed = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test_processed = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
correlated_scaled_data = pd.merge(X_train_processed, X_test_processed, how = 'outer')

#PCA:
pca=PCA(n_components = 10)
X_train_processed_PCA = pca.fit_transform(X_train_processed)
X_train_PCA_df = pd.DataFrame(X_train_processed_PCA)
X_test_processed_PCA = pca.fit_transform(X_test_processed)
X_test_PCA_df = pd.DataFrame(X_test_processed_PCA)
data_PCA = pd.merge(X_test_PCA_df, X_train_PCA_df, how = 'outer')
print("Principal axes:\n", pca.components_.tolist())
print("Explained variance:\n", pca.explained_variance_.tolist())
print("Mean:", pca.mean_)

#LLE:
#X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=15, random_state=42) #n_components=2 is default, neighbors 5 is default
X_unrolled_train = lle.fit_transform(X_train_processed)
X_train_LLE_df = pd.DataFrame(X_unrolled_train)
X_unrolled_test = lle.transform(X_test_processed)
X_test_LLE_df = pd.DataFrame(X_unrolled_test)
data_LLE = pd.merge(X_test_LLE_df, X_train_LLE_df, how = 'outer')

X = correlated_scaled_data #try correlated_scaled_data, data_PCA, or data_LLE
y = df['GF/GA_Playoffs'] #

Principal axes:
 [[-0.23686773926600255, -0.16888537292094707, 0.2099359535329454, -0.3312313593159493, 0.326082373419008, -0.005782803197161005, -0.09802820239206782, 0.17093969547710797, -0.009563303152510286, -0.005782803197160942, -0.3420942964624609, -0.170920822985746, -0.28922372677805014, 0.17093969547710805, -0.14735458038942914, 0.047121903571578964, 0.04455297032701621, 0.018292090463592127, 0.014147313480410983, 0.05271409445254018, 0.04967471136371828, 0.0323167162906176, 0.03550698459722833, -0.5683805603212041, -0.009563303152510347], [0.02924799332642015, 0.0002401486596886621, 0.0276527562582351, -0.1827987645663812, -0.037461013771347196, -0.23888539378961243, -0.026014644822003823, -0.01963790381919664, 0.08561149663832603, -0.23888539378961238, -0.20228463107724115, 0.00024304299326222562, 0.060282571882730694, -0.01963790381919664, 0.051176622870761086, 0.0670192883459234, 0.06185986560826507, 0.38123819386562596, 0.38596241932910774, 0.060022370971220505, 0.063220

In [None]:
len(data_correlated)

26

In [None]:
data_correlated

In [None]:
X.to_excel("GF_GA.xlsx")

**P%**

In [None]:
#Model: (P%_Playoffs) .395
RLE_Model(X, y, "Ridge", X) #6.2%, 1.1%, correlated
RLE_Model(X, y, "Lasso", X) #
SGD_model(X,y, 1e-3, 0.1) #
GBR_model(X,y, .0001, 0.01, 100, X) #7.8%, 2.5%, LLE
print("5:")
DTR_model(X,y, 100) #

In [None]:
SVM_models(X, y, "poly", .1, 100, X) #6.9%, 1.1%, correlated
SVM_models(X, y, "linear", .1, 100, X) #6.8%, 1.3%, correlated
SVM_model(X,y, 0.0) #

In [None]:
RLE_Model(X, y, "Elastic") #
BR_model(X,y) #5.6%, 2.7%, correlated
Keras_model(X, y, 200, 120, 60, 30, 20, 15) #

In [None]:
#P% Blender: 5.9%, .1%
predict_P = pd.read_csv('P%_inputs.csv')
scaled_data = scaler.transform(predict_P)
predict_unrolled = pd.DataFrame(lle.transform(scaled_data))
loaded_BR_model = pickle.load(open("BR_model P%", "rb"))

X = correlated_scaled_data
preds = (loaded_BR_model.predict(X[0:10]), loaded_BR_model.predict(X[10:30]), loaded_BR_model.predict(predict_P)) #5.6%, 2.7%, correlated
preds2 = RLE_Model(X, y, "Ridge", scaled_data) #6.2%, 1.1%, correlated
preds3 = SVM_models(X, y, "linear", .1, 100, scaled_data) #6.8%, 1.3%, correlated

train_preds = (preds[1] + preds2[1] + preds3[1])/3
test_preds = (preds[0] + preds2[0] + preds3[0])/3 #5.9%, .1%

print("Blender Train Scores then Test Scores:")
Scores(y[10:30], train_preds, y)
print()
Scores(y[0:10], test_preds, y)
print()

P_predictions = pd.DataFrame()
P_predictions["Predictions"] = (preds[2] + preds2[2] + preds3[2])/3
P_predictions.to_excel("P%_Predictions.xlsx")

**Per GM**

In [None]:
#Model: (GF/GA_Playoffs) .26
RLE_Model(X, y, "Ridge", X) #13%, 2.6%, correlated
RLE_Model(X, y, "Lasso", X) #
SGD_model(X,y, 1e-3, 0.1)
GBR_model(X,y, .0001, 0.01, 100, X) #
print("5:")
DTR_model(X,y, 100) #

In [None]:
SVM_models(X, y, "rbf", .1, 100, X) #
SVM_models(X, y, "poly", .08, 500, X) #10.7%, 3.1%, LLE
SVM_models(X, y, "linear", .05, 100, X) #13.8%, 2.4%, LLE
SVM_model(X,y, 0) #13.7%, 2.5%, LLE

In [None]:
RLE_Model(X, y, "Elastic", X) #13.1, 2.6%, correlated
BR_model(X,y) #
Keras_model(X, y, 200, 120, 60, 30, 20, 15) #

In [None]:
#GF/GA Blender:
predict_GF_GA = pd.read_csv('GF_GA.csv')
predict_GF_GA.drop("Unnamed: 0", axis = 1, inplace = True)
scaled_data = scaler.transform(predict_GF_GA)
predict_unrolled = pd.DataFrame(lle.transform(scaled_data))

X = correlated_scaled_data
preds2 = RLE_Model(X, y, "Ridge", scaled_data) #13%, 2.6%, correlated
#preds3 = RLE_Model(X, y, "Elastic", scaled_data) #13.1, 2.6%, correlated

X = data_LLE
preds = SVM_models(X, y, "poly", .08, 500, predict_unrolled) #10.7%, 3.1%, LLE
preds4 = SVM_models(X, y, "linear", .05, 100, predict_unrolled) #13.8%, 2.4%, LLE

train_preds = (preds[1] + preds2[1])/2
test_preds = (preds[0] + preds2[0])/2 #11.1%, 2.7%

print("Blender Train Scores then Test Scores:")
Scores(y[10:30], train_preds, y)
print()
Scores(y[0:10], test_preds, y)
print()

GF_GA_predictions = pd.DataFrame()
GF_GA_predictions["Predictions"] = preds[2]
GF_GA_predictions["Predictions2"] = (preds[2] + preds2[2])/2
GF_GA_predictions.to_excel("GF_GA_Predictions.xlsx")

In [None]:
#Check for collinearity
import seaborn as sns
#sns.pairplot(data_correlated_df2)

corr = data_correlated_df2.corr()
print(corr)

In [None]:
#Get Pearson's correlation between 2 variables
df_16.iloc[:,-3].corr(df_Playoffs_16.iloc[:,-2])