In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.decomposition import PCA
%matplotlib inline
# Load the raw table (path is relative to the notebook's working directory).
df = pd.read_csv('NFL Upload.csv')
# Drop the last 24 columns by label.
df_NP = df.drop(columns=df.columns[-24:], axis=1) #get rid of playoff stats except experience

# df_22: all rows of df_NP, keeping only columns that contain no NA at all.
df_22 = df_NP.dropna(axis=1, how='any') #drop columns with NA values for df_NP
# df_20: df_NP without its first two rows, then the same column-wise NA filter.
df_20 = df_NP.drop(df_NP.head(2).index) #drop 1st 2 rows
df_20 = df_20.dropna(axis=1, how='any') #drop columns with NA values for df_20

# Full frame (trailing 24 columns included) minus the first two rows; no NA filtering here.
df_20_P = df.drop(df.head(2).index) #data frame with playoffs and 1st 2 rows dropped

In [None]:
df_22.isnull().sum().sum() #Total number of NA cells in df_22; should be 0 after the column-wise dropna

In [None]:
df_20.isnull().sum().sum() #Total number of NA cells in df_20; should be 0 after the column-wise dropna

In [None]:
import scipy.stats
def correlation(dataset, threshold, target): #Function to get Pearson's correlation between input and target
  """Select the columns of ``dataset`` that correlate strongly with ``target``.

  Parameters:
    dataset: pandas DataFrame whose columns are candidate predictors.
    threshold: a column is kept when |correlation| is strictly greater than this.
    target: pandas Series to correlate each column against (aligned on index
      by ``Series.corr``).

  Returns:
    A list of the selected columns (pandas Series), in their original order.
    The list is empty when no column exceeds the threshold.
  """
  selected = []
  for _, column in dataset.items():
    # Series.corr defaults to Pearson. A NaN result (e.g. constant column)
    # fails the comparison and the column is skipped.
    if abs(column.corr(target)) > threshold:
      selected.append(column)
  return selected

In [None]:
from sklearn.model_selection import train_test_split
from numpy.random.mtrand import random_sample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
def Scores(y, y_pred, y_full):
  """Print and return error metrics normalized by the mean of ``y_full``.

  RMSE and MAE between ``y`` and ``y_pred`` are each divided by
  ``np.mean(y_full)`` and expressed as percentages; the returned value is
  the average of those two percentages.
  """
  mse = mean_squared_error(y, y_pred)
  mae = mean_absolute_error(y, y_pred)
  rmse_pct = (np.sqrt(mse)/np.mean(y_full))*100
  mae_pct = (mae/np.mean(y_full))*100
  avg_pct = (rmse_pct + mae_pct)/2
  # Report the combined score first, then its two components.
  print(f'Avg. Normalized Score:{avg_pct:.1f}%')
  print(f'Normalized RMSE:{rmse_pct:.1f}%')
  print(f'Normalized MAE:{mae_pct:.2f}%')
  return avg_pct

In [None]:
def RLE_Model(X, y, choice, predict_df): #Function to run Ridge, Lasso, or ElasticNet model
  """Fit a cross-validated linear model, print its scores and predict new rows.

  Parameters:
    X: feature matrix; split 70/30 into train/test with random_state=0.
    y: target values aligned with X.
    choice: "Ridge", "Lasso" or "Elastic" - selects RidgeCV, LassoCV or
      ElasticNetCV respectively.
    predict_df: rows to predict on with the fitted model.

  Returns:
    Tuple (y_train_pred, y_test_pred, final_predictions).

  Raises:
    ValueError: if ``choice`` is not one of the three supported names.
  """
  # Validate up front: previously an unknown choice left `pipeline` undefined
  # and failed later with an unrelated UnboundLocalError.
  if choice not in ("Ridge", "Lasso", "Elastic"):
    raise ValueError(f'choice must be "Ridge", "Lasso" or "Elastic", got {choice!r}')

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0) #Train/Test

  # Same regularization grid for all three estimators.
  alphas = np.geomspace(1e-10, 1e10, num=100)
  if choice == "Ridge":
    estimator = RidgeCV(alphas=alphas)
  elif choice == "Lasso":
    estimator = LassoCV(alphas=alphas)
  else:
    l1_ratio = [0, 0.3, 0.5, 0.7, 0.9, 1]
    estimator = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratio, max_iter=100000)

  pipeline = make_pipeline(estimator)
  pipeline.fit(X_train, y_train)

  # Debugging aids for inspecting the fitted estimator:
  #print(f'Chosen alpha  {pipeline.steps[0][1].alpha_:.6f}')
  #print(f'Intercept (b) {pipeline.steps[0][1].intercept_:.6f}')
  #print(pd.Series(pipeline.steps[0][1].coef_, index=X.columns),'\n')

  #Calculate the predicted values:
  y_train_pred = pipeline.predict(X_train)
  print(y_train_pred)
  print()

  y_test_pred = pipeline.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Test Predictions (header only; the values themselves are not printed):
  print("Test predictions:")
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training scores, used as a bias-variance check.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%')
  print()

  final_predictions = pipeline.predict(predict_df)

  return y_train_pred, y_test_pred, final_predictions

In [None]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

def GBR_model(X,y, t, l, n, predict_df):
  """Train a GradientBoostingRegressor, print train/test scores and predictions.

  ``t``, ``l`` and ``n`` are passed as ``tol``, ``learning_rate`` and
  ``n_estimators``. Data is split 70/30 with random_state=0. The predictions
  for ``predict_df`` are printed; nothing is returned.
  Author's note on defaults: tol = 0.0001, learning rate 0.1, 100, friedman_mse.
  """
  features_fit, features_eval, target_fit, target_eval = train_test_split(
      X, y, test_size=0.30, random_state=0)
  booster = GradientBoostingRegressor(
      tol=t, learning_rate=l, n_estimators=n, random_state=0)
  booster.fit(features_fit, target_fit)

  fit_pred = booster.predict(features_fit)
  eval_pred = booster.predict(features_eval)

  # Scores on the training split.
  fit_score = Scores(target_fit, fit_pred, y)
  print()

  # Header only; the test predictions themselves are not printed.
  print("Test predictions:")
  print()

  # Scores on the held-out split, then the train/test gap.
  eval_score = Scores(target_eval, eval_pred, y)
  score_gap = eval_score - fit_score
  print(f'Difference of avg scores:{score_gap:.3f}%')
  print()

  # Predictions for the new rows.
  print(booster.predict(predict_df))

In [None]:
from sklearn.ensemble import BaggingRegressor
import xgboost as xgb

def BR_model(X,y, predict_df):
  """Train a bagging ensemble of XGBoost regressors and print its scores.

  Parameters:
    X: feature matrix; split 70/30 into train/test with random_state=0.
    y: target values aligned with X.
    predict_df: rows to predict on; the predictions are printed.

  Returns:
    None.
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
  # The wrapped estimator is passed positionally: the keyword was renamed from
  # `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name later
  # removed), while the first positional slot works on both old and new versions.
  # NOTE(review): no random_state is set, so results vary between runs.
  reg = BaggingRegressor(xgb.XGBRegressor())
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)

  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Predictions (header only; the values themselves are not printed):
  print("Test predictions:")
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training scores, used as a bias-variance check.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%')
  print()

  #Predict:
  predictions = reg.predict(predict_df)
  print(predictions)

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#SGD Regressor:
def SGD_model(X,y, t, ep, predict_df):
  """Train an SGDRegressor (wrapped in a pipeline), print scores and predict.

  ``t`` and ``ep`` are passed as ``tol`` and ``epsilon`` (author's note:
  tol = 0.001, epsilon = 0.1). Data is split 70/30 with random_state=0.
  Returns (y_train_pred, y_test_pred, final_predictions).
  """
  sgd_pipeline = make_pipeline(SGDRegressor(max_iter=1000, tol=t, epsilon=ep))
  features_fit, features_eval, target_fit, target_eval = train_test_split(
      X, y, test_size=0.30, random_state=0)
  sgd_pipeline.fit(features_fit, target_fit)

  fit_pred = sgd_pipeline.predict(features_fit)
  eval_pred = sgd_pipeline.predict(features_eval)

  # Scores on the training split.
  fit_score = Scores(target_fit, fit_pred, y)
  print()

  # Header only; the test predictions themselves are not printed.
  print("Test predictions:")

  # Scores on the held-out split, then the train/test gap.
  eval_score = Scores(target_eval, eval_pred, y)
  score_gap = eval_score - fit_score
  print(f'Difference of avg scores:{score_gap:.3f}%')
  print()

  new_pred = sgd_pipeline.predict(predict_df)
  print(new_pred)

  return fit_pred, eval_pred, new_pred

In [None]:
#Keras Sequential Neural Net
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
# Callback shared by Keras_model: monitors 'loss' (mode='min') with patience=2.
early_stop = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=2)

def Keras_model(X,y, e, u, u2, u3, u4, u5, save_path='/content/drive/MyDrive/Models/myModel10'):
  """Train a dense feed-forward regressor, print its scores and save it.

  Parameters:
    X: feature matrix; split 70/30 into train/test with random_state=42
      (the other model functions in this notebook use random_state=0).
    y: target values aligned with X.
    e: maximum number of training epochs (the module-level `early_stop`
      callback may end training earlier).
    u, u2, u3, u4, u5: unit counts of hidden layers 1-5. A fixed 15-unit
      hidden layer and a single linear output unit follow.
    save_path: where the trained model is written in HDF5 format. Defaults to
      the original Google Drive location, which requires Drive to be mounted.

  Returns:
    None.
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
  model = Sequential()
  model.add(Dense(u, input_dim=X_train.shape[1], activation='relu')) # Hidden 1
  # Remaining hidden layers, ending with the fixed 15-unit layer.
  for units in (u2, u3, u4, u5, 15):
    model.add(Dense(units=units, activation='relu'))
  model.add(Dense(units=1)) # linear output for regression
  model.compile(loss='mean_squared_error', optimizer='nadam') #adam, nadam; adamax
  model.fit(X_train, y_train, verbose=0, epochs=e, callbacks=[early_stop])

  y_train_pred = model.predict(X_train)

  y_test_pred = model.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Predictions (header only; the values themselves are not printed):
  print("Test predictions:")

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training scores, used as a bias-variance check.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%')
  print()

  model.save(save_path, save_format="h5")

In [None]:
from sklearn.tree import DecisionTreeRegressor

def DTR_model(X,y,leafs):
  """Train a DecisionTreeRegressor and print its train/test scores.

  ``leafs`` is passed as ``min_samples_leaf``; raising it regularizes the
  tree. Data is split 70/30 with random_state=0. Nothing is returned.
  """
  features_fit, features_eval, target_fit, target_eval = train_test_split(
      X, y, test_size=0.30, random_state=0)
  tree = DecisionTreeRegressor(random_state=42, min_samples_leaf=leafs)
  tree.fit(features_fit, target_fit)

  fit_pred = tree.predict(features_fit)
  print(fit_pred)

  eval_pred = tree.predict(features_eval)

  # Scores on the training split.
  fit_score = Scores(target_fit, fit_pred, y)
  print()

  # Header only; the test predictions themselves are not printed.
  print("Test predictions:")
  print()

  # Scores on the held-out split, then the train/test gap.
  eval_score = Scores(target_eval, eval_pred, y)
  score_gap = eval_score - fit_score
  print(f'Difference of avg scores:{score_gap:.3f}%')
  print()

In [None]:
from sklearn.svm import LinearSVR

def SVM_model(X,y,ep):
  """Train a LinearSVR and print its train/test scores.

  ``ep`` is passed as ``epsilon`` (author's note on defaults: epsilon = 0,
  C = 1.0). Data is split 70/30 with random_state=0. Nothing is returned.
  """
  features_fit, features_eval, target_fit, target_eval = train_test_split(
      X, y, test_size=0.30, random_state=0)
  linear_svr = LinearSVR(epsilon=ep, random_state=42)
  linear_svr.fit(features_fit, target_fit)

  # Training split: predictions are printed, then scored.
  fit_pred = linear_svr.predict(features_fit)
  print(fit_pred)
  fit_score = Scores(target_fit, fit_pred, y)
  print()

  # Held-out split: only the header is printed, not the values.
  eval_pred = linear_svr.predict(features_eval)
  print("Test predictions:")
  print()

  # Scores on the held-out split, then the train/test gap.
  eval_score = Scores(target_eval, eval_pred, y)
  score_gap = eval_score - fit_score
  print(f'Difference of avg scores:{score_gap:.3f}%')
  print()

In [None]:
from sklearn.svm import SVR

def SVM_rbf_model(X,y, choice, ep, C_value, predict_df):
  """Fit a kernel SVR, print its scores and predict new rows.

  Parameters:
    X: feature matrix; split 70/30 into train/test with random_state=0.
    y: target values aligned with X.
    choice: "rbf" (gamma=0.1) or "poly" (degree 3, gamma="auto", coef0=1).
    ep: passed as SVR ``epsilon``.
    C_value: passed as SVR ``C``.
    predict_df: rows to predict on with the fitted model.

  Returns:
    Tuple (y_train_pred, y_test_pred, final_preds).

  Raises:
    ValueError: if ``choice`` is neither "rbf" nor "poly".
  """
  # Validate up front: previously an unknown choice left `model` undefined
  # and failed later with an unrelated UnboundLocalError.
  if choice not in ("rbf", "poly"):
    raise ValueError(f'choice must be "rbf" or "poly", got {choice!r}')

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

  if choice == "rbf":
    model = SVR(kernel="rbf", C=C_value, gamma=0.1, epsilon=ep) #0.1 default ep; 100 default C, 0.1 default gamma
  else:
    model = SVR(kernel="poly", C=C_value, gamma="auto", degree=3, epsilon=ep, coef0=1) #0.1 default ep; 100 default C
  model.fit(X_train, y_train)

  #Train Predictions:
  y_train_pred = model.predict(X_train)
  print(y_train_pred)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Test Predictions (header only; the values themselves are not printed):
  y_test_pred = model.predict(X_test)
  print("Test predictions:")
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  # Gap between test and training scores, used as a bias-variance check.
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%')
  print()

  final_preds = model.predict(predict_df)

  return y_train_pred, y_test_pred, final_preds

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

In [None]:
#Scale:
# Standardize every column of df_22; the result gets a fresh default index.
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(df_22), columns = df_22.columns)

#Get Predictors:
# Keep the scaled columns whose |correlation| with the target exceeds 0.48.
data_correlated = correlation(data_scaled, .48, df["oPoints/oAtt_Playoffs"])
# A list of Series becomes one row per Series, so transpose to get one column per predictor.
data_correlated_df = pd.DataFrame(data_correlated)
data_correlated_df2 = data_correlated_df.transpose() #Correlated inputs
# NOTE(review): n_components=14 presumably needs at least 14 selected columns - confirm when changing the threshold.
pca=PCA(n_components = 14)
data_PCA = pca.fit_transform(data_correlated_df2) #PCA inputs
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=13, random_state=42) #n_components=2 is default, neighbors 5 is default (can do up to 13)
X_unrolled = lle.fit_transform(data_correlated_df2)
dfLLE = pd.DataFrame(X_unrolled)
print("Principal axes:\n", pca.components_.tolist())
print("Explained variance:\n", pca.explained_variance_.tolist())
print("Mean:", pca.mean_)
# X and y are the globals consumed by every modelling cell below.
X = dfLLE #try data_correlated_df2, data_PCA, or dfLLE
y = df["oPoints/oAtt_Playoffs"]

In [None]:
#Predict PPG:
#PPG Blender:
predict =  pd.read_csv('NFL PPG_Upload.csv')

# Same split arguments as inside the model functions (test_size=0.30, random_state=0),
# so y_train / y_test line up with the predictions those functions return.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
#print(data_correlated_df2.columns)
print()
# NOTE(review): fit_transform re-fits the shared `lle` object on the prediction rows instead of
# reusing the embedding fitted on the training features - confirm this is intended.
predict_unrolled = lle.fit_transform(predict)
predict_LLE = pd.DataFrame(predict_unrolled)
preds = RLE_Model(X, y, "Lasso", predict_LLE)
preds2 = SVM_rbf_model(X, y, "poly", .1, 200, predict_LLE)

#Get stats on blender
# Blend = plain average of the two models' predictions (index 0: train, 1: test, 2: new rows).
final_train_preds = (preds[0] + preds2[0])/2
final_test_preds = (preds[1] + preds2[1])/2 #7.2%. 3.6%
final_preds = (preds[2] + preds2[2])/2
print("Blender test predictions:")
print(final_test_preds)
print()
print("Blender Train Scores then Test Scores:")
Scores(y_train, final_train_preds, y)
print()
Scores(y_test, final_test_preds, y)
print("Final Predictions:")
print(final_preds)

In [None]:
#Predict Points/Att:
#Points/Att Blender:
predict =  pd.read_csv('NFL Points Per Att.csv')

# Same split arguments as inside the model functions (test_size=0.30, random_state=0),
# so y_train / y_test line up with the predictions those functions return.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
print(data_correlated_df2.columns)
print()
# NOTE(review): fit_transform re-fits the shared `lle` object on the prediction rows instead of
# reusing the embedding fitted on the training features - confirm this is intended.
predict_unrolled = lle.fit_transform(predict)
predict_LLE = pd.DataFrame(predict_unrolled)
preds = RLE_Model(X, y, "Lasso", predict_LLE)
preds2 = SVM_rbf_model(X, y, "rbf", .1, 100, predict_LLE)
preds3 = SVM_rbf_model(X, y, "poly", .1, 100, predict_LLE)

#Get stats on blender
# Blend = plain average of all three models (index 0: train, 1: test, 2: new rows).
final_train_preds = (preds[0] + preds2[0] + preds3[0])/3
final_test_preds = (preds[1] + preds2[1] + preds3[1])/3 #11.3%, -1%
# The final blend now uses the same three models as the scored train/test blends;
# it previously averaged only preds and preds2.
final_preds = (preds[2] + preds2[2] + preds3[2])/3
print("Blender test predictions:")
print(final_test_preds)
print()
print("Blender Train Scores then Test Scores:")
Scores(y_train, final_train_preds, y)
print()
Scores(y_test, final_test_preds, y)
print("Final Predictions:")
print(final_preds)

In [None]:
#Predict oPPG:
predict =  pd.read_csv('NFL oPPG.csv')

#use SGD_model(X,y, 1e-3, .1) #dfLLE, .51

# NOTE(review): fit_transform re-fits the shared `lle` object on the prediction rows - confirm this is intended.
predict_unrolled = lle.fit_transform(predict)
predict_LLE = pd.DataFrame(predict_unrolled)
# SGD_model returns (train predictions, test predictions, predictions for predict_LLE).
final_preds = SGD_model(X,y, 1e-2, 0.1, predict_LLE)
print("Final Predictions are")
print(final_preds[2])

In [None]:
#Predict oPoints/oAtt:
predict =  pd.read_csv('oPoints Per oAtt.csv')
# Drops the rows of `predict` whose index labels match the last 8 index labels of df_NP.
predict2 = predict.drop(df_NP.tail(8).index)

# NOTE(review): fit_transform re-fits the shared `lle` object on the prediction rows - confirm this is intended.
predict_unrolled = lle.fit_transform(predict2)
predict_LLE = pd.DataFrame(predict_unrolled)

# SVM_rbf_model returns (train predictions, test predictions, predictions for predict_LLE).
final_preds = SVM_rbf_model(X, y, "rbf", 0.001, 36, predict_LLE)

print("Final Predictions are")
print(final_preds[2])

In [None]:
# Colab only: mount Google Drive. Keras_model's default save location is under /content/drive,
# so this has to run before Keras_model is called.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
len(data_correlated) #number of columns that passed the correlation threshold

21

In [None]:
data_correlated_df2.columns #names of the selected predictor columns

In [None]:
# Export the selected predictors. NOTE(review): the index is written as the first CSV column;
# a plain pd.read_csv of this file will load it back as an extra unnamed column - confirm that is wanted.
data_correlated_df2.to_csv('oPoints Per oAtt.csv')

In [None]:
#Model: (PPG), Use Lasso and poly
# Runs every candidate model on the current global X / y and predicts on predict_LLE.
# Trailing comments appear to log earlier results (score %, train/test gap %, correlation
# threshold, input frame) - presumably from manual runs; verify before relying on them.
RLE_Model(X, y, "Lasso", predict_LLE) #7.7%, -3.6%, Lasso, .4, dfLLE, df_22, 2nd best
SGD_model(X,y, 1e-3, 0.1, predict_LLE) #10.6%, -2.8%, .4, dfLLE, df_22
GBR_model(X,y, .0001, 0.1, 100, predict_LLE)
BR_model(X,y, predict_LLE) #12.7%, 7.5%, .4, dfLLE, df_22
Keras_model(X, y, 200, 120, 60, 30, 25, 20)
DTR_model(X,y, 100)
SVM_model(X,y, 0) #0 default
SVM_rbf_model(X, y, "rbf", .1, 100, predict_LLE) #.1; 100 default, 9.0%, -3%, .4, dfLLE, df_22
SVM_rbf_model(X, y, "poly", .1, 100, predict_LLE) #.1; 100 default, .1, 200, 7.5%, -3.0%, .4, dfLLE, df_22, best

In [None]:
#Model: (Points/Att)
# Runs every candidate model on the current global X / y and predicts on predict_LLE.
# Trailing comments appear to log earlier results (score %, train/test gap %, correlation
# threshold, input frame) - presumably from manual runs; verify before relying on them.
RLE_Model(X, y, "Lasso", predict_LLE) #11.9%, 0.01%, .42, dfLLE, df_22
SGD_model(X,y, 1e-3, 0.1, predict_LLE) #0.1 default,
GBR_model(X,y, .0001, 0.1, 100, predict_LLE) #0.1 default
BR_model(X,y, predict_LLE)
Keras_model(X, y, 400, 210, 200, 190, 100, 36) #
DTR_model(X,y, 100) #100 default
SVM_model(X,y, 0) #13.1%, .8%, .42, dfLLE, df_22
SVM_rbf_model(X, y, "rbf", .1, 100, predict_LLE) #.1; 100 default, 11.4%, -1.5%, .42, dfLLE, df_22
SVM_rbf_model(X, y, "poly", .1, 100, predict_LLE) #11.5%, -1.4%, .42, dfLLE, df_22

In [None]:
#Model: (Points/Pass Att)
# Runs every candidate model on the current global X / y and predicts on predict_LLE.
# Trailing comments appear to log earlier results and tried settings - presumably from manual runs.
RLE_Model(X, y, "Lasso", predict_LLE)
SGD_model(X,y, 1e-3, 0.1, predict_LLE) #0.1 default
GBR_model(X,y, .0001, 0.1, 100, predict_LLE) #0.1 default
BR_model(X,y, predict_LLE)
Keras_model(X, y, 400, 210, 200, 190, 100, 36) #
DTR_model(X,y, 100) #100 default
SVM_model(X,y, 0) #0 default
SVM_rbf_model(X, y, "rbf", .1, 100, predict_LLE) #.07; 200, 12.3%, -1.8%, .53, dfLLE, df_22
SVM_rbf_model(X, y, "poly", .1, 100, predict_LLE) #.08; 50, 12.8%, -.8%, .53, dfLLE, df_22

In [None]:
#Model: (points allowed/GM)
# Runs every candidate model on the current global X / y and predicts on predict_LLE.
# Trailing comments appear to log earlier results and tried settings - presumably from manual runs.
RLE_Model(X, y, "Lasso", predict_LLE) #
SGD_model(X,y, 1e-3, 0.1, predict_LLE) #0.1 default, 1e-2, 14.6%, -0.5%, .53, dfLLE, df_22
GBR_model(X,y, .0001, 0.1, 100, predict_LLE) #0.1 default
BR_model(X,y, predict_LLE)
Keras_model(X, y, 400, 210, 200, 190, 100, 36) #12.8%, -7.1%, .51, dfLLE, df_22
DTR_model(X,y, 100) #100 default
SVM_model(X,y, 0) #0 default
SVM_rbf_model(X, y, "rbf", .1, 100, predict_LLE) #.1; 100 default, 14.6%, 7.1%, .53, dfLLE, df_22
SVM_rbf_model(X, y, "poly", .1, 100, predict_LLE) #.1; 100 default

In [None]:
#Model: (oPoints/oAtt)
# Runs every candidate model on the current global X / y and predicts on predict_LLE.
# Trailing comments appear to log earlier results and tried settings - presumably from manual runs.
RLE_Model(X, y, "Lasso", predict_LLE) #
SGD_model(X,y, 1e-3, 0.1, predict_LLE) #0.1 default,
GBR_model(X,y, .0001, 0.1, 100, predict_LLE) #0.1 default
BR_model(X,y, predict_LLE)
Keras_model(X, y, 400, 210, 200, 190, 100, 31) #model8, 10.3%, 3.7%, .48, dfLLE, df_22
DTR_model(X,y, 100) #100 default
SVM_model(X,y, 0) #0 default
SVM_rbf_model(X, y, "rbf", .001, 66, predict_LLE) #11.6%, 4.7%, .48, dfLLE, df_22
SVM_rbf_model(X, y, "poly", .1, 100, predict_LLE) #.1; 100 default, 14.7%, -2%, .48, dfLLE, df_22

In [None]:
#Model: (oPoints/oPass Att)
# Runs every candidate model on the current global X / y and predicts on predict_LLE.
# Trailing comments appear to log earlier results and tried settings - presumably from manual runs.
RLE_Model(X, y, "Lasso", predict_LLE)#Lasso, 16.4%, 2.6%, .46
SGD_model(X,y, 1e-3, 0.1, predict_LLE) #0.1 default
GBR_model(X,y, .0001, 0.1, 100, predict_LLE) #0.1 default
BR_model(X,y, predict_LLE)
Keras_model(X, y, 400, 210, 200, 190, 100, 31) #
DTR_model(X,y, 100) #100 default
SVM_model(X,y, 0) #0 default, 15.8%, 2.3%, .46, dfLLE, df_22
SVM_rbf_model(X, y, "rbf", .1, 100, predict_LLE) #
SVM_rbf_model(X, y, "poly", .1, 100, predict_LLE) #.1; 100 default

In [None]:
len(data_correlated) #number of columns that passed the correlation threshold

In [None]:
data_correlated_df2.columns #names of the selected predictor columns

In [None]:
len(dfLLE) #number of rows in the LLE-embedded feature frame

In [None]:
len(data_PCA) #number of rows in the PCA-transformed array

In [None]:
#Check for collinearity
import seaborn as sns
#sns.pairplot(data_correlated_df2)

# Pairwise correlation matrix of the selected predictors (DataFrame.corr defaults to Pearson).
corr = data_correlated_df2.corr()
print(corr)

In [None]:
#Get Pearson's correlation between 2 variables
# NOTE(review): df_16 and df_Playoffs_16 are not defined in any cell visible in this notebook,
# so this raises NameError on a fresh kernel unless they are created elsewhere - confirm or remove.
df_16.iloc[:,-3].corr(df_Playoffs_16.iloc[:,-2])