In [32]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.decomposition import PCA
%matplotlib inline
df = pd.read_csv('College Football_Upload.csv')
df.drop(df.tail(173).index,
        inplace = True)
df_NP = df.drop(columns=df.columns[-7:], axis=1)
df_22 = df_NP.dropna(axis=1, how='any') # drop columns with NA values

df_16 = df_NP.drop(df.tail(6).index)
df_Playoffs_16 = df.drop(df.tail(6).index)
df_16.dropna(axis=1, how='any', inplace = True)

In [None]:
df_16.isnull().sum().sum() #Check if there are NA values

In [None]:
df_22.isnull().sum().sum() #Check if there are NA values

In [33]:
import scipy.stats
def correlation(dataset, threshold, target): #Function to get Pearson's correlation between input and target
  data = []
  for i in range(len(dataset.columns)):
      cor2 = dataset.iloc[:,i].corr(target) #scipy.stats.spearmanr(x, y)[0] and scipy.stats.kendalltau(x, y)[0]
      column_headers = list(dataset.columns.values)
      if(abs(cor2) > threshold):
        data.append(dataset.iloc[:,i]) #make list of columns that meet the threshold
      i = i + 1
  return data

In [None]:
from sklearn.model_selection import train_test_split
from numpy.random.mtrand import random_sample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, MultiTaskLassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
def Scores(y, y_pred, y_full):
  MSE = mean_squared_error(y, y_pred)
  MAE = mean_absolute_error(y, y_pred)
  Normalized_RMSE = (np.sqrt(MSE)/np.mean(y_full))*100
  Normalized_MAE = (MAE/np.mean(y_full))*100
  Avg_Normalized_Score = (Normalized_RMSE + Normalized_MAE)/2
  print(f'Avg. Normalized Score:{ Avg_Normalized_Score:.1f}%')
  print(f'Normalized RMSE:{ Normalized_RMSE:.1f}%')
  print(f'Normalized MAE:{ Normalized_MAE:.2f}%')
  #print(f'MAE:{ MAE:.3f}')
  #print(f'RMSE:{ np.sqrt(MSE):.3f}')
  return Avg_Normalized_Score

In [34]:
def RLE_Model(X, y, choice, predict_df): #Function to run Ridge, Lasso, or ElasticNet model
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0) #Train/Test

  if(choice=="Ridge"):
    alphas = np.geomspace(1e-10, 1e10, num=100)
    pipeline = make_pipeline(RidgeCV(alphas=alphas))
    pipeline.fit(X_train, y_train)

  if(choice=="Lasso"):
    alphas = np.geomspace(1e-10, 1e10, num=100)
    pipeline = make_pipeline(LassoCV(alphas=alphas))
    pipeline.fit(X_train, y_train)

  if(choice=="Elastic"):
    l1_ratio = [0, 0.3, 0.5, 0.7, 0.9, 1]
    alphas = np.geomspace(1e-10, 1e10, num=100)
    pipeline = make_pipeline(ElasticNetCV(alphas=alphas, l1_ratio=l1_ratio, max_iter=100000))
    pipeline.fit(X_train, y_train)

  print(f'Chosen alpha  {pipeline.steps[0][1].alpha_:.6f}')
  print(f'Intercept (b) {pipeline.steps[0][1].intercept_:.6f}')
  print(pd.Series(pipeline.steps[0][1].coef_, index=X.columns),'\n')

  #Calculate the predicted values:
  y_train_pred = pipeline.predict(X_train)
  print(y_train_pred)
  print()

  y_test_pred = pipeline.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Test Predictions:
  print("Test predictions:")
  print(y_test_pred)
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  #Predict:
  predictions = pipeline.predict(predict_df)
  print(predictions)

In [35]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

def GBR_model(X,y, t, l, n, predict_df):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
  reg = GradientBoostingRegressor(tol = t, learning_rate = l, n_estimators=n, random_state=0) #default: tol = 0.0001, learning rate - 0.1, 100, friedman_mse
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  #print(y_train_pred)

  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Predictions:
  print("Test predictions:")
  print(y_test_pred)
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  #Predict:
  predictions = reg.predict(predict_df)
  print(predictions)

In [None]:
from sklearn.ensemble import BaggingRegressor
import xgboost as xgb

def BR_model(X,y, predict_df):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
  reg = BaggingRegressor(base_estimator=xgb.XGBRegressor())
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  #print(y_train_pred)

  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Predictions:
  print("Test predictions:")
  print(y_test_pred)
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  #Predict:
  predictions = reg.predict(predict_df)
  print(predictions)

In [36]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#SGD Regressor:
def SGD_model(X,y, t, ep, predict_df):

  reg = make_pipeline(SGDRegressor(max_iter=1000, tol=t, epsilon = ep)) #tol = 0.001, epsilon=0.1
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
  reg.fit(X_train, y_train)
  y_train_pred = reg.predict(X_train)
  #print(y_train_pred)

  y_test_pred = reg.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Predictions:
  print("Test predictions:")
  #print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  predictions = reg.predict(predict_df)
  print(predictions)

In [None]:
#Keras Sequential Neural Net
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=2)

def Keras_model(X,y,e, u, u2, u3, u4, u5):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
  model = Sequential()
  model.add(Dense(u, input_dim=X_train.shape[1], activation='relu')) # Hidden 1, 60
  model.add(Dense(units=u2,activation='relu')) # Hidden 2, 30
  model.add(Dense(units=u3,activation='relu'))
  model.add(Dense(units=u4,activation='relu'))
  model.add(Dense(units=u5,activation='relu'))
  model.add(Dense(units=15,activation='relu')) #15
  model.add(Dense(units=1)) #,activation='relu'
  model.compile(loss='mean_squared_error', optimizer='nadam') #adam, nadam; adamax
  m1 = model.fit(X_train, y_train, verbose=0, epochs=e, callbacks=[early_stop]); #callbacks=[early_stop]

  y_train_pred = model.predict(X_train)
  #print(y_train_pred)

  y_test_pred = model.predict(X_test)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Predictions:
  print("Test predictions:")
  #print(y_test_pred)
  #print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

  model.save('/content/drive/MyDrive/Models/myModel7', save_format="h5")

In [None]:
from sklearn.tree import DecisionTreeRegressor

def DTR_model(X,y,leafs):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
  # We introduce regularization by increasing the value of min_samples_leaf
  tree_reg_regularized = DecisionTreeRegressor(random_state=42, min_samples_leaf=leafs)
  tree_reg_regularized.fit(X_train, y_train)
  y_train_pred = tree_reg_regularized.predict(X_train) #_regularized
  print(y_train_pred)

  y_test_pred = tree_reg_regularized.predict(X_test) #_regularized

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Predictions:
  print("Test predictions:")
  print(y_test_pred)
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

In [None]:
from sklearn.svm import LinearSVR

def SVM_model(X,y,ep):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
  svm_reg = LinearSVR(epsilon=ep, random_state=42) #default: epsilon = 0 tol=0.0001, C=1.0
  svm_reg.fit(X_train, y_train)

  #Train Predictions:
  y_train_pred = svm_reg.predict(X_train)
  print(y_train_pred)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Test Predictions:
  y_test_pred = svm_reg.predict(X_test)
  print("Test predictions:")
  print(y_test_pred)
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

In [None]:
from sklearn.svm import SVR

def SVM_rbf_model(X,y, choice, ep, C_value):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

  if(choice=="rbf"):
    model = SVR(kernel="rbf", C=C_value, gamma=0.1, epsilon=ep) #0.1 default ep; 100 default C, 0.1 default gamma
    model.fit(X_train, y_train)

  if(choice=="poly"):
    model = SVR(kernel="poly", C=C_value, gamma="auto", degree=3, epsilon=ep, coef0=1) #0.1 default ep; 100 default C
    model.fit(X_train, y_train)

  #Train Predictions:
  y_train_pred = model.predict(X_train)
  print(y_train_pred)

  #Training Scores:
  Avg_N_Score_train = Scores(y_train, y_train_pred, y)
  print()

  #Test Predictions:
  y_test_pred = model.predict(X_test)
  print("Test predictions:")
  print(y_test_pred)
  print()

  #Testing Scores:
  Avg_N_Score_test = Scores(y_test, y_test_pred, y)
  print(f'Difference of avg scores:{ Avg_N_Score_test - Avg_N_Score_train:.3f}%') #Difference between testing and traing scores to check if my bias-variance tradeoff is good
  print()

In [None]:
#In case I wanted to add these values to data frame of inputs
scaler = StandardScaler()
S = df_22["Avg/SOS_ScoringD"].values
S3 = S.reshape(-1, 1)
data_scaledOPPG = scaler.fit_transform(S3)

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

In [None]:
#Scale:
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(df_16), columns = df_16.columns)

#Get Predictors:
data_correlated = correlation(data_scaled, .4, df_Playoffs_16["Points/GM_Playoffs"]) #.115 lowest for All csv
data_correlated_df = pd.DataFrame(data_correlated)
data_correlated_df2 = data_correlated_df.transpose() #Correlated inputs
#data_correlated_df2['Avg/SOS_ScoringD'] = data_scaledOPPG
#data_correlated_df2['Gms_Playoffs'] = data_scaledGMs
pca=PCA(n_components = 4)
data_PCA = pca.fit_transform(data_correlated_df2) #PCA inputs
#X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=3, random_state=42) #n_components=2 is default, neighbors 5 is default
X_unrolled = lle.fit_transform(data_correlated_df2)
dfLLE = pd.DataFrame(X_unrolled)
print("Principal axes:\n", pca.components_.tolist())
print("Explained variance:\n", pca.explained_variance_.tolist())
print("Mean:", pca.mean_)
X = data_PCA #try data_correlated_df2, data_PCA, or dfLLE
y = df_Playoffs_16["Points/GM_Playoffs"]

In [None]:
len(data_correlated)

In [39]:
#Load in input data from this year's 4 CFP teams to predict oPPG
predictors_defense_df = pd.read_csv('Predictors_Defense_2023.csv')
predictors_defense_df.drop(predictors_defense_df.tail(2).index,
        inplace = True)
predictors_defense = lle.fit_transform(predictors_defense_df)

In [None]:
#Predict oPPG
#oPPG Blender: #22.1%; 4.4% difference
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
oPPG_model = tensorflow.keras.models.load_model('/content/drive/MyDrive/Models/myModel1') #23.6%
preds = oPPG_model.predict(predictors_defense)
oPPG_model2 = tensorflow.keras.models.load_model('/content/drive/MyDrive/Models/myModel4') #24.8%
preds2 = oPPG_model2.predict(predictors_defense)
oPPG_model3 = tensorflow.keras.models.load_model('/content/drive/MyDrive/Models/myModel6') #22.8%
preds3 = oPPG_model3.predict(predictors_defense)
oPPG_model4 = tensorflow.keras.models.load_model('/content/drive/MyDrive/Models/myModel2') #22.8%, -2.4%, model2, 400, 210, 200, 190, 100, 36
preds4 = oPPG_model4.predict(predictors_defense)
final_test_preds = (preds + preds2 + preds3 + preds4)/4
print(final_test_preds)
#Scores(y_train, final_test_preds, y)

[[36.35722 ]
 [39.6763  ]
 [49.715473]
 [36.920113]]


In [40]:
#Load in input data from this year's 4 CFP teams to predict PPG
predictors_df = pd.read_csv('Predictors_2023.csv')
predictors_df.drop(predictors_df.tail(10).index,
        inplace = True)
predictors = pca.fit_transform(predictors_df)

In [38]:
#Predict PPG:
GBR_model(X,y, .0001, 0.008, 100, predictors)

Avg. Normalized Score:11.4%
Normalized RMSE:12.2%
Normalized MAE:10.46%

Test predictions:
[30.57090341 39.04558959 34.01120864 33.93328673 39.04558959]

Avg. Normalized Score:11.5%
Normalized RMSE:13.1%
Normalized MAE:9.94%
Difference of avg scores:0.160%

[40.88892146 29.6119849  29.6119849  41.02980684]


array([30.57090341, 39.04558959, 34.01120864, 33.93328673, 39.04558959])

In [None]:
#PPG Model Blender:
y_test1 = RLE_Model(X, y, "Lasso", predictors) #.4, data_PCA
y_test2 = GBR_model(X,y, .0001, 0.008, 100, predictors)
y_test3 = SGD_model(X,y, 1e-3, 0.1, predictors) #.4, data_PCA
final_test_preds = (y_test1 + y_test2 + y_test3)/3
y_train, y_test = train_test_split(y, test_size=0.30, random_state=0)
Scores(y_test, final_test_preds, y) #10.3%

In [None]:
#Model: (Points Scored)
RLE_Model(X, y, "Lasso", predictors) #10.6% for Lasso, diff 4.2%, .4, data_PCA
SGD_model(X,y, 1e-3, 0.1, predictors) #9.9% for data_PCA, diff 4.5%., .4, 1e-3, .1
GBR_model(X,y, .0001, 0.1, 100, predictors) #11.5%, .1%, .4, data_PCA, .0001, .008, 100
BR_model(X,y, predictors)
Keras_model(X, y, 200, 120, 60, 30) #9.9% for points per play dfLLE, .45, 100, 40, 30
DTR_model(X,y, 100) #12.2% for data PCA, 12% diff, 100, .5, other set
SVM_model(X,y, 0) #0 default
SVM_rbf_model(X, y, "rbf", .1, 100) #.1; 100 default, 16.2%, -.3% diff, .45, 5.5, 8, data_PCA,
SVM_rbf_model(X, y, "poly", .1, 100) #.1; 100 default, 17.5%, .3% diff, .4, 6, 100, data_correlated_df2

In [None]:
#Model: (points allowed)
RLE_Model(X, y, "Ridge", predictors_defense) #Ridge
SGD_model(X,y, 1e-3, 0.1, predictors_defense) #0.1 default
GBR_model(X,y, .0001, 0.1, 100, predictors_defense) #0.1 default
BR_model(X,y, predictors_defense)
Keras_model(X, y, 400, 210, 200, 190, 100, 36) #22.8%, -2.4%, model2, 400, 210, 200, 190, 100, 36
DTR_model(X,y, 100) #100 default
SVM_model(X,y, 0) #0 default
SVM_rbf_model(X, y, "rbf", .1, 100) #.1; 100 default
SVM_rbf_model(X, y, "poly", .1, 100) #.1; 100 default, 21% for per play, dfLLE, .12, 180

In [None]:
len(data_correlated)

In [None]:
data_correlated_df2.columns

In [None]:
len(dfLLE)

In [None]:
len(data_PCA)

In [None]:
#Check for collinearity
import seaborn as sns
#sns.pairplot(data_correlated_df2)

corr = data_correlated_df2.corr()
print(corr)

In [None]:
#Get Pearson's correlation between 2 variables
df_16.iloc[:,-3].corr(df_Playoffs_16.iloc[:,-2])