In [1]:
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns




In [2]:
# Read the csv file
df = pd.read_csv('data/PGA_STAT_2024_ALL_DATA.csv')

# Sort df by column name (reassign instead of mutating in place)
df = df.sort_index(axis=1)

# Percent columns are read as text with a trailing '%'; strip the sign and
# rescale by 100 so they become plain float fractions.
# The list is deliberately NOT named `list`: binding that name would shadow the
# builtin and break every later `list(...)` call in the kernel.
percent_columns = ['Fairway%','GIR%','GIR_off_Fairway%',
                   'GIR_on_Fairway%','Putting_Birdie_Conversions',
                   'Scrambling%','Scrambling_Rough%','Scrambling_Sand%']
for col in percent_columns:
    df[col] = df[col].str.rstrip('%').astype('float') / 100

# Make a list of column names that we want to remove
columns_to_drop = ['REMOVE10a','REMOVE10b','REMOVE10c','REMOVE10d',
                  'REMOVE11a','REMOVE11b','REMOVE11c','REMOVE11d',
                  'REMOVE12a','REMOVE12b','REMOVE12c','REMOVE12d','REMOVE12e',
                  'REMOVE13a','REMOVE13b','REMOVE13c','REMOVE13d','REMOVE13e',
                  'REMOVE14a','REMOVE14b','REMOVE14c','REMOVE14d',
                  'REMOVE15a','REMOVE15b','REMOVE15c','REMOVE15d',
                  'REMOVE16a','REMOVE16b','REMOVE16c','REMOVE16d',
                  'REMOVE17a','REMOVE17b','REMOVE17c','REMOVE17d',
                  'REMOVE18a','REMOVE18b','REMOVE18c','REMOVE18d',
                  'REMOVE19a','REMOVE19b','REMOVE19c','REMOVE19d',
                  'REMOVE1a','REMOVE1b','REMOVE1c','REMOVE1d',
                  'REMOVE20a','REMOVE20b','REMOVE20c','REMOVE20d',
                  'REMOVE21a','REMOVE21b','REMOVE21c','REMOVE21d','REMOVE21e','REMOVE21f',
                  'REMOVE2a','REMOVE2b','REMOVE2c','REMOVE2d',
                  'REMOVE3a','REMOVE3b','REMOVE3c','REMOVE3d',
                  'REMOVE4a','REMOVE4b','REMOVE4c','REMOVE4d',
                  'REMOVE5a','REMOVE5b','REMOVE5c','REMOVE5d',
                  'REMOVE6a','REMOVE6b','REMOVE6c','REMOVE6d',
                  'REMOVE7a','REMOVE7b','REMOVE7c','REMOVE7d',
                  'REMOVE8a','REMOVE8b','REMOVE8c','REMOVE8d',
                  'REMOVE9a','REMOVE9b','REMOVE9c','REMOVE9d',
                  'PLAYER'
                  ]

# Drop columns; raises KeyError if any listed column is absent from the CSV
df = df.drop(columns=columns_to_drop)


In [3]:
# One hold-out split per strokes-gained feature. Every split uses the same
# target column and the same test_size / random_state settings.
split_options = dict(test_size=0.2, random_state=42)

# Approach play
features1, target1 = df[['SG_APP_AVG']], df[['Strokes_Adj_AVG']]
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    features1, target1, **split_options
)

# Around the green
features2, target2 = df[['SG_ARG_AVG']], df[['Strokes_Adj_AVG']]
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    features2, target2, **split_options
)

# Driving
features3, target3 = df[['SG_DRIVE_AVG']], df[['Strokes_Adj_AVG']]
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    features3, target3, **split_options
)

# Putting
features4, target4 = df[['SG_PUTTING_AVG']], df[['Strokes_Adj_AVG']]
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    features4, target4, **split_options
)

In [4]:
# Project helper (utils folder) plus joblib for persisting fitted estimators
from utils.metrics import evaluate_model
import joblib

# A single estimator object is refitted for each feature; it is dumped to disk
# right after every fit, so each saved file holds that feature's coefficients.
model_linear = LinearRegression()


def _fit_and_report(model, X_train, y_train, X_test, y_test, heading):
    """Fit `model`, score it on the hold-out split and print the metrics.

    Returns the test-set predictions together with the two values produced by
    `evaluate_model` (printed as mean squared error and R^2).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse, r2 = evaluate_model(y_test, predictions)
    print(heading)
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")
    return predictions, mse, r2


# --- SG_APP_AVG ---
y_pred_linear1, mse_linear1, r2_linear1 = _fit_and_report(
    model_linear, X_train1, y_train1, X_test1, y_test1, "SG_APP vs Strokes:"
)
joblib_file = '../Sports-Analytics-Machine-Learning/models/Linear_SG_APP.h5'
joblib.dump(model_linear, joblib_file)

# --- SG_ARG_AVG ---
y_pred_linear2, mse_linear2, r2_linear2 = _fit_and_report(
    model_linear, X_train2, y_train2, X_test2, y_test2, "SG_ARG_AVG vs Strokes:"
)
joblib_file = '../Sports-Analytics-Machine-Learning/models/Linear_SG_ARG_AVG.h5'
joblib.dump(model_linear, joblib_file)

# --- SG_DRIVE_AVG ---
y_pred_linear3, mse_linear3, r2_linear3 = _fit_and_report(
    model_linear, X_train3, y_train3, X_test3, y_test3, "SG_DRIVE_AVG vs Strokes:"
)
joblib_file = '../Sports-Analytics-Machine-Learning/models/Linear_SG_DRIVE_AVG.h5'
joblib.dump(model_linear, joblib_file)

# --- SG_PUTTING_AVG ---
y_pred_linear4, mse_linear4, r2_linear4 = _fit_and_report(
    model_linear, X_train4, y_train4, X_test4, y_test4, "SG_PUTTING_AVG vs Strokes:"
)
joblib_file = '../Sports-Analytics-Machine-Learning/models/Linear_SG_PUTTING_AVG.h5'
joblib.dump(model_linear, joblib_file)

SG_APP vs Strokes:
Mean Squared Error: 0.3241495153949111
R^2 Score: 0.48417314679566126
SG_ARG_AVG vs Strokes:
Mean Squared Error: 0.4667238429303513
R^2 Score: 0.25729121969873914
SG_DRIVE_AVG vs Strokes:
Mean Squared Error: 0.45631117700642626
R^2 Score: 0.27386114327386046
SG_PUTTING_AVG vs Strokes:
Mean Squared Error: 0.5149909474844374
R^2 Score: 0.18048262529104275


['../Sports-Analytics-Machine-Learning/models/Linear_SG_PUTTING_AVG.h5']

In [5]:
# Reload the four persisted single-feature linear models and score one
# hand-picked example value with each of them.
# NOTE: joblib.load unpickles arbitrary Python objects -- only point these
# paths at model files produced by this project.
joblib_file1 = '../Sports-Analytics-Machine-Learning/models/Linear_SG_APP.h5'
joblib_file2 = '../Sports-Analytics-Machine-Learning/models/Linear_SG_ARG_AVG.h5'
joblib_file3 = '../Sports-Analytics-Machine-Learning/models/Linear_SG_DRIVE_AVG.h5'
joblib_file4 = '../Sports-Analytics-Machine-Learning/models/Linear_SG_PUTTING_AVG.h5'

loaded_model_SG_APP = joblib.load(joblib_file1)
loaded_model_SG_ARG_AVG = joblib.load(joblib_file2)
loaded_model_SG_DRIVE_AVG = joblib.load(joblib_file3)
loaded_model_SG_PUTTING_AVG = joblib.load(joblib_file4)


def _with_feature_names(model, values):
    """Label `values` with the column names `model` saw when it was fitted.

    The estimators were fitted on single-column DataFrames, so handing them a
    bare NumPy array makes scikit-learn warn that "X does not have valid
    feature names". Wrapping the array in a DataFrame that reuses
    `feature_names_in_` avoids the warning. If the model carries no recorded
    feature names (it was fitted on plain arrays), `values` is returned as-is.
    """
    names = getattr(model, 'feature_names_in_', None)
    if names is None:
        return values
    return pd.DataFrame(values, columns=names)


# Example inputs: one row, one feature each
sg_app_var = np.array([[0.5]])
sg_arg_avg_var = np.array([[1.2]])
sg_drive_avg_var = np.array([[-1.3]])
sg_putting_avg_var = np.array([[0.6]])

# Use the loaded models to make predictions
new_prediction_sg_app_var = loaded_model_SG_APP.predict(
    _with_feature_names(loaded_model_SG_APP, sg_app_var))
new_prediction_sg_arg_avg_var = loaded_model_SG_ARG_AVG.predict(
    _with_feature_names(loaded_model_SG_ARG_AVG, sg_arg_avg_var))
new_prediction_sg_drive_avg_var = loaded_model_SG_DRIVE_AVG.predict(
    _with_feature_names(loaded_model_SG_DRIVE_AVG, sg_drive_avg_var))
new_prediction_sg_putting_avg_var = loaded_model_SG_PUTTING_AVG.predict(
    _with_feature_names(loaded_model_SG_PUTTING_AVG, sg_putting_avg_var))

# Each prediction is a 2-D array holding a single value; index it with [0, 0]
# when a scalar is needed.
print(new_prediction_sg_app_var)
print(new_prediction_sg_arg_avg_var)
print(new_prediction_sg_drive_avg_var)
print(new_prediction_sg_putting_avg_var)


[[70.50271694]]
[[69.56569753]]
[[72.27825648]]
[[70.71545856]]




In [6]:
# Example: feed the four single-feature predictions into the saved 4-input
# neural network and print what it returns.
from tensorflow.keras.models import load_model

# Pull the scalar out of each 2-D prediction array; int() drops the fractional
# part, so the network receives whole numbers.
input1, input2, input3, input4 = (
    int(prediction[0, 0])
    for prediction in (
        new_prediction_sg_app_var,
        new_prediction_sg_arg_avg_var,
        new_prediction_sg_drive_avg_var,
        new_prediction_sg_putting_avg_var,
    )
)

# Loading the model to make predictions
# NOTE(review): the recorded run of this cell fails inside load_model with
# "Unrecognized keyword arguments: ['batch_shape']". That looks like the .h5
# file was written by a newer Keras than the one loading it -- confirm, then
# align the TensorFlow/Keras versions or re-save the model.
loaded_model = load_model('../Sports-Analytics-Machine-Learning/models/NN_4Input.h5')

# One sample with four features
new_data = np.array([[input1, input2, input3, input4]])
new_predictions = loaded_model.predict(new_data)

print(new_predictions)

TypeError: Error when deserializing class 'InputLayer' using config={'batch_shape': [None, 4], 'dtype': 'float32', 'sparse': False, 'name': 'input_layer_1'}.

Exception encountered: Unrecognized keyword arguments: ['batch_shape']