In [34]:
from sklearn import svm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Organize and convert aggregated df to csv

In [70]:
# Read the csv file
df = pd.read_csv('data/aggregated_df.csv')

# Soer df by column name
df.sort_index(axis = 1, inplace=True)

# Convert sorted df into a new csv file
df.to_csv('data/aggregated_df_ordered.csv', index = False)

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 96 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   # HOLES(7)                177 non-null    float64
 1   # OF HOLES(16)            176 non-null    float64
 2   # OF PUTTS(16)            176 non-null    float64
 3   # OF SHOTS                177 non-null    float64
 4   %(11)                     177 non-null    object 
 5   %(4)                      177 non-null    object 
 6   %(7)                      177 non-null    object 
 7   AVG DTP(5)                177 non-null    object 
 8   AVG(0)                    177 non-null    float64
 9   AVG(1)                    177 non-null    float64
 10  AVG(10)                   177 non-null    float64
 11  AVG(12)                   177 non-null    float64
 12  AVG(13)                   177 non-null    float64
 13  AVG(14)                   176 non-null    float64
 14  AVG(15)   

## After Looking at the heatmap and sorting the df, we want to now remoev unecessary columns

In [72]:
# Change the names of columns to make it easier to read
df = df.rename(columns = {
    'AVG(1)': 'AVG_SG_PUTTING', 
    'AVG(10)': 'AVG_DISTANCE',
    'AVG(12)': 'AVG_SG',
    'AVG(13)': 'AVG_STROKE',
    'AVG(14)': 'AVG_PUTT_TOTAL',
    'AVG(15)': 'AVG_PUTT',
    'AVG(16)': 'AVG_PUTT_ROUND',
    'AVG(2)': 'REMOVE',
    'AVG(3)': 'AVG_SG_ARG',
    'AVG(6)': 'AVG_SG_APP',
    'AVG(8)': 'REMOVE2',
    'AVG(9)': 'AVG_SG_DRIVING',
    'AVG(0)': 'AVG_STROKES_ADJUSTED',
    })

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 96 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   # HOLES(7)                177 non-null    float64
 1   # OF HOLES(16)            176 non-null    float64
 2   # OF PUTTS(16)            176 non-null    float64
 3   # OF SHOTS                177 non-null    float64
 4   %(11)                     177 non-null    object 
 5   %(4)                      177 non-null    object 
 6   %(7)                      177 non-null    object 
 7   AVG DTP(5)                177 non-null    object 
 8   AVG_STROKES_ADJUSTED      177 non-null    float64
 9   AVG_SG_PUTTING            177 non-null    float64
 10  AVG_DISTANCE              177 non-null    float64
 11  AVG_SG                    177 non-null    float64
 12  AVG_STROKE                177 non-null    float64
 13  AVG_PUTT_TOTAL            176 non-null    float64
 14  AVG_PUTT  

In [73]:
# Make a list of column names that we want to remove
columns_to_drop = ['MOVEMENT(1)','MOVEMENT(10)','MOVEMENT(11)',
                   'MOVEMENT(12)','MOVEMENT(13)','MOVEMENT(14)',
                   'MOVEMENT(2)','MOVEMENT(3)','MOVEMENT(4)',
                   'MOVEMENT(5)','MOVEMENT(6)','MOVEMENT(7)',
                   'MOVEMENT(8)','MOVEMENT(9)','RANK(1)',
                   'RANK(11)','RANK(12)','RANK(13)','RANK(2)','RANK(3)',
                   'RANK(4)','RANK(5)','RANK(6)','RANK(7)','RANK(8)', 'RANK(9)',
                   'RANK(10)', 'REMOVE','PLAYER','MEASURED ROUNDS(1)','MEASURED ROUNDS(12)',
                   'MEASURED ROUNDS(2)','MEASURED ROUNDS(3)','MEASURED ROUNDS(6)',
                   'MEASURED ROUNDS(8)','MEASURED ROUNDS(9)','POSSIBLE FAIRWAYS(11)',
                   'TOTAL DRIVES(10)','TOTAL ROUNDS(13)',
                   'TOTAL SG:PUTTING(1)','TOTAL SG:PUTTING(2)','TOTAL STROKES(13)',
                   'PLAYER_ID','# HOLES(7)','TOTAL DISTANCE(10)',
                   '%(11)', '%(4)', '%(7)', 'AVG DTP(5)','REMOVE2', 'RANK(0)', 'MOVEMENT(0)',
                   'TOTAL STROKES(0)', 'TOTAL ROUNDS(0)']

# Drop columns
df.drop(columns = columns_to_drop, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   # OF HOLES(16)            176 non-null    float64
 1   # OF PUTTS(16)            176 non-null    float64
 2   # OF SHOTS                177 non-null    float64
 3   AVG_STROKES_ADJUSTED      177 non-null    float64
 4   AVG_SG_PUTTING            177 non-null    float64
 5   AVG_DISTANCE              177 non-null    float64
 6   AVG_SG                    177 non-null    float64
 7   AVG_STROKE                177 non-null    float64
 8   AVG_PUTT_TOTAL            176 non-null    float64
 9   AVG_PUTT                  176 non-null    float64
 10  AVG_PUTT_ROUND            176 non-null    float64
 11  AVG_SG_ARG                177 non-null    float64
 12  AVG_SG_APP                177 non-null    float64
 13  AVG_SG_DRIVING            177 non-null    float64
 14  BIRDIE CON

### We notice that some of the columns are of Object datatype
We can edit them so that they are all numerical. At this point, we can just make the entire datframe into the same datatype

In [74]:
# Function to clean all strings and columns
def remove_percent_sign(df):
    for x in df.columns:
        if df[x].dtype == 'object':
            if df[x].str.contains('%').any():
                df[x] = df[x].str.replace('%','',regex = False)
    return df

### Machine Learning Training

In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

features = df[['AVG_DISTANCE']]
target = df[['AVG_SG_DRIVING']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 42)

# Create variables for models
linear_model = LinearRegression()
rf_model = RandomForestRegressor()
svr_model = SVR()

# Train the model
linear_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
svr_model.fit(X_train, y_train)

# Make predictions based on the test set
yLinearPrediction = linear_model.predict(X_test)
yRF = rf_model.predict(X_test)
ySVR = svr_model.predict(X_test)

# Calcate MSE
mseLinear = mean_squared_error(y_test, yLinearPrediction)
mseRF = mean_squared_error(y_test, yRF)
mseSVR = mean_squared_error(y_test, ySVR)

# Print MSE
print(f'Linear MSE: {mseLinear}')
print(f'RF MSE: {mseRF}')
print(f'SVR MSE: {mseSVR}')

Linear MSE: 0.05879264917266634
RF MSE: 0.08725218812554959
SVR MSE: 0.05770879643161042


  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)


In [76]:
score = linear_model.score(X_test, y_test)
print(score)

0.45663802799500663
