## Import data

In [22]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Location of the training CSV. Defaults to the original author's local path;
# set the STROKES_GAINED_DATA environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    'STROKES_GAINED_DATA',
    '/Users/andrejacobs/Desktop/spring 2023/499/strokes_gained/ml/training_data.csv',
)

# read in data
data = pd.read_csv(DATA_PATH)

# show the available columns as the cell output
data.columns

Index(['tournament_id', 'player_name', 'score', 'Alabama', 'Arizona',
       'California', 'Canada', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
       'Hawaii', 'Illinois', 'Kentucky', 'Maryland', 'Massachusetts', 'Mexico',
       'Michigan', 'Minnesota', 'Missouri', 'New Jersey', 'New York',
       'North Carolina', 'Ohio', 'Oklahoma', 'Pennsylvania', 'Scotland',
       'South Carolina', 'Tennessee', 'Texas', 'sg_putt', 'sg_arg', 'sg_app',
       'sg_ott', 'sg_t2g', 'sg_total', 'pos', 'putting', 'arg', 'app', 'ott',
       't2g', 'result', 'tournament_putt', 'tournament_arg', 'tournament_app',
       'tournament_ott', 'tournament_t2g', 'tournament_cluster'],
      dtype='object')

In [23]:
def train_and_eval(data: pd.DataFrame):
    """Fit a random forest on ``data`` and evaluate it on a held-out split.

    A binary target ``result`` is derived from the ``pos`` column: 1 where the
    position is ``'1'``, 0 otherwise. Rows containing any missing value are
    then dropped, ``pos`` is removed, and every remaining column is used as a
    feature. 33% of the rows are held out for evaluation
    (``random_state=42`` for both the split and the forest).

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``pos`` column plus the feature columns. The frame is
        not modified by this function.

    Returns
    -------
    dict
        ``'features'``  - feature names recorded by the fitted model,
        ``'score'``     - ``RandomForestClassifier.score`` on the test split,
        ``'confusion'`` - confusion matrix of true vs. predicted test labels.
    """
    info = dict()

    # str() keeps the comparison meaningful even when pandas parsed 'pos' as
    # integers (e.g. a file without tied positions such as 'T3'); comparing
    # the raw value to '1' would then silently label every row 0.
    is_winner = data['pos'].map(lambda pos: 1 if str(pos) == '1' else 0)

    # assign() returns a new frame, so the caller's DataFrame does not gain
    # (or have overwritten) a 'result' column as a side effect.
    labelled = data.assign(result=is_winner)

    x = labelled.dropna().drop('pos', axis=1)
    y = x.pop('result')
    xtr, xte, ytr, yte = train_test_split(x, y, random_state=42, test_size=0.33)

    trees = RandomForestClassifier(random_state=42)
    trees.fit(xtr, ytr)
    info['features'] = trees.feature_names_in_

    info['score'] = trees.score(xte, yte)
    pred = trees.predict(xte)
    info['confusion'] = confusion_matrix(yte, pred)

    return info

## Model 1 (Just Strokes Gained)

In [24]:
# Model 1 inputs: the six strokes-gained columns plus 'pos'
# (train_and_eval derives the label from 'pos').
one_columns = ['sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total', 'pos']
one = data.copy()[one_columns]

# preview the first rows
one.head()

Unnamed: 0,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,pos
0,0.387,0.064,1.417,1.518,3.0,3.387,1
1,1.429,0.107,1.668,-0.067,1.707,3.137,2
2,1.017,0.944,1.093,-0.167,1.87,2.887,T3
3,-0.187,0.165,1.887,1.022,3.074,2.887,T3
4,2.118,0.414,-0.212,0.566,0.768,2.886,T3


In [25]:
# Fit and evaluate on the strokes-gained-only frame; the returned dict is the cell output.
train_and_eval(one)

{'features': array(['sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total'],
       dtype=object),
 'score': 0.9938603223330775,
 'confusion': array([[7746,    6],
        [  42,   24]])}

## Model 2 (SG + Tournament SG): no improvement over Model 1

In [26]:
# Model 2 inputs: the strokes-gained columns, the tournament_* strokes-gained
# columns, and 'pos' (train_and_eval derives the label from 'pos').
two_columns = [
    'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total',
    'tournament_putt', 'tournament_arg', 'tournament_app', 'tournament_ott',
    'tournament_t2g',
    'pos',
]
two = data.copy()[two_columns]

# preview the first rows
two.head()

Unnamed: 0,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,tournament_putt,tournament_arg,tournament_app,tournament_ott,tournament_t2g,pos
0,0.387,0.064,1.417,1.518,3.0,3.387,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,1
1,1.429,0.107,1.668,-0.067,1.707,3.137,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,2
2,1.017,0.944,1.093,-0.167,1.87,2.887,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,T3
3,-0.187,0.165,1.887,1.022,3.074,2.887,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,T3
4,2.118,0.414,-0.212,0.566,0.768,2.886,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,T3


In [27]:
# Fit and evaluate with the tournament_* columns added; the returned dict is the cell output.
train_and_eval(two)

{'features': array(['sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total',
        'tournament_putt', 'tournament_arg', 'tournament_app',
        'tournament_ott', 'tournament_t2g'], dtype=object),
 'score': 0.9936045024302891,
 'confusion': array([[7744,    8],
        [  42,   24]])}

## Model 3 (SG + Player Clusters)

In [29]:
# Model 3 inputs: the strokes-gained columns, 'pos' (train_and_eval derives
# the label from it), and the five player cluster columns.
three_columns = [
    'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total',
    'pos',
    'putting', 'arg', 'app', 'ott', 't2g',
]
three = data.copy()[three_columns]

# preview the first rows
three.head()

Unnamed: 0,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,pos,putting,arg,app,ott,t2g
0,0.387,0.064,1.417,1.518,3.0,3.387,1,1,1,1,3,3
1,1.429,0.107,1.668,-0.067,1.707,3.137,2,1,4,1,3,3
2,1.017,0.944,1.093,-0.167,1.87,2.887,T3,0,2,0,0,2
3,-0.187,0.165,1.887,1.022,3.074,2.887,T3,1,4,1,0,3
4,2.118,0.414,-0.212,0.566,0.768,2.886,T3,1,4,1,3,3


In [30]:
# Fit and evaluate with the player cluster columns added; the returned dict is the cell output.
train_and_eval(three)

{'features': array(['sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total',
        'putting', 'arg', 'app', 'ott', 't2g'], dtype=object),
 'score': 0.9932207725761064,
 'confusion': array([[7742,   10],
        [  43,   23]])}

## Model 4 (SG + Tournament Cluster)

In [35]:
# Model 4 inputs: the strokes-gained columns, the single 'tournament_cluster'
# column, and 'pos' (train_and_eval derives the label from 'pos').
four_columns = [
    'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total',
    'tournament_cluster',
    'pos',
]
four = data.copy()[four_columns]

# preview the first rows
four.head()

Unnamed: 0,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,tournament_cluster,pos
0,0.387,0.064,1.417,1.518,3.0,3.387,3,1
1,1.429,0.107,1.668,-0.067,1.707,3.137,3,2
2,1.017,0.944,1.093,-0.167,1.87,2.887,3,T3
3,-0.187,0.165,1.887,1.022,3.074,2.887,3,T3
4,2.118,0.414,-0.212,0.566,0.768,2.886,3,T3


In [36]:
# Fit and evaluate with 'tournament_cluster' added; the returned dict is the cell output.
train_and_eval(four)

{'features': array(['sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total',
        'tournament_cluster'], dtype=object),
 'score': 0.9939882322844718,
 'confusion': array([[7747,    5],
        [  42,   24]])}

None of the features I added to try to capture golf course conditions had a meaningful impact on predicting winners.