# Feature Creation for Predictive Models

## Import Existing Data

In [1]:
import os
from IPython.display import display

os.chdir('../data_collection')

In [2]:
import pandas as pd
import pymongo
from connect import CONN

# connect to database
client = pymongo.MongoClient(CONN)
db = client['strokes-gained']
print(db.list_collection_names())

['tournaments', 'tournament_clusters', 'players']


I already have clusters based on the specific historic tournament performance but, I still need to create information about the players.

In [3]:
# first create dataframe witht he players data
players = pd.DataFrame(db['players'].find({}, {'_id': 0}))
players.head()

Unnamed: 0,pos,player_name,score,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,tournament_id
0,1,Adam Svensson,-19,3.08,0.28,0.8,0.96,2.04,3.15,1
1,T2,Callum Tarren,-17,1.0,0.33,1.9,0.15,2.38,2.86,1
2,T2,Sahith Theegala,-17,2.19,0.37,-0.39,0.62,0.6,2.65,1
3,T2,Brian Harman,-17,0.86,0.15,1.31,0.13,1.59,2.65,1
4,T5,Joel Dahmen,-15,0.73,-0.1,-0.03,0.79,0.65,2.36,1


In [4]:
# lets look at averages for strokes gained by player
player_avg = players.iloc[:, 1:-1].groupby('player_name').mean().drop('score', axis=1).reset_index()
player_avg.columns = [i if idx == 0 else i + '_avg' for idx, i in enumerate(player_avg.columns)]

print(player_avg.columns)

Index(['player_name', 'sg_putt_avg', 'sg_arg_avg', 'sg_app_avg', 'sg_ott_avg',
       'sg_t2g_avg', 'sg_total_avg'],
      dtype='object')


Lets look at the impact these stats can have on a players performance at a particular tournament 

In [5]:
import statsmodels.api as sm

# add new data to dataframe
updated_players = players.merge(player_avg, on='player_name').drop(['tournament_id'], axis=1)
# print(updated_players)

# perform linear regression to look at new variable significance
x = updated_players.iloc[:, 2:].copy().dropna(axis=0)
y = x.pop('score')

x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.pvalues

const           0.000000
sg_putt         0.294069
sg_arg          0.044428
sg_app          0.043094
sg_ott          0.043060
sg_t2g          0.041599
sg_total        0.000000
sg_putt_avg     0.156839
sg_arg_avg      0.072732
sg_app_avg      0.070751
sg_ott_avg      0.071555
sg_t2g_avg      0.073897
sg_total_avg    0.473225
dtype: float64

This shows how putting can often be over valued in terms of measuring a players performance. I would not say putting is not important because, of my relevant experience in tournament golf, but other factors are more informaitive. This aligns with claims made by Mark Broadie in Every Shot Counts. 

## Group Players by their averages

In [6]:
from scipy.cluster.vq import kmeans
from scipy.spatial.distance import euclidean
import numpy as np

# data to cluster
display(player_avg.head())

i = 1
for name in ['putting', 'arg', 'app', 'ott', 't2g']:
    # create clusters
    clusters, score = kmeans(player_avg.iloc[:, i].values, 5)
    np.save(f'{name}_clusters.npy', clusters)

    # assign clusters
    stat = player_avg.iloc[:, 1].values.reshape(-1, 1)
    clusters = clusters.reshape(-1, 1)

    best_value = []
    for value in stat:
        distances = []
        for cluster in clusters:
            distances.append(euclidean(cluster, value))

        best_value.append(distances.index(min(distances)))

    cluster = np.array(best_value)

    # create new columns
    player_avg[name] = cluster
    i += 1

# show updated df
display(player_avg.head())

Unnamed: 0,player_name,sg_putt_avg,sg_arg_avg,sg_app_avg,sg_ott_avg,sg_t2g_avg,sg_total_avg
0,A.J. Ewart,-0.36,0.88,-0.57,-0.88,-0.57,-0.94
1,A.J. McInerney,-3.7,0.53,1.487,0.811,2.83,-0.87
2,Aaron Baddeley,0.154075,0.321288,-0.215712,-0.749537,-0.643563,-0.42235
3,Aaron Beverly,-3.53,-1.89,-2.12,0.25,-3.76,-7.29
4,Aaron Cockerill,0.58,-0.075,-0.17,-0.22,-0.46,0.11


Unnamed: 0,player_name,sg_putt_avg,sg_arg_avg,sg_app_avg,sg_ott_avg,sg_t2g_avg,sg_total_avg,putting,arg,app,ott,t2g
0,A.J. Ewart,-0.36,0.88,-0.57,-0.88,-0.57,-0.94,1,4,1,3,3
1,A.J. McInerney,-3.7,0.53,1.487,0.811,2.83,-0.87,3,3,4,4,4
2,Aaron Baddeley,0.154075,0.321288,-0.215712,-0.749537,-0.643563,-0.42235,0,2,0,0,2
3,Aaron Beverly,-3.53,-1.89,-2.12,0.25,-3.76,-7.29,3,3,4,4,4
4,Aaron Cockerill,0.58,-0.075,-0.17,-0.22,-0.46,0.11,0,0,0,1,2


In [7]:
# show spread of clusters
for i in range(-5, 0):
    print(player_avg.iloc[:, i].value_counts())

1    433
0    341
2    260
4     99
3     87
Name: putting, dtype: int64
4    345
1    298
2    265
3    173
0    139
Name: arg, dtype: int64
1    510
0    371
2    236
3     76
4     27
Name: app, dtype: int64
0    400
3    320
2    219
1    201
4     80
Name: ott, dtype: int64
3    625
2    305
0    237
4     52
1      1
Name: t2g, dtype: int64


### Look at how these clusters tell a story

In [8]:
# creating data for the model to see how these cluster can improve performance of model
player_data = players.drop(['tournament_id', 'score'], axis=1).copy()
data = player_avg.merge(player_data, on='player_name').dropna(axis=0)

# creating our binary column for prediction
data['prediction'] = data['pos'].apply(lambda x: 0 if x == 'CUT' else 1)
data.head()

Unnamed: 0,player_name,sg_putt_avg,sg_arg_avg,sg_app_avg,sg_ott_avg,sg_t2g_avg,sg_total_avg,putting,arg,app,ott,t2g,pos,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,prediction
0,A.J. Ewart,-0.36,0.88,-0.57,-0.88,-0.57,-0.94,1,4,1,3,3,CUT,-0.36,0.88,-0.57,-0.88,-0.57,-0.94,0
1,A.J. McInerney,-3.7,0.53,1.487,0.811,2.83,-0.87,3,3,4,4,4,CUT,-3.7,0.53,1.487,0.811,2.83,-0.87,0
2,Aaron Baddeley,0.154075,0.321288,-0.215712,-0.749537,-0.643563,-0.42235,0,2,0,0,2,T39,0.72,0.5,-0.21,-0.55,-0.26,0.65,1
3,Aaron Baddeley,0.154075,0.321288,-0.215712,-0.749537,-0.643563,-0.42235,0,2,0,0,2,CUT,-0.56,0.46,0.59,-2.19,-1.14,-2.85,0
4,Aaron Baddeley,0.154075,0.321288,-0.215712,-0.749537,-0.643563,-0.42235,0,2,0,0,2,T18,1.45,1.13,-0.64,-0.51,-0.02,1.44,1


In [9]:
data.shape

(23689, 20)

In [10]:
# split data into x and y for first model
x1 = data.copy().iloc[:, 12:-1].drop(['pos', 'sg_total'], axis=1)
y1 = data.copy().iloc[:, -1]

print(x1.columns)

# data for second model
x2 = data.copy().iloc[:, 7:-1].drop(['pos', 'sg_total'], axis=1)
y2 = y1.copy()

print(x2.columns)

Index(['sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g'], dtype='object')
Index(['putting', 'arg', 'app', 'ott', 't2g', 'sg_putt', 'sg_arg', 'sg_app',
       'sg_ott', 'sg_t2g'],
      dtype='object')


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# create function to test models
def create_model(x, y):
    model = RandomForestClassifier(random_state=42)
    xtr, xte, ytr, yte = train_test_split(x, y, test_size=0.33, random_state=42)

    model.fit(xtr, ytr)
    cm = confusion_matrix(yte, model.predict(xte))
    return cm, model.score(xte, yte)

This was successful because adding information about players and their averages marginally improved prediction performance.

In [12]:
print(create_model(x1, y1))
print(create_model(x2, y2))

(array([[3013,  608],
       [ 807, 3390]]), 0.8190074187771809)
(array([[3018,  603],
       [ 782, 3415]]), 0.8228447173190074)


## Adding Information about golf courses

In [13]:
# data about the tournaments on the pga tour
tournaments = pd.read_csv('../data/tournament_data.csv')

## let's create dummie variables for location
# first sort the data by state name
tournaments = tournaments.sort_values('location')
names = sorted(tournaments.location.unique())

# create dummies
dummies = pd.get_dummies(tournaments['location'])
dummies.columns = names

# adding columns to data
tournaments = pd.concat([tournaments, dummies], axis=1)
print(tournaments.shape)

(173, 31)


In [14]:
tournaments_players = tournaments.merge(players, on='tournament_id')
tournaments_players.shape, tournaments_players.columns

((23703, 40),
 Index(['tournament_id', 'name', 'year', 'location', 'Alabama', 'Arizona',
        'California', 'Canada', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
        'Hawaii', 'Illinois', 'Kentucky', 'Maryland', 'Massachusetts', 'Mexico',
        'Michigan', 'Minnesota', 'Missouri', 'New Jersey', 'New York',
        'North Carolina', 'Ohio', 'Oklahoma', 'Pennsylvania', 'Scotland',
        'South Carolina', 'Tennessee', 'Texas', 'pos', 'player_name', 'score',
        'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total'],
       dtype='object'))

In [15]:
cols = ['tournament_id', 'player_name', 'score', 'Alabama', 'Arizona',
        'California', 'Canada', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
        'Hawaii', 'Illinois', 'Kentucky', 'Maryland', 'Massachusetts', 'Mexico',
        'Michigan', 'Minnesota', 'Missouri', 'New Jersey', 'New York',
        'North Carolina', 'Ohio', 'Oklahoma', 'Pennsylvania', 'Scotland',
        'South Carolina', 'Tennessee', 'Texas', 
        'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total', 'pos']

cols2 = ['player_name', 'putting', 'arg', 'app', 'ott', 't2g']

display(tournaments_players[cols].head())

df = tournaments_players[cols].merge(player_avg[cols2], on='player_name')
display(df.head())

Unnamed: 0,tournament_id,player_name,score,Alabama,Arizona,California,Canada,Connecticut,Delaware,Florida,...,South Carolina,Tennessee,Texas,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,pos
0,147,Grayson Murray,-21,1,0,0,0,0,0,0,...,0,0,0,0.387,0.064,1.417,1.518,3.0,3.387,1
1,147,Chad Collins,-20,1,0,0,0,0,0,0,...,0,0,0,1.429,0.107,1.668,-0.067,1.707,3.137,2
2,147,Brian Gay,-19,1,0,0,0,0,0,0,...,0,0,0,1.017,0.944,1.093,-0.167,1.87,2.887,T3
3,147,Scott Stallings,-19,1,0,0,0,0,0,0,...,0,0,0,-0.187,0.165,1.887,1.022,3.074,2.887,T3
4,147,Tag Ridings,-19,1,0,0,0,0,0,0,...,0,0,0,2.118,0.414,-0.212,0.566,0.768,2.886,T3


Unnamed: 0,tournament_id,player_name,score,Alabama,Arizona,California,Canada,Connecticut,Delaware,Florida,...,sg_app,sg_ott,sg_t2g,sg_total,pos,putting,arg,app,ott,t2g
0,147,Grayson Murray,-21,1,0,0,0,0,0,0,...,1.417,1.518,3.0,3.387,1,1,1,1,3,3
1,71,Grayson Murray,5,0,1,0,0,0,0,0,...,-0.309,0.254,-0.58,-1.987,T70,1,1,1,3,3
2,70,Grayson Murray,-1,0,1,0,0,0,0,0,...,-0.365,0.261,0.546,-0.235,59,1,1,1,3,3
3,69,Grayson Murray,-1,0,1,0,0,0,0,0,...,0.234,0.757,0.423,-0.535,T55,1,1,1,3,3
4,68,Grayson Murray,-8,0,1,0,0,0,0,0,...,0.739,0.308,0.639,0.307,T42,1,1,1,3,3


Lets try to build model with new data

In [16]:
df.columns

Index(['tournament_id', 'player_name', 'score', 'Alabama', 'Arizona',
       'California', 'Canada', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
       'Hawaii', 'Illinois', 'Kentucky', 'Maryland', 'Massachusetts', 'Mexico',
       'Michigan', 'Minnesota', 'Missouri', 'New Jersey', 'New York',
       'North Carolina', 'Ohio', 'Oklahoma', 'Pennsylvania', 'Scotland',
       'South Carolina', 'Tennessee', 'Texas', 'sg_putt', 'sg_arg', 'sg_app',
       'sg_ott', 'sg_t2g', 'sg_total', 'pos', 'putting', 'arg', 'app', 'ott',
       't2g'],
      dtype='object')

In [17]:
# adding the binary predictor value
df['result'] = df['pos'].apply(lambda x: 0 if x == 'CUT' else 1)

# splitting data
x = df.copy().dropna(axis=0).iloc[:, 3:]
x = x.drop('pos', axis=1)

y = x.pop('result')

# look at columns
x.columns, y.name

(Index(['Alabama', 'Arizona', 'California', 'Canada', 'Connecticut', 'Delaware',
        'Florida', 'Georgia', 'Hawaii', 'Illinois', 'Kentucky', 'Maryland',
        'Massachusetts', 'Mexico', 'Michigan', 'Minnesota', 'Missouri',
        'New Jersey', 'New York', 'North Carolina', 'Ohio', 'Oklahoma',
        'Pennsylvania', 'Scotland', 'South Carolina', 'Tennessee', 'Texas',
        'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total',
        'putting', 'arg', 'app', 'ott', 't2g'],
       dtype='object'),
 'result')

In [18]:
create_model(x, y)

(array([[3259,  358],
        [ 747, 3454]]),
 0.8586595037093886)

## Cluster tournaments

In [19]:
by_tournament = df.groupby('tournament_id')[['sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g']].mean()
by_tournament.head()

Unnamed: 0_level_0,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g
tournament_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,-0.211842,-0.054013,-0.082237,-0.165132,-0.300855
2,-0.188526,-0.087885,-0.079936,-0.134231,-0.302372
3,-0.107561,-0.088211,-0.130163,-0.037642,-0.255935
4,-0.082809,-0.089213,-0.201685,-0.114494,-0.405169
5,-0.105282,-0.020141,-0.164437,-0.030845,-0.217958


In [20]:
# clustering based on averages of golf courses
from scipy.cluster.vq import kmeans
from scipy.spatial.distance import euclidean

# create clusters
clusters, _ = kmeans(by_tournament.values, 5)
np.save('course_clusters.npy', clusters)
# print(clusters)

# assign obeservations to clusters
cluster_val = []

for idx, row in by_tournament.iterrows():
    distances = [euclidean(cluster, row.values) for cluster in clusters]
    cluster_val.append(distances.index(min(distances)))

# adding data back to dataframe
by_tournament['course_cluster'] = cluster_val
print(by_tournament.shape)

(173, 6)


In [21]:
## new training data
# fix column names
by_tournament.columns = ['tournament_putt', 'tournament_arg', 'tournament_app', 'tournament_ott', 'tournament_t2g', 'tournament_cluster']

# merge
new_df = df.merge(by_tournament.reset_index(), on='tournament_id')
new_df.columns

Index(['tournament_id', 'player_name', 'score', 'Alabama', 'Arizona',
       'California', 'Canada', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
       'Hawaii', 'Illinois', 'Kentucky', 'Maryland', 'Massachusetts', 'Mexico',
       'Michigan', 'Minnesota', 'Missouri', 'New Jersey', 'New York',
       'North Carolina', 'Ohio', 'Oklahoma', 'Pennsylvania', 'Scotland',
       'South Carolina', 'Tennessee', 'Texas', 'sg_putt', 'sg_arg', 'sg_app',
       'sg_ott', 'sg_t2g', 'sg_total', 'pos', 'putting', 'arg', 'app', 'ott',
       't2g', 'result', 'tournament_putt', 'tournament_arg', 'tournament_app',
       'tournament_ott', 'tournament_t2g', 'tournament_cluster'],
      dtype='object')

In [22]:
## lets look at benefit from only looking at tournament stats as a whole
# splitting data
x = new_df.copy().dropna(axis=0).iloc[:, 3:]
x = x.drop('pos', axis=1)

y = x.pop('result')

# run model
create_model(x, y)

(array([[3340,  274],
        [ 714, 3490]]),
 0.8736249680225121)

In [23]:
new_df.head()

Unnamed: 0,tournament_id,player_name,score,Alabama,Arizona,California,Canada,Connecticut,Delaware,Florida,...,app,ott,t2g,result,tournament_putt,tournament_arg,tournament_app,tournament_ott,tournament_t2g,tournament_cluster
0,147,Grayson Murray,-21,1,0,0,0,0,0,0,...,1,3,3,1,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,3
1,147,Chad Collins,-20,1,0,0,0,0,0,0,...,1,3,3,1,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,3
2,147,Brian Gay,-19,1,0,0,0,0,0,0,...,0,0,2,1,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,3
3,147,Scott Stallings,-19,1,0,0,0,0,0,0,...,1,0,3,1,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,3
4,147,Tag Ridings,-19,1,0,0,0,0,0,0,...,1,3,3,1,-0.125295,-0.053159,-0.217258,-0.091008,-0.361742,3


In [24]:

# exporting data
new_df.to_pickle("../ml/training_data.pkl")