In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the main dataset, using the first CSV column as the row index.
masters = pd.read_csv('all_data.csv', index_col=0)

In [3]:
# List the columns available in the loaded table.
masters.columns

Index(['player_name', 'year', 'wgr', 'masters_finish', 'total_score',
       'ranking', 'top_10', 'wins', 'score_average', 'rounds', 'bounce_back',
       'driving_accuracy', 'driving_distance', 'par5_SA', 'gir',
       'hole_proximity', 'putts_round', 'scramble', 'sg_putt', 'sg_t2g',
       'sg_total', 'points_gained', 'm_cut', 'top_25', 'new_score', 'm_play'],
      dtype='object')

In [4]:
from sklearn.preprocessing import MinMaxScaler
def normalize_data(data, scaler=None, return_scaler=False):
    """Add squared and cubed copies of every feature, then scale the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table. Only ``data.values`` is used, so the returned
        array carries no column labels.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When supplied it is applied as-is instead of
        fitting a new ``MinMaxScaler``, which lets new rows be put on the same
        scale as the rows the scaler was originally fitted on. When omitted, a
        ``MinMaxScaler`` is fitted on ``data`` itself.
    return_scaler : bool, default False
        When True, return ``(scaled, scaler)`` so the scaler can be reused on
        other data.

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, scaler)
        Array with three times as many columns as ``data``, ordered as all
        original features, then all squares, then all cubes.
    """
    values = data.values
    expanded = np.concatenate((values, values ** 2, values ** 3), axis=1)

    if scaler is None:
        # Default: the scaling range comes from the rows being transformed.
        scaler = MinMaxScaler().fit(expanded)

    scaled = scaler.transform(expanded)
    if return_scaler:
        return scaled, scaler
    return scaled

In [5]:
# Load the 2019 table and build its model inputs from every column except
# the player name.
masters2019 = pd.read_csv('masters2019.csv')
features2019 = masters2019.drop(columns=['player_name'])

# NOTE(review): normalize_data fits a fresh MinMaxScaler on these rows, so the
# 2019 features are scaled by their own min/max rather than by the range of
# the data the model is trained on.
normalized = normalize_data(features2019)

# Label the expanded columns in the order normalize_data emits them:
# originals, then squares, then cubes.
names = features2019.columns
new_cols = [n + suffix for suffix in ('', '^2', '^3') for n in names]
data2019 = pd.DataFrame(normalized, columns=new_cols)

In [6]:
# Per-round scoring average: total_score is divided by 4 where `m_cut` is
# truthy and by 2 otherwise (presumably the number of rounds played - confirm).
# The division is done on whole columns, so each row is divided exactly once
# regardless of whether the index labels are unique.
rounds_divisor = np.where(masters['m_cut'].astype(bool), 4, 2)
masters['avg_score'] = masters['total_score'] / rounds_divisor

# Remove rows whose average is below 60; rows with a missing average are kept.
# A boolean mask (rather than dropping by index label) removes only the
# offending rows even when labels repeat.
masters = masters.loc[~(masters['avg_score'] < 60)].copy()

In [7]:
# Model inputs: every column of `masters` except the ones listed here.
analyze = masters.drop(columns=['player_name', 'year', 'masters_finish', 'total_score', 'ranking',
                                'hole_proximity', 'points_gained', 'm_cut', 'top_25', 'new_score',
                                'm_play', 'avg_score'])
# NOTE(review): the scaler inside normalize_data is fitted on all rows before
# the split below, so the held-out rows influence the feature scaling.
data = normalize_data(analyze)

# Shuffle with a fixed seed so the 80/20 split, and every number derived from
# it, is the same on each run. `shuffle` is reused later to put the labels in
# the same row order.
SPLIT_SEED = 42
shuffle = np.random.RandomState(SPLIT_SEED).permutation(len(analyze))
data = data[shuffle]
splitIdx = int(len(shuffle) * .8)
trainData = data[:splitIdx]
testData = data[splitIdx:]

# Column labels follow normalize_data's output order: originals, squares, cubes.
names = analyze.columns
new_cols = [n for n in names] + [n + '^2' for n in names] + [n + '^3' for n in names]
train_X = pd.DataFrame(trainData, columns=new_cols)
test_X = pd.DataFrame(testData, columns=new_cols)
# Despite its name, `train_data` holds ALL shuffled rows (train + test).
train_data = pd.DataFrame(data, columns=new_cols)

In [8]:
# Target values reordered with the same permutation as the feature matrix.
shuffled_scores = masters['avg_score'].values[shuffle]

# Cut the labels at the same index used for the feature split.
test_labels = shuffled_scores[splitIdx:]
train_Y = pd.DataFrame(shuffled_scores[:splitIdx], columns=['avg_score'])
test_Y = pd.DataFrame(test_labels, columns=['avg_score'])

# `train_labels` ends up holding every shuffled label (train + test rows) as a
# one-column DataFrame; it is paired with `train_data` when the model is refit.
train_labels = pd.DataFrame(shuffled_scores)

In [9]:
from sklearn.linear_model import RidgeCV
# Ridge regression with built-in cross-validation (default settings), fitted on
# the training split and then applied to the held-out rows.
reg = RidgeCV().fit(train_X, train_Y)
pred = reg.predict(test_X)

In [10]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error of the predictions on the held-out rows.
rmse = mean_squared_error(test_Y, pred) ** 0.5

In [11]:
# Display the hold-out RMSE.
rmse

2.3722508446200363

In [12]:
# Refit the same estimator on every row: `train_data` / `train_labels` hold the
# full shuffled feature table and label column, not just the training split.
reg.fit(train_data, train_labels)
# NOTE(review): data2019 was scaled by its own normalize_data call, i.e. with a
# different MinMaxScaler fit than train_data - confirm the scales are comparable.
pred2019 = reg.predict(data2019)

In [13]:
# Keep only the player name and `wgr` columns; the copy means adding a column
# below does not modify masters2019.
predicted_names = masters2019[['player_name', 'wgr']].copy()

In [14]:
# Attach the 2019 predictions as a new column.
predicted_names['pred_score'] = pred2019

In [15]:
# Order players by predicted score, lowest first.
sort_pred = predicted_names.sort_values(by='pred_score')

In [16]:
# Three lowest predicted scores among players with wgr of 15 or less.
sort_pred.loc[sort_pred['wgr'].le(15)].head(3)

Unnamed: 0,player_name,wgr,pred_score
2,Rory McIlroy,3,70.9109
1,Dustin Johnson,2,71.27545
0,Justin Rose,1,71.614997


In [17]:
# Three lowest predicted scores among players with wgr above 15 and up to 30.
sort_pred.loc[sort_pred['wgr'].gt(15) & sort_pred['wgr'].le(30)].head(3)

Unnamed: 0,player_name,wgr,pred_score
20,Patrick Cantlay,21,71.855517
26,Sergio Garcia,27,72.18824
21,Phil Mickelson,22,72.283081


In [18]:
# Three lowest predicted scores among players with wgr above 30.
sort_pred.loc[sort_pred['wgr'].gt(30)].head(3)

Unnamed: 0,player_name,wgr,pred_score
35,Matt Wallace,36,72.74318
37,Haotong Li,39,73.130037
47,Si Woo Kim,55,73.173321


In [19]:
# Load results2019.csv and rename its four columns to the names used below:
# 'player_name' is the merge key and 'avg_score' is what predictions are
# scored against.
results2019 = pd.read_csv("results2019.csv")
results2019.columns = ["position", "player_name", 'total', 'avg_score']

In [20]:
# Join predictions with results by player name. pd.merge defaults to an inner
# join, so players whose name does not appear in both tables are dropped.
resultsAndPredicted = pd.merge(sort_pred, results2019, on= "player_name")

In [21]:
# Preview the first rows of the merged table.
resultsAndPredicted.head()

Unnamed: 0,player_name,wgr,pred_score,position,total,avg_score
0,Rory McIlroy,3,70.9109,T21,283,70.75
1,Dustin Johnson,2,71.27545,T2,276,69.0
2,Justin Rose,1,71.614997,CUT,148,74.0
3,Justin Thomas,5,71.796443,T12,280,70.0
4,Patrick Cantlay,21,71.855517,T9,278,69.5


In [22]:
# RMSE between the 'avg_score' values from the results file and the predicted
# scores, over the players that survived the merge.
final_rmse = mean_squared_error(resultsAndPredicted['avg_score'], resultsAndPredicted['pred_score']) ** 0.5
final_rmse

2.434779105051999