In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import re

In [2]:
# Load the full PGA statistics table. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-type column inference.
stats_csv = 'pga_stats.csv'
df = pd.read_csv(stats_csv, low_memory=False)

In [3]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,Player Name,Approach__200_yards_RTP_Score_ROUNDS,Approach__200_yards_RTP_Score_AVG RTP,Approaches_100-125_yards-Rgh_RTP_ROUNDS,Approaches_100-125_yards-Rgh_RTP_AVG RTP,Approaches_50-75_yards-Rgh_RTP_ROUNDS,Approaches_50-75_yards-Rgh_RTP_AVG RTP,Approaches_from_125-150_yards_ROUNDS,Approaches_from_125-150_yards_AVG,Approaches_from_125-150_yards_TOTAL DISTANCE (FEET),...,Average_Distance_of_Eagle_putts_made_TOT DIST (FT),Average_Distance_of_Eagle_putts_made_TOT EAGLES PUTTS MADE.,Average_Distance_of_Eagle_putts_made_LONGEST MADE (FT),Average_Distance_of_Eagle_putts_made_SHORTEST MADE (FT),Approach_Putt_Performance_ROUNDS,Approach_Putt_Performance_VALUE,Approach_Putt_Performance_ATTEMPTS,Approach_Putt_Performance_TOTAL DISTANCE (FEET),Putts_made_Distance_EVENTS,Putts_made_Distance_DISTANCE IN INCHES
0,Shane Lowry,52.0,-0.322,52.0,-0.167,52.0,0.2,52.0,"24' 7""",1993.417,...,,,,,,,,,,
1,Nick Watney,75.0,-0.279,75.0,0.081,75.0,0.375,75.0,"23' 0""",3130.167,...,,,,,,,,,,
2,Matt Jones,64.0,-0.27,64.0,0.069,64.0,0.231,64.0,"24' 10""",2806.333,...,,,,,,,,,,


In [4]:
# Summarise the frame: index range, column count, dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Columns: 1167 entries, Player Name to Putts_made_Distance_DISTANCE IN INCHES
dtypes: float64(1004), int64(1), object(162)
memory usage: 44.2+ MB


In [5]:
# Count every missing cell in the frame in a single reduction and display it.
null_total = df.isna().to_numpy().sum()
f'Null Values: {null_total}'

'Null Values: 4638977'

In [6]:
# Discard sparse data: first rows, then columns, that hold fewer than
# `min_non_null` non-missing values.
min_non_null = 500
df = (
    df
    .dropna(axis=0, thresh=min_non_null)
    .dropna(axis=1, thresh=min_non_null)
)

In [None]:
import util


def _to_number(value):
    """Convert one cell with util.make_numeric, then map a leftover 'E' to 0."""
    converted = util.make_numeric(value)
    # 'E' presumably marks an even-par score -- TODO confirm against the data source.
    return 0 if converted == 'E' else converted


# Convert every object-dtype column cell by cell.
#
# The columns are replaced with `df[cols] = ...` rather than `df.loc[:, mask] = ...`:
# on pandas >= 2.0 the .loc form writes into the existing object arrays, so the
# columns would keep dtype object and be thrown away by the later
# "drop object columns" step. Series.map (instead of the deprecated
# DataFrame.applymap) also re-infers each column's dtype, so fully converted
# columns come back numeric.
text_columns = df.columns[(df.dtypes == object).to_numpy()]
if len(text_columns):
    df[text_columns] = df[text_columns].apply(lambda column: column.map(_to_number))

In [None]:
# Show the frame's current column labels.
df.columns

In [None]:
# Impute missing numeric values with each column's mean.
# numeric_only=True is required because the frame still holds 'Player Name'
# (and any text columns that failed to convert); on pandas >= 2.0 a plain
# df.mean() raises TypeError on those columns instead of skipping them.
df = df.fillna(df.mean(numeric_only=True))

# Keep the player names aside, then remove them from the modelling frame.
players = df['Player Name']
df = df.drop(columns='Player Name')

# Drop object columns that did not convert
df = df.select_dtypes(exclude='object')

In [None]:
# Re-count the missing cells after imputation and column drops.
null_total = df.isna().to_numpy().sum()
f'Null Values: {null_total}'

In [None]:
# Show the column labels that remain in the frame at this point.
df.columns

In [None]:
# Load the 2018 all-around ranking table.
ranking = pd.read_csv(r'data/2018/POINTSRANKINGS/All-Around_Ranking.csv')

# Strip the 'T' marker from last week's rank (presumably a tie flag -- TODO confirm)
# and cast the remaining text to int.
last_week_rank = ranking['All-Around_Ranking_RANK LAST WEEK']
ranking['All-Around_Ranking_RANK LAST WEEK'] = last_week_rank.str.replace('T', '').map(int)

# Scatter of last week's rank against the all-around total.
ranking.plot.scatter(
    x='All-Around_Ranking_RANK LAST WEEK',
    y='All-Around_Ranking_TOTAL',
    figsize=(16, 6),
);

In [None]:
# Distribution of the all-around total across the ranking table.
all_around_total = ranking['All-Around_Ranking_TOTAL']
all_around_total.plot.hist(figsize=(16, 6));

In [None]:
# Split into target and features. 'Year' stays in both frames: it drives the
# train/test split and is also passed to the model as a feature.
target = df[['All-Around_Ranking_TOTAL', 'Year']]
feature = df.drop('All-Around_Ranking_TOTAL', axis=1)

# Hold out 2018 as the test season; every other year is training data.
# The mask is computed once and reused for features and target.
is_test = df['Year'].isin([2018])

feature_train = feature[~is_test]
target_train = target.loc[~is_test, 'All-Around_Ranking_TOTAL']

feature_test = feature[is_test]
target_test = target.loc[is_test, 'All-Around_Ranking_TOTAL']

# random_state pins the bootstrap samples and feature sub-sampling so the
# metrics and feature importances are reproducible on a fresh run.
rfr = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
rfr.fit(feature_train, target_train)
target_predict = rfr.predict(feature_test)

print('Random Forest Results:')
print(f'R Squared: {r2_score(target_test, target_predict)}')
print(f'Root Mean Squared Error: {np.sqrt(mean_squared_error(target_test, target_predict))}')

# Baseline: predict the test-set mean for every row. The mean is computed once
# (the original recomputed it per row inside map). Because this uses the 2018
# mean itself, its R squared is ~0 by construction; only its RMSE is informative.
target_mean = pd.Series(target_test.mean(), index=target_test.index)

print('Mean Model Results:')
print(f'R Squared: {r2_score(target_test, target_mean)}')
print(f'Root Mean Squared Error: {np.sqrt(mean_squared_error(target_test, target_mean))}')

# One row per feature, most important first.
feature_importance_df = pd.DataFrame({'feature': feature_train.columns, 'importance': rfr.feature_importances_})
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

In [None]:
# Work on a dedicated frame. The original rebound `df` here, which replaced the
# statistics table with the importance table and made every earlier cell fail
# when re-run in the same kernel.
importance_df = feature_importance_df.sort_values('importance', ascending=False).reset_index()

# Share of total importance per feature, and the running total of that share.
importance_df['importance_normalized'] = importance_df['importance'] / importance_df['importance'].sum()
importance_df['cumulative_importance'] = importance_df['importance_normalized'].cumsum()

# Horizontal bar chart of the 15 most important features. The bar positions are
# reversed so that the most important feature is drawn at the top.
top_features = importance_df.head(15)
bar_positions = np.arange(len(top_features))[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions,
        top_features['importance_normalized'],
        align='center', edgecolor='k')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Normalized Importance')
ax.set_title('Feature Importances')
plt.show()

# Cumulative importance against the number of features included.
cum_fig, cum_ax = plt.subplots(figsize=(8, 6))
cum_ax.plot(np.arange(len(importance_df)), importance_df['cumulative_importance'], 'r-')
cum_ax.set_xlabel('Number of feature')
cum_ax.set_ylabel('Cumulative Importance')
cum_ax.set_title('Cumulative Feature Importance')
plt.show()

threshold = .7

# Position of the first feature whose cumulative importance exceeds the threshold.
above_threshold = np.flatnonzero(importance_df['cumulative_importance'].to_numpy() > threshold)
if above_threshold.size == 0:
    raise ValueError('cumulative importance never exceeds the threshold %0.2f' % threshold)
importance_index = above_threshold[0]
print('%d feature required for %0.2f of cumulative importance' % (importance_index + 1, threshold))