In [66]:
import pandas as pd
from sklearn.linear_model import Ridge

# Looking at our combined dataframe - extracting the best shooters

In [67]:
# Load the combined per-player / per-season table.
stats_dataframe = pd.read_csv('../data/full_player_data_1991-2022.csv')

# A "sniper" season: 3P% above 0.45, more than 65 in 'G', Year after 2010,
# and a '3PA' value of at least 4.0.
is_sniper = (
    (stats_dataframe['3P%'] > 0.45)
    & (stats_dataframe['G'] > 65)
    & (stats_dataframe['Year'] > 2010)
    & (stats_dataframe['3PA'] >= 4.0)
)

# Select rows and columns in a single .loc call and build the result as one
# chain. Adding a column to a frame obtained from chained selections raises
# pandas' SettingWithCopyWarning, and in-place sorting makes the cell harder
# to re-run safely; .assign / .sort_values return new frames instead.
snipers = (
    stats_dataframe
    .loc[is_sniper, ['Player', 'Year', '3P%', '3PA', 'TRB', 'AST', 'PTS']]
    # TOTALS = PTS + TRB + AST, used only to order the table below
    .assign(TOTALS=lambda season: season['PTS'] + season['TRB'] + season['AST'])
    .sort_values(by=['TOTALS'], ascending=False)
)
snipers

Unnamed: 0,Player,Year,3P%,3PA,TRB,AST,PTS,TOTALS
5824,Stephen Curry,2016,0.454,11.2,5.4,6.7,30.1,42.2
6983,Stephen Curry,2013,0.453,7.7,4.0,6.9,22.9,33.8
11864,Joe Ingles,2021,0.451,6.1,3.6,4.7,12.1,20.4
3565,Joe Harris,2019,0.474,5.1,3.8,2.4,13.7,19.9
4388,Joe Harris,2021,0.475,6.4,3.6,1.9,14.1,19.6
10029,JJ Redick,2016,0.475,5.6,1.9,1.4,16.3,19.6
2665,Kyle Korver,2014,0.472,5.5,4.0,2.9,12.0,18.9
2680,Kyle Korver,2015,0.492,6.0,4.1,2.6,12.1,18.8
2652,Kyle Korver,2013,0.457,5.6,4.0,2.0,10.9,16.9
13636,Danny Green,2019,0.455,5.4,4.0,1.6,10.3,15.9


# Clean the data once more - check for and remove any null or missing values

In [68]:
# Count the missing entries in every column
stats_dataframe.isna().sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          59
3P            0
3PA           0
3P%        2086
2P            0
2PA           0
2P%         100
eFG%         59
FT            0
FTA           0
FT%         521
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [69]:
# Rows whose 3P% is missing, followed by the sum of their 3PA values
null_shooters: pd.DataFrame = stats_dataframe.loc[stats_dataframe['3P%'].isna()]
null_shooters.loc[:, ['3PA']].sum()

3PA    0.0
dtype: float64

In [70]:
# The null values are only in the percentage columns (FG%, 3P%, 2P%, eFG%, FT%).
# For 3P% the missing rows have a 3PA sum of 0, i.e. no attempts were recorded;
# presumably the other percentage columns are null for the same reason.
# Fill only those columns with 0. A blanket fillna(0) would also overwrite an
# unexpected null in any other column (e.g. Player or Share) and hide it from
# the null count displayed below.
percentage_columns = ['FG%', '3P%', '2P%', 'eFG%', 'FT%']
stats_dataframe = stats_dataframe.fillna({column: 0 for column in percentage_columns})

# Every column should now report 0 missing values
stats_dataframe.isna().sum()


Player     0
Pos        0
Age        0
Tm         0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        0
3P         0
3PA        0
3P%        0
2P         0
2PA        0
2P%        0
eFG%       0
FT         0
FTA        0
FT%        0
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
Year       0
Pts Won    0
Pts Max    0
Share      0
Team       0
W          0
L          0
W/L%       0
GB         0
PS/G       0
PA/G       0
SRS        0
dtype: int64

# Use the data for machine learning to predict MVP voting

## Extract Features from the data

In [71]:
# Every column of the combined dataframe, in file order
columns = [
    'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
    'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
    'PA/G', 'SRS',
]

# We are trying to predict MVP voting share.
# Predictors are every column except the player/team identifiers, Age,
# the vote columns (Pts Won, Pts Max) and the target itself.
target = 'Share'
non_feature_columns = {
    'Player', 'Pos', 'Age', 'Tm', 'Team', 'Pts Won', 'Pts Max', target,
}
prediction_features = [
    column for column in columns if column not in non_feature_columns
]


In [72]:
# Training data: every season before 2022
training_df = stats_dataframe.loc[stats_dataframe['Year'].lt(2022)]
# Testing data: the 2022 season, whose MVP vote share we want to predict
testing_df = stats_dataframe.loc[stats_dataframe['Year'].eq(2022)]

In [73]:
# Ridge regression: like linear regression, but it shrinks the coefficients
# (penalty strength alpha) to avoid overfitting
regressor_model = Ridge(alpha=0.1)

In [74]:
# Fit the model on the training seasons: feature columns -> vote share
regressor_model.fit(
    X=training_df[prediction_features],
    y=training_df[target],
)

In [75]:
# Predict the vote share for each 2022 row, keeping testing_df's index so the
# predictions can be lined up with the original rows
predictions = regressor_model.predict(X=testing_df[prediction_features])
predictions_df = pd.DataFrame(
    data={'predictions': predictions},
    index=testing_df.index,
)

In [76]:
# Put each 2022 player's actual share next to the predicted share
mvp_votes_with_preds_df = pd.concat(
    objs=(testing_df.loc[:, ['Player', 'Share']], predictions_df),
    axis='columns',
)
mvp_votes_with_preds_df

Unnamed: 0,Player,Share,predictions
648,Aaron Gordon,0.0,0.012752
649,Austin Rivers,0.0,-0.028979
650,Bol Bol,0.0,-0.005406
651,Bones Hyland,0.0,0.018208
652,Bryn Forbes,0.0,-0.005492
...,...,...,...
12508,Micah Potter,0.0,-0.018988
12509,Rodney McGruder,0.0,-0.011656
12510,Saben Lee,0.0,0.004580
12511,Saddiq Bey,0.0,0.002109


In [77]:
# Top 10 players by actual MVP vote share, shown next to the share our model predicted
mvp_votes_with_preds_df.sort_values(by='Share', ascending=False).iloc[:10]

Unnamed: 0,Player,Share,predictions
663,Nikola Jokić,0.875,0.190795
837,Joel Embiid,0.706,0.190147
11678,Giannis Antetokounmpo,0.595,0.218751
907,Devin Booker,0.216,0.091858
11469,Luka Dončić,0.146,0.15862
1179,Jayson Tatum,0.043,0.09688
12226,Ja Morant,0.01,0.120924
6398,Stephen Curry,0.004,0.091383
905,Chris Paul,0.002,0.076588
8241,LeBron James,0.001,0.154235


# Choosing an Error Metric & Assessing Accuracy

In [78]:
# NOTE(review): consider moving this import up to the notebook's import cell
from sklearn.metrics import mean_squared_error

# Mean squared error between the actual and the predicted vote share
mean_squared_error(
    y_true=mvp_votes_with_preds_df['Share'],
    y_pred=mvp_votes_with_preds_df['predictions'],
)


0.002234328777636906

In [79]:
# The error looks small when every player is considered, but most players
# don't receive any votes at all (593 of the 605 rows counted below have a
# share of 0), so the metric is dominated by those rows.
# Let's look at our model's performance with the top players.
mvp_votes_with_preds_df.loc[:, 'Share'].value_counts()

Share
0.000    593
0.001      3
0.875      1
0.706      1
0.002      1
0.216      1
0.043      1
0.004      1
0.146      1
0.595      1
0.010      1
Name: count, dtype: int64

In [84]:
# Order the players by actual vote share (highest first) and number the rows
# 1..N in that order as the actual MVP rank
mvp_votes_with_preds_df = (
    mvp_votes_with_preds_df
    .sort_values(by='Share', ascending=False)
    .assign(**{'Actual Rank': range(1, len(mvp_votes_with_preds_df) + 1)})
)
mvp_votes_with_preds_df.iloc[:10]

Unnamed: 0,Player,Share,predictions,Actual Rank
663,Nikola Jokić,0.875,0.190795,1
837,Joel Embiid,0.706,0.190147,2
11678,Giannis Antetokounmpo,0.595,0.218751,3
907,Devin Booker,0.216,0.091858,4
11469,Luka Dončić,0.146,0.15862,5
1179,Jayson Tatum,0.043,0.09688,6
12226,Ja Morant,0.01,0.120924,7
6398,Stephen Curry,0.004,0.091383,8
905,Chris Paul,0.002,0.076588,9
8241,LeBron James,0.001,0.154235,10
