In [141]:
import pandas as pd

In [142]:
stats = pd.read_csv('data/player_mvp_stats.csv')

In [143]:
pd.isnull(stats).sum()

Player         0
Age            0
Team           0
Pos            0
G              0
GS             0
MP             0
FG             0
FGA            0
FG%           65
3P             0
3PA            0
3P%         2134
2P             0
2PA            0
2P%          110
eFG%          65
FT             0
FTA            0
FT%          580
ORB            0
DRB            0
TRB            0
AST            0
STL            0
BLK            0
TOV            0
PF             0
PTS            0
Awards     13964
Year           0
Pts Won        0
Pts Max        0
Share          0
W              0
L              0
W/L%           0
GB             0
PS/G           0
PA/G           0
SRS            0
dtype: int64

In [144]:
stats = stats.fillna(0)

In [145]:
stats.dtypes

Player      object
Age          int64
Team        object
Pos         object
G            int64
GS           int64
MP         float64
FG         float64
FGA        float64
FG%        float64
3P         float64
3PA        float64
3P%        float64
2P         float64
2PA        float64
2P%        float64
eFG%       float64
FT         float64
FTA        float64
FT%        float64
ORB        float64
DRB        float64
TRB        float64
AST        float64
STL        float64
BLK        float64
TOV        float64
PF         float64
PTS        float64
Awards      object
Year         int64
Pts Won    float64
Pts Max    float64
Share      float64
W            int64
L            int64
W/L%       float64
GB         float64
PS/G       float64
PA/G       float64
SRS        float64
dtype: object

In [146]:
stats.columns

Index(['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards',
       'Year', 'Pts Won', 'Pts Max', 'Share', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [147]:
# Remove strings and elements to predict
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'Year', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [148]:
train = stats[stats["Year"] < 2024]
test = stats[stats["Year"] == 2024]

In [149]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [150]:
reg = Ridge(alpha=0.2)

In [151]:
reg.fit(train[predictors], train["Share"])

In [152]:
prediction = pd.DataFrame(reg.predict(test[predictors]), columns=["Predicted Share"], index=test.index)

In [153]:
prediction.head()

Unnamed: 0,Predicted Share
229,0.000357
230,-0.019613
231,0.021401
232,0.007396
233,-0.013766


In [154]:
combined = pd.concat([test[["Player", "Share"]], prediction], axis=1)

In [155]:
combined[combined["Share"] > 0.1]

Unnamed: 0,Player,Share,Predicted Share
236,Giannis Antetokounmpo,0.194,0.212296
346,Luka Dončić,0.572,0.188664
811,Nikola Jokić,0.935,0.172789
1449,Shai Gilgeous-Alexander,0.646,0.168487
4086,Jalen Brunson,0.143,0.099097


In [156]:
mean_squared_error(combined["Share"], combined["Predicted Share"])

0.0024749870301202528

In [158]:
actual = combined.sort_values("Share", ascending=False)
predicted = combined.sort_values("Predicted Share", ascending=False)
actual["Rank"] = list(range(1,actual.shape[0]+1))
predicted["Predicted Rank"] = list(range(1,predicted.shape[0]+1))

In [159]:
actual.head()

Unnamed: 0,Player,Share,Predicted Share,Rank
811,Nikola Jokić,0.935,0.172789,1
1449,Shai Gilgeous-Alexander,0.646,0.168487,2
346,Luka Dončić,0.572,0.188664,3
236,Giannis Antetokounmpo,0.194,0.212296,4
4086,Jalen Brunson,0.143,0.099097,5


In [160]:
# Sort values by most predicted Share
combined = combined.sort_values(by="Predicted Share", ascending=False)
combined["Predicted Rank"] = list((range(1, len(combined) + 1)))
combined.head()

Unnamed: 0,Player,Share,Predicted Share,Predicted Rank
236,Giannis Antetokounmpo,0.194,0.212296,1
14107,Joel Embiid,0.0,0.206347,2
346,Luka Dončić,0.572,0.188664,3
811,Nikola Jokić,0.935,0.172789,4
1449,Shai Gilgeous-Alexander,0.646,0.168487,5


In [161]:
combined.head()

Unnamed: 0,Player,Share,Predicted Share,Predicted Rank
236,Giannis Antetokounmpo,0.194,0.212296,1
14107,Joel Embiid,0.0,0.206347,2
346,Luka Dončić,0.572,0.188664,3
811,Nikola Jokić,0.935,0.172789,4
1449,Shai Gilgeous-Alexander,0.646,0.168487,5


In [163]:
combined.sort_values(by="Share", ascending=False).head(10)

Unnamed: 0,Player,Share,Predicted Share,Predicted Rank
811,Nikola Jokić,0.935,0.172789,4
1449,Shai Gilgeous-Alexander,0.646,0.168487,5
346,Luka Dončić,0.572,0.188664,3
236,Giannis Antetokounmpo,0.194,0.212296,1
4086,Jalen Brunson,0.143,0.099097,12
3078,Jayson Tatum,0.087,0.113793,8
8965,Anthony Edwards,0.018,0.088288,17
4513,Domantas Sabonis,0.003,0.09725,13
12754,Kevin Durant,0.001,0.102151,10
4181,Leaky Black,0.0,-0.00989,393


In [164]:
def find_ap(df):
    actual = df.sort_values(by="Share", ascending=False).head()
    predicted = df.sort_values(by="Predicted Share", ascending=False)
    ps= []
    found = 0
    seen = 1

    for i, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found / seen)
        seen+=1
    return sum(ps)/len(ps)

In [165]:
find_ap(combined)

0.7266666666666667

In [166]:
years = list(range(1991, 2025))

In [177]:
def add_ranks(combined):
    combined = combined.sort_values(by="Share", ascending=False)
    combined["Rank"] = list((range(1, len(combined) + 1)))
    combined = combined.sort_values(by="Predicted Share", ascending=False)
    combined["Predicted Rank"] = list((range(1, len(combined) + 1)))
    combined['Difference'] = combined["Rank"] - combined["Predicted Rank"]
    return combined

In [178]:
rankings = add_ranks(all_predictions[1]).sort_values(by="Difference", ascending=False)
rankings[rankings["Rank"] < 11]

Unnamed: 0,Player,Share,Predicted Share,Rank,Predicted Rank,Difference
14160,Shaquille O'Neal,0.006,0.198205,10,1,9
13518,Hakeem Olajuwon,0.083,0.136317,7,4,3
4932,Patrick Ewing,0.05,0.127956,8,7,1
11771,Michael Jordan,0.832,0.167594,2,3,-1
1943,Karl Malone,0.857,0.192113,1,2,-1
1133,Grant Hill,0.327,0.128691,3,6,-3
10194,Gary Payton,0.091,0.093524,6,10,-4
5411,Tim Hardaway,0.207,0.060319,4,20,-16
9376,Anthony Mason,0.006,0.052925,9,27,-18
9379,Glen Rice,0.117,0.033395,5,53,-48


In [179]:
def backtest(stats, model, year, predictors):
    aps = []
    all_predictions = []
    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors],train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["Predicted Share"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [180]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [181]:
mean_ap

0.7199971275432467

In [182]:
all_predictions.head()

Unnamed: 0,Player,Share,Predicted Share,Rank,Predicted Rank,Difference
8245,Shaquille O'Neal,0.056,0.213851,10,1,9
11039,David Robinson,0.508,0.21131,2,2,0
5711,Hakeem Olajuwon,0.211,0.204615,4,3,1
1933,Karl Malone,0.075,0.181586,7,4,3
11757,Michael Jordan,0.986,0.174356,1,5,-4


In [183]:
all_predictions[all_predictions["Rank"] <= 5].sort_values("Difference").head()

Unnamed: 0,Player,Share,Predicted Share,Rank,Predicted Rank,Difference
1567,Jason Kidd,0.712,0.028455,2,52,-50
9379,Glen Rice,0.117,0.033395,5,53,-48
5934,Steve Nash,0.839,0.034067,1,46,-45
9647,Peja Stojaković,0.228,0.036813,4,38,-34
5952,Steve Nash,0.739,0.05424,1,34,-33


In [184]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.117762,eFG%
29,0.029797,W/L%
18,0.029687,DRB
17,0.016143,ORB
10,0.015175,2P
21,0.012395,STL
15,0.010367,FTA
22,0.010235,BLK
12,0.010101,2P%
25,0.007521,PTS


In [185]:
stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())

In [97]:
stat_ratios.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,PTS,AST,STL,BLK,3P,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1991,0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1991,1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
1991,2,0.311795,0.093492,0.274608,1.571429,0.0,1.0
1991,3,0.20044,0.186984,0.274608,0.0,0.0,1.0
1991,4,2.383005,1.63611,1.78495,0.897959,1.52576,1.0
1991,5,0.322931,1.16865,0.411912,0.0,1.52576,1.0
1991,6,2.160294,5.843249,1.78495,0.44898,5.085865,1.0
1991,7,0.445421,0.140238,0.411912,0.673469,0.0,1.0
1991,8,1.503297,0.70119,1.235735,2.469388,1.017173,1.0
1991,9,1.102418,0.46746,0.549215,0.22449,0.0,1.0


In [187]:
stats[["Player", "Year"]].head()

Unnamed: 0,Player,Year
0,A.C. Green,1991
1,Byron Scott,1991
2,Elden Campbell,1991
3,Irving Thomas,1991
4,James Worthy,1991


In [188]:
stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]].values

In [189]:
predictors+= ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,27,Los Angeles Lakers,PF,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,29,Los Angeles Lakers,SG,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,22,Los Angeles Lakers,PF,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,25,Los Angeles Lakers,PF,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,29,Los Angeles Lakers,SF,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [190]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [191]:
mean_ap

0.7199971275432467

In [192]:
stats["Pos"].unique()

array(['PF', 'SG', 'SF', 'PG', 'C'], dtype=object)

In [193]:
# Position Category
stats["NPos"] = stats["Pos"].astype("category").cat.codes

In [195]:
stats["Team"].unique()

array(['Los Angeles Lakers', 'Phoenix Suns', 'Dallas Mavericks',
       'Miami Heat', 'Cleveland Cavaliers', 'Washington Bullets',
       'Milwaukee Bucks', 'Chicago Bulls', 'Golden State Warriors',
       'Indiana Pacers', 'Washington Wizards', 'Minnesota Timberwolves',
       'Boston Celtics', 'Atlanta Hawks', 'Houston Rockets',
       'Denver Nuggets', 'Orlando Magic', 'New Orleans Hornets',
       'Toronto Raptors', 'Sacramento Kings', 'Charlotte Hornets',
       'Philadelphia 76ers', 'Portland Trail Blazers', 'Detroit Pistons',
       'Oklahoma City Thunder', 'Utah Jazz', 'Vancouver Grizzlies',
       'Seattle SuperSonics', 'New Jersey Nets',
       'New Orleans/Oklahoma City Hornets', 'Los Angeles Clippers',
       'Charlotte Bobcats', 'Memphis Grizzlies', 'New York Knicks',
       'New Orleans Pelicans', 'Brooklyn Nets', 'San Antonio Spurs'],
      dtype=object)

In [None]:
# Team category
stats["NTeam"] = stats["Team"].astype("category").cat.codes

In [198]:
# Add new categorical metrics
predictors+= ["NPos", "NTeam"]

In [196]:
# Utilizing random forest
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=1, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors)

In [197]:
mean_ap

0.734190560397457