In [352]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# read in csv with all stats and data
data = pd.read_csv("player_mvp_stats.csv")
data

In [None]:
# cleaning
del data["Unnamed: 0"]

# check for null values (should only be for players who attempted 0 fg)
pd.isnull(data).sum()


In [None]:
# replace null values with 0 for simplicity
data = data.fillna(0)

In [39]:
# select which columns should be used as predictors
data.columns
predictors = ["Age", "G", "GS", "MP", "FG", "FGA", 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 
                   'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%','GB', 'PS/G', 'PA/G', 'SRS']
                  
# GOAL = PREDICT %VOTES of SHARE == most likely to win MVP

In [50]:
# training and testing dfs
training_data = data[data["Year"] < 2023]
test_data = data[data["Year"] == 2023]

# initialize model
model = Ridge(alpha = 0.1)


In [54]:
# start predictions
model.fit(training_data[predictors], training_data["Share"])

In [58]:
# run model with predictors
predictions = model.predict(test_data[predictors])
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test_data.index)
predictions

Unnamed: 0,predictions
506,-0.018791
507,-0.022836
508,-0.006528
509,-0.010540
510,0.025746
...,...
15233,-0.013767
15234,-0.004929
15235,-0.020780
15249,0.190310


In [68]:
# add column of predictions
combine = pd.concat([test_data[["Player", "Share"]], predictions], axis = 1)
# see frontrunners 
combine.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
11333,Joel Embiid,0.915,0.199315
15250,Nikola Jokic,0.674,0.16781
8327,Giannis Antetokounmpo,0.606,0.221404
1031,Jayson Tatum,0.28,0.13271
10296,Shai Gilgeous-Alexander,0.046,0.149327
2763,Donovan Mitchell,0.03,0.085517
12843,Domantas Sabonis,0.027,0.089037
15249,Luka Doncic,0.01,0.19031
4855,Stephen Curry,0.005,0.102431
7820,Jimmy Butler,0.003,0.110681


In [78]:
# error metric
# mean_squared_error(combine["Share"], combine["predictions"])
# combine["Share"].value_counts
# mean square error metric invalid as it accounts for all players --> irrelevant in our case

<bound method IndexOpsMixin.value_counts of 506      0.000
507      0.000
508      0.000
509      0.000
510      0.000
         ...  
15233    0.000
15234    0.000
15235    0.000
15249    0.010
15250    0.674
Name: Share, Length: 541, dtype: float64>

In [96]:
# Rank players based on share
combine = combine.sort_values("Share", ascending = False)
combine["Rank"] = list((range(1,combine.shape[0] + 1)))
combine.head(10)

Unnamed: 0,Player,Share,predictions,Rank,Predicted Rank
11333,Joel Embiid,0.915,0.199315,1,2
15250,Nikola Jokic,0.674,0.16781,2,6
8327,Giannis Antetokounmpo,0.606,0.221404,3,1
1031,Jayson Tatum,0.28,0.13271,4,11
10296,Shai Gilgeous-Alexander,0.046,0.149327,5,7
2763,Donovan Mitchell,0.03,0.085517,6,24
12843,Domantas Sabonis,0.027,0.089037,7,22
15249,Luka Doncic,0.01,0.19031,8,3
4855,Stephen Curry,0.005,0.102431,9,16
7820,Jimmy Butler,0.003,0.110681,10,15


In [150]:
# Add predicted rank 
combine = combine.sort_values("predictions", ascending=False)
combine["Predicted Rank"] = list((range(1, combine.shape[0]+1)))
combine.head(10)

Unnamed: 0,Player,Share,predictions,Rank,Predicted Rank
8327,Giannis Antetokounmpo,0.606,0.221404,3,1
11333,Joel Embiid,0.915,0.199315,1,2
15249,Luka Doncic,0.01,0.19031,8,3
15250,Nikola Jokic,0.674,0.16781,2,4
10296,Shai Gilgeous-Alexander,0.046,0.149327,5,5
6913,Anthony Davis,0.0,0.13794,373,6
11839,Kevin Durant,0.0,0.137732,372,7
12332,Damian Lillard,0.0,0.132996,371,8
1031,Jayson Tatum,0.28,0.13271,4,9
6920,LeBron James,0.0,0.128718,370,10


In [160]:
# Implement Error Metric (avg precision)
def find_avg_precision(data):
    actual = combine.sort_values("Share", ascending=False).head(5)
    predicted = combine.sort_values("predictions", ascending=False)
    scores = []
    seen = 1
    found = 0
    for index,row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            scores.append(found / seen)
        seen += 1

    return sum(scores)/len(scores)

In [162]:
find_avg_precision(combine)

0.8211111111111112

In [167]:
years = list(range(1991,2024))

In [179]:
# Backtesting starting from previous years
errors = []
all_predictions = []
# test 5 years previous
for year in years[5:]:
    train = data[data["Year"] < year]
    test = data[data["Year"] == year]
    model.fit(train[predictors], train["Share"])
    predictions = model.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index = test.index)
    combine = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
    all_predictions.append(combine)
    # append average precisions
    errors.append(find_avg_precision(combine))
    
    
    

In [181]:
# mean error across all years
sum(errors)/len(errors)

0.7165752582436936

In [187]:
# function to add ranks into df
def add_ranks(combine):
    combine = combine.sort_values("Share", ascending = False)
    combine["Rank"] = list((range(1,combine.shape[0] + 1)))
    combine = combine.sort_values("predictions", ascending=False)
    combine["Predicted Rank"] = list((range(1, combine.shape[0]+1)))
    # diff in predicted rank
    combine["Rank Diff"] = combine["Rank"] - combine["Predicted Rank"]
    return combine

In [197]:
rankings = add_ranks(all_predictions[1])
rankings[rankings["Rank"] < 6].sort_values("Rank Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Rank,Predicted Rank,Rank Diff
14194,Karl Malone,0.857,0.19236,1,2,-1
1820,Michael Jordan,0.832,0.167672,2,3,-1
3916,Grant Hill,0.327,0.128664,3,6,-3
7397,Tim Hardaway,0.207,0.059992,4,20,-16
1493,Glen Rice,0.117,0.033122,5,53,-48


In [326]:
# backtesting function using prev code
def backtest(data, model, year, predictors):
    errors = []
    all_predictions = []
    # test 5 years previous
    for year in years[5:]:
        train = data[data["Year"] < year]
        test = data[data["Year"] == year]
        model.fit(train[predictors], train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index = test.index)
        combine = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
        combine = add_ranks(combine)
        all_predictions.append(combine)
        # append average precisions
        errors.append(find_avg_precision(combine))

    return sum(errors)/len(errors), errors, pd.concat(all_predictions)

In [328]:
mean_ap, aps, all_predictions = backtest(data, model, years, predictors)
mean_ap

0.7544011544011543

In [330]:
# check lowest precision for top 5 candidates
all_predictions[all_predictions["Rank"] <= 5].sort_values("Rank Diff").head(10)

Unnamed: 0,Player,Share,predictions,Rank,Predicted Rank,Rank Diff
9018,Jason Kidd,0.712,0.025505,2,60,-58
2073,Joakim Noah,0.258,0.038525,4,46,-42
11555,Steve Nash,0.839,0.036523,1,41,-40
1493,Glen Rice,0.117,0.04192,5,45,-40
11793,Chris Paul,0.138,0.060147,5,36,-31
15237,Peja Stojakovic,0.228,0.040443,4,35,-31
11573,Steve Nash,0.739,0.06262,1,26,-25
7397,Tim Hardaway,0.207,0.05596,4,28,-24
4050,Chauncey Billups,0.344,0.055074,5,29,-24
1809,Scottie Pippen,0.2,0.062497,5,24,-19


In [312]:
# Adding more predictors
# ratio of player stats to league average
ratios = data[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").transform(lambda x: x / x.mean())
ratios

Unnamed: 0,PTS,AST,STL,BLK,3P
0,1.692601,2.010078,2.608773,1.346939,5.594452
1,2.884104,1.542618,2.059558,1.795918,5.085865
2,0.679268,0.327222,0.549215,0.673469,0.000000
3,0.055678,0.000000,0.000000,0.673469,0.000000
4,1.514433,1.262142,0.823823,0.224490,1.017173
...,...,...,...,...,...
15246,2.930330,4.126399,2.097648,1.683977,1.291422
15247,3.419922,4.604499,1.969713,1.618667,3.370410
15248,3.263376,4.181097,2.462142,2.428000,1.413398
15249,3.524501,3.820284,2.286145,1.304872,2.818233


In [336]:
# add to 
data[["PTS_R", "AST_R","STL_R", "BLK_R", "3P_R"]] = ratios[["PTS", "AST","STL", "BLK", "3P"]]

In [338]:
data.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,Doc Rivers,PG,29,ATL,79,79,32.7,5.6,12.9,0.435,...,0.524,18.0,109.8,109.0,0.72,1.692601,2.010078,2.608773,1.346939,5.594452
1,Dominique Wilkins,SF,31,ATL,81,81,38.0,9.5,20.2,0.47,...,0.524,18.0,109.8,109.0,0.72,2.884104,1.542618,2.059558,1.795918,5.085865
2,Duane Ferrell,SF,25,ATL,78,2,14.9,2.2,4.6,0.489,...,0.524,18.0,109.8,109.0,0.72,0.679268,0.327222,0.549215,0.673469,0.0
3,Gary Leonard,C,23,ATL,4,0,2.3,0.0,0.0,0.0,...,0.524,18.0,109.8,109.0,0.72,0.055678,0.0,0.0,0.673469,0.0
4,John Battle,SG,28,ATL,79,2,23.6,5.0,10.9,0.461,...,0.524,18.0,109.8,109.0,0.72,1.514433,1.262142,0.823823,0.22449,1.017173


In [340]:
# add new stats to predictors
predictors += ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [348]:
mean_ap, errors, all_predictions = backtest(data, model, years, predictors)
mean_ap

0.7544011544011543

In [346]:
# numerical category for position and stats
data["NPos"] = data["Pos"].astype("category").cat.codes
data["NTm"] = data["Tm"].astype("category").cat.codes



In [362]:
# Random Forest Model
rfm = RandomForestRegressor(n_estimators=10, random_state=1, min_samples_split=5)
mean, aps, all_predictions = backtest(data, rfm, years[29:], predictors + ["NPos", "NTm"])


In [364]:
mean

0.7544011544011543

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R,NPos,NTm
0,Doc Rivers,PG,29,ATL,79,79,32.7,5.6,12.9,0.435,...,109.8,109.0,0.72,1.692601,2.010078,2.608773,1.346939,5.594452,5,0
1,Dominique Wilkins,SF,31,ATL,81,81,38.0,9.5,20.2,0.470,...,109.8,109.0,0.72,2.884104,1.542618,2.059558,1.795918,5.085865,8,0
2,Duane Ferrell,SF,25,ATL,78,2,14.9,2.2,4.6,0.489,...,109.8,109.0,0.72,0.679268,0.327222,0.549215,0.673469,0.000000,8,0
3,Gary Leonard,C,23,ATL,4,0,2.3,0.0,0.0,0.000,...,109.8,109.0,0.72,0.055678,0.000000,0.000000,0.673469,0.000000,0,0
4,John Battle,SG,28,ATL,79,2,23.6,5.0,10.9,0.461,...,109.8,109.0,0.72,1.514433,1.262142,0.823823,0.224490,1.017173,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15246,Nikola Jokic,C,25,DEN,72,72,34.6,10.2,18.0,0.566,...,115.1,110.1,4.82,2.930330,4.126399,2.097648,1.683977,1.291422,0,9
15247,Luka Doncic,PG,22,DAL,65,65,35.4,9.9,21.6,0.457,...,108.0,104.7,3.12,3.419922,4.604499,1.969713,1.618667,3.370410,5,8
15248,Nikola Jokic,C,26,DEN,74,74,33.5,10.3,17.7,0.583,...,112.7,110.4,2.16,3.263376,4.181097,2.462142,2.428000,1.413398,0,9
15249,Luka Doncic,PG,23,DAL,66,66,36.2,10.9,22.0,0.496,...,114.2,114.1,-0.14,3.524501,3.820284,2.286145,1.304872,2.818233,5,8
