In [2]:
# Use NBA API to get the data we need

#'leaguedashplayerstats' gets the box score stats for every player in the league
#'playercareerstats' gives specific player stats
#'time' used to wait before sending another request to the NBA servers
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.endpoints import playercareerstats
import time

import pandas as pd
import unicodedata

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import os
from datetime import datetime

from helper_functions import get_train_window, get_season_stats, clean_name, filter_top_50_fantasy_performers

In [3]:
# Here, we define what Seasons we want to use for our model.
# After adding to our 'seasons' list, we call our function get_season_stats()
# for each season and append it to a temp list called 'all_seasons_data'

all_seasons_data = []
seasons = ['2017-18','2018-19','2019-20', '2020-21', '2021-22', '2022-23']
# seasons = ['2020-21', '2021-22', '2022-23']

for year in seasons:
    df = get_season_stats(year)
    all_seasons_data.append(df)
    time.sleep(2)


In [4]:
# Here, we put all the seasons into one Dataframe and will clean it up.
stats_df = pd.concat(all_seasons_data, ignore_index=True)

# Clean up players that are not possible contendors for MVP.
# Players must have played 65 or more games.
print(f"Players before filtering: {len(stats_df)}")
stats_df = stats_df[stats_df['GP'] >= 65]
print(f"Players after filtering: {len(stats_df)}")

# Clean up players that are not in top winning teams.
print(f"Players before filtering: {len(stats_df)}")
stats_df = stats_df[stats_df['W_PCT'] >= 0.600]
print(f"Players after filtering: {len(stats_df)}")


# Clean up players that are not top scorers.
print(f"Players before filtering: {len(stats_df)}")
stats_df = stats_df[stats_df['PTS'] >= 1000]
print(f"Players after filtering: {len(stats_df)}")

# Clean up players that are not top fantasy performers.
print(f"Players before filtering: {len(stats_df)}")
stats_df = filter_top_50_fantasy_performers(stats_df)
print(f"Players after filtering: {len(stats_df)}")

Players before filtering: 3283
Players after filtering: 961
Players before filtering: 961
Players after filtering: 306
Players before filtering: 306
Players after filtering: 109
Players before filtering: 109
Players after filtering: 72


In [5]:
# We clean up our data's characters in their names by stripping out special accents.
# Example: "Nikola Jokić" to "Nikola Jokic"
# This is to ensure that the names will match when we merge with the voting data.
stats_df['PLAYER_NAME'] = stats_df['PLAYER_NAME'].apply(clean_name)
stats_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,TEAM_COUNT,Season
29,203076,Anthony Davis,Anthony,1610612740,NOP,25.0,75,45,30,0.600,...,442,1,3,34,3,6,15,4,1,2017-18
40,1627732,Ben Simmons,Ben,1610612755,PHI,21.0,81,51,30,0.630,...,517,22,36,11,8,11,3,14,1,2017-18
75,203991,Clint Capela,Clint,1610612745,HOU,24.0,74,62,12,0.838,...,487,49,65,13,28,8,32,40,1,2017-18
87,203081,Damian Lillard,Damian,1610612757,POR,27.0,73,44,29,0.603,...,331,8,6,21,11,54,32,7,1,2017-18
112,201942,DeMar DeRozan,DeMar,1610612761,TOR,28.0,80,57,23,0.713,...,425,9,8,8,18,228,32,17,1,2017-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2979,1627759,Jaylen Brown,Jaylen,1610612738,BOS,26.0,67,46,21,0.687,...,467,39,14,27,24,50,39,17,1,2022-23
2982,1628369,Jayson Tatum,Jayson,1610612738,BOS,25.0,74,52,22,0.703,...,450,7,1,6,4,18,17,1,1,2022-23
2995,203954,Joel Embiid,Joel,1610612755,PHI,29.0,66,43,23,0.652,...,507,2,2,8,3,9,17,3,1,2022-23
3021,201950,Jrue Holiday,Jrue,1610612749,MIL,33.0,67,50,17,0.746,...,352,100,52,5,43,41,39,41,1,2022-23


In [6]:
# Here is a some test code to check if some key players are still in our Dataframe after filtering.
specific_stats = stats_df[stats_df['PLAYER_NAME'] == 'Nikola Jokic']

print(specific_stats)

specific_stats = stats_df[stats_df['PLAYER_NAME'] == 'Stephen Curry']

print(specific_stats)


      PLAYER_ID   PLAYER_NAME NICKNAME     TEAM_ID TEAM_ABBREVIATION   AGE  \
928      203999  Nikola Jokic   Nikola  1610612743               DEN  24.0   
1465     203999  Nikola Jokic   Nikola  1610612743               DEN  25.0   
1998     203999  Nikola Jokic   Nikola  1610612743               DEN  26.0   
2588     203999  Nikola Jokic   Nikola  1610612743               DEN  27.0   
3143     203999  Nikola Jokic   Nikola  1610612743               DEN  28.0   

      GP   W   L  W_PCT  ...  PF_RANK  PFD_RANK  PTS_RANK  PLUS_MINUS_RANK  \
928   80  53  27  0.663  ...      515        10        22               29   
1465  73  46  27  0.630  ...      524        10        18               29   
1998  72  47  25  0.653  ...      529         4         3               10   
2588  74  46  28  0.622  ...      577         3         5                7   
3143  69  48  21  0.696  ...      470        11        17                1   

      NBA_FANTASY_PTS_RANK  DD2_RANK  TD3_RANK  WNBA_FANTASY_P

In [7]:
# Now, we are loading the historical records of who actually got MVP votes.
# These files were pulled from Kaggle --> kagglehub.dataset_download("parthdande/nba-mvp-voting-dataset-2000-2021")
# The files were downloaded using mvp_voting_download.ipynb.

# We load in the files and clean up the names as well so they match up.

files = ["./data/2001-2010MVPData.csv", "./data/2010-2021MVPData.csv", "./data/2022-2023MVPData.csv"]

mvp_list = []
for file in files:
    temp_df = pd.read_csv(file)
    mvp_list.append(temp_df)

total_mvp_df = pd.concat(mvp_list, ignore_index=True)
total_mvp_df['Player'] = total_mvp_df['Player'].apply(clean_name)
total_mvp_df

Unnamed: 0.1,Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,year
0,0,1,Allen Iverson,25,PHI,93.0,1121.0,1240,0.904,71,...,3.8,4.6,2.5,0.3,0.420,0.320,0.814,11.8,0.190,2001
1,1,2,Tim Duncan,24,SAS,18.0,706.0,1240,0.569,82,...,12.2,3.0,0.9,2.3,0.499,0.259,0.618,13.2,0.200,2001
2,2,3,Shaquille O'Neal,28,LAL,7.0,578.0,1240,0.466,74,...,12.7,3.7,0.6,2.8,0.572,0.000,0.513,14.9,0.245,2001
3,3,4,Chris Webber,27,SAC,5.0,521.0,1240,0.420,70,...,11.1,4.2,1.3,1.7,0.481,0.071,0.703,11.0,0.186,2001
4,4,5,Kevin Garnett,24,MIN,1.0,151.0,1240,0.122,81,...,11.4,5.0,1.4,1.8,0.477,0.288,0.764,11.8,0.176,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,8,9,Stephen Curry,34,GSW,0.0,5.0,1000,0.005,56,...,6.1,6.3,0.9,0.4,0.493,0.427,0.915,7.8,0.192,2023
334,9,10,Jimmy Butler,33,MIA,0.0,3.0,1000,0.003,64,...,5.9,5.3,1.8,0.3,0.539,0.350,0.850,12.3,0.277,2023
335,10,11,De'Aaron Fox,25,SAC,0.0,2.0,1000,0.002,73,...,4.2,6.1,1.1,0.3,0.512,0.324,0.780,7.4,0.146,2023
336,11,12T,Jalen Brunson,26,NYK,0.0,1.0,1000,0.001,68,...,3.5,6.2,0.9,0.2,0.491,0.416,0.829,8.7,0.175,2023


In [8]:
# The NBA API uses '2022-23' while our CSV files use '2023'
# We need to convert the seasons to match up
def convert_year_to_season(year):
    # Converts 2023 to "2022-23" so the CSV files have a matching year with our NBA API data
    prev_year = year - 1
    # Get the last two digits of the current year (e.g., 23)
    last_two = str(year)[-2:]
    return f"{prev_year}-{last_two}"

total_mvp_df['Season'] = total_mvp_df['year'].apply(convert_year_to_season)
total_mvp_df

Unnamed: 0.1,Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,...,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,year,Season
0,0,1,Allen Iverson,25,PHI,93.0,1121.0,1240,0.904,71,...,4.6,2.5,0.3,0.420,0.320,0.814,11.8,0.190,2001,2000-01
1,1,2,Tim Duncan,24,SAS,18.0,706.0,1240,0.569,82,...,3.0,0.9,2.3,0.499,0.259,0.618,13.2,0.200,2001,2000-01
2,2,3,Shaquille O'Neal,28,LAL,7.0,578.0,1240,0.466,74,...,3.7,0.6,2.8,0.572,0.000,0.513,14.9,0.245,2001,2000-01
3,3,4,Chris Webber,27,SAC,5.0,521.0,1240,0.420,70,...,4.2,1.3,1.7,0.481,0.071,0.703,11.0,0.186,2001,2000-01
4,4,5,Kevin Garnett,24,MIN,1.0,151.0,1240,0.122,81,...,5.0,1.4,1.8,0.477,0.288,0.764,11.8,0.176,2001,2000-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,8,9,Stephen Curry,34,GSW,0.0,5.0,1000,0.005,56,...,6.3,0.9,0.4,0.493,0.427,0.915,7.8,0.192,2023,2022-23
334,9,10,Jimmy Butler,33,MIA,0.0,3.0,1000,0.003,64,...,5.3,1.8,0.3,0.539,0.350,0.850,12.3,0.277,2023,2022-23
335,10,11,De'Aaron Fox,25,SAC,0.0,2.0,1000,0.002,73,...,6.1,1.1,0.3,0.512,0.324,0.780,7.4,0.146,2023,2022-23
336,11,12T,Jalen Brunson,26,NYK,0.0,1.0,1000,0.001,68,...,6.2,0.9,0.2,0.491,0.416,0.829,8.7,0.175,2023,2022-23


In [9]:
# We are performing a "Left Join."
# We keep all player stats from 'stats_df' and attach the MVP 'Share' column where a match is found.
# If a player didn't receive any votes, the share will be 'NaN' (empty).
# We fill those empties with 0.0 because it's important for the model to know what a non-MVP looks like.
mvp_subset = total_mvp_df[['Player', 'Season', 'Share']]

merged_df = pd.merge(
    stats_df, 
    mvp_subset, 
    left_on=['PLAYER_NAME', 'Season'], 
    right_on=['Player', 'Season'], 
    how='left'
)

merged_df['Share'] = merged_df['Share'].fillna(0)


In [10]:
# Block of code to open data in Data Wrangler extension
stats_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,TEAM_COUNT,Season
29,203076,Anthony Davis,Anthony,1610612740,NOP,25.0,75,45,30,0.600,...,442,1,3,34,3,6,15,4,1,2017-18
40,1627732,Ben Simmons,Ben,1610612755,PHI,21.0,81,51,30,0.630,...,517,22,36,11,8,11,3,14,1,2017-18
75,203991,Clint Capela,Clint,1610612745,HOU,24.0,74,62,12,0.838,...,487,49,65,13,28,8,32,40,1,2017-18
87,203081,Damian Lillard,Damian,1610612757,POR,27.0,73,44,29,0.603,...,331,8,6,21,11,54,32,7,1,2017-18
112,201942,DeMar DeRozan,DeMar,1610612761,TOR,28.0,80,57,23,0.713,...,425,9,8,8,18,228,32,17,1,2017-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2979,1627759,Jaylen Brown,Jaylen,1610612738,BOS,26.0,67,46,21,0.687,...,467,39,14,27,24,50,39,17,1,2022-23
2982,1628369,Jayson Tatum,Jayson,1610612738,BOS,25.0,74,52,22,0.703,...,450,7,1,6,4,18,17,1,1,2022-23
2995,203954,Joel Embiid,Joel,1610612755,PHI,29.0,66,43,23,0.652,...,507,2,2,8,3,9,17,3,1,2022-23
3021,201950,Jrue Holiday,Jrue,1610612749,MIL,33.0,67,50,17,0.746,...,352,100,52,5,43,41,39,41,1,2022-23


In [11]:
# Block of code to open data in Data Wrangler extension
mvp_subset

Unnamed: 0,Player,Season,Share
0,Allen Iverson,2000-01,0.904
1,Tim Duncan,2000-01,0.569
2,Shaquille O'Neal,2000-01,0.466
3,Chris Webber,2000-01,0.420
4,Kevin Garnett,2000-01,0.122
...,...,...,...
333,Stephen Curry,2022-23,0.005
334,Jimmy Butler,2022-23,0.003
335,De'Aaron Fox,2022-23,0.002
336,Jalen Brunson,2022-23,0.001


In [12]:
# Block of code to open data in Data Wrangler extension
merged_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,TEAM_COUNT,Season,Player,Share
0,203076,Anthony Davis,Anthony,1610612740,NOP,25.0,75,45,30,0.600,...,3,34,3,6,15,4,1,2017-18,Anthony Davis,0.441
1,1627732,Ben Simmons,Ben,1610612755,PHI,21.0,81,51,30,0.630,...,36,11,8,11,3,14,1,2017-18,,0.000
2,203991,Clint Capela,Clint,1610612745,HOU,24.0,74,62,12,0.838,...,65,13,28,8,32,40,1,2017-18,,0.000
3,203081,Damian Lillard,Damian,1610612757,POR,27.0,73,44,29,0.603,...,6,21,11,54,32,7,1,2017-18,Damian Lillard,0.205
4,201942,DeMar DeRozan,DeMar,1610612761,TOR,28.0,80,57,23,0.713,...,8,8,18,228,32,17,1,2017-18,DeMar DeRozan,0.032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,1627759,Jaylen Brown,Jaylen,1610612738,BOS,26.0,67,46,21,0.687,...,14,27,24,50,39,17,1,2022-23,,0.000
68,1628369,Jayson Tatum,Jayson,1610612738,BOS,25.0,74,52,22,0.703,...,1,6,4,18,17,1,1,2022-23,Jayson Tatum,0.280
69,203954,Joel Embiid,Joel,1610612755,PHI,29.0,66,43,23,0.652,...,2,8,3,9,17,3,1,2022-23,Joel Embiid,0.915
70,201950,Jrue Holiday,Jrue,1610612749,MIL,33.0,67,50,17,0.746,...,52,5,43,41,39,41,1,2022-23,,0.000


In [67]:
# We define our inputs (X) and our desired output (y).
# 'features' are the specific stats we think influence voters (The "Features").
# 'Share' is the target we want the model to learn to predict (The "Target").

# 1. Select the features we want the model to learn from
features = ['TD3_RANK', 'NBA_FANTASY_PTS_RANK', 'WNBA_FANTASY_PTS_RANK']

# 2. Define our X (the stats) and our y (the answer we want to predict)
X = merged_df[features]
y = merged_df['Share']

In [None]:
# We use historical data (everything EXCEPT 2022-23) to "teach" the model.
# Then we will ask it to "guess" the results for the 2022-23 season.
# LinearRegression finds the mathematical weights for each stat that best predict the MVP share.
test_season = '2022-23'
train_window_seasons = get_train_window(seasons.index(test_season), seasons)
print(train_window_seasons)
# Split data by season
train = merged_df[merged_df['Season'].isin(train_window_seasons)]
test = merged_df[merged_df['Season'] == test_season]

# Initialize and Train the Model
model = LinearRegression()
model.fit(train[features], train['Share'])

['2018-19', '2019-20']


0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [69]:
train

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,TEAM_COUNT,Season,Player,Share
13,1627732,Ben Simmons,Ben,1610612755,PHI,22.0,79,50,29,0.633,...,46,89,15,10,3,18,1,2018-19,,0.0
14,203991,Clint Capela,Clint,1610612745,HOU,25.0,67,44,23,0.657,...,66,45,38,9,38,47,1,2018-19,,0.0
15,203081,Damian Lillard,Damian,1610612757,POR,28.0,80,51,29,0.638,...,5,11,11,58,38,8,1,2018-19,Damian Lillard,0.068
16,1628378,Donovan Mitchell,Donovan,1610612762,UTA,22.0,77,49,28,0.636,...,11,20,25,189,38,20,1,2018-19,,0.0
17,202339,Eric Bledsoe,Eric,1610612749,MIL,29.0,78,59,19,0.756,...,54,7,42,82,16,41,1,2018-19,,0.0
18,203507,Giannis Antetokounmpo,Giannis,1610612749,MIL,24.0,72,56,16,0.778,...,7,3,2,7,8,3,1,2018-19,Giannis Antetokounmpo,0.932
19,1627750,Jamal Murray,Jamal,1610612743,DEN,22.0,75,49,26,0.653,...,41,41,48,111,38,43,1,2018-19,,0.0
20,201935,James Harden,James,1610612745,HOU,29.0,78,51,27,0.654,...,1,22,1,16,6,1,1,2018-19,James Harden,0.768
21,203994,Jusuf Nurkic,Jusuf,1610612757,POR,24.0,72,45,27,0.625,...,65,13,31,14,16,39,1,2018-19,,0.0
22,201142,Kevin Durant,Kevin,1610612744,GSW,30.0,78,54,24,0.692,...,6,2,10,45,10,10,1,2018-19,Kevin Durant,0.025


In [70]:
# 1. Generate the predictions
predictions = model.predict(test[features])
test['Predicted_Share'] = predictions

# 2. Calculate the ACTUAL Rank 
# We sort by the real 'Share' and assign numbers 1, 2, 3...
test = test.sort_values('Share', ascending=False)
test['Actual_Rank'] = range(1, len(test) + 1)

# 3. Calculate the PREDICTED Rank
# We sort by our model's 'Predicted_Share' and assign numbers 1, 2, 3...
test = test.sort_values('Predicted_Share', ascending=False)
test['Predicted_Rank'] = range(1, len(test) + 1)

# 4. Compare
# top 10 candidates our model identified versus their actual rank
print("--- MVP Prediction vs Reality (2022-23 Season) ---")
display_cols = ['PLAYER_NAME', 'Share', 'Predicted_Share', 'Actual_Rank', 'Predicted_Rank']
print(test[display_cols].head(10))

--- MVP Prediction vs Reality (2022-23 Season) ---
        PLAYER_NAME  Share  Predicted_Share  Actual_Rank  Predicted_Rank
44     Nikola Jokic  0.961         0.349955            1               1
43      Luka Doncic  0.042         0.337808            3               2
41     Devin Booker  0.000         0.174814            5               3
40       Chris Paul  0.138         0.126604            2               4
42  Khris Middleton  0.000         0.122232            6               5
45      Rudy Gobert  0.008         0.119838            4               6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_Share'] = predictions


In [71]:
# 1. Calculate the core Regression Metrics
# We compare the 'Rank' (the Truth) against the 'Rank' (our Model's Guess)
mae = mean_absolute_error(test['Actual_Rank'], test['Predicted_Rank'])
mse = mean_squared_error(test['Actual_Rank'], test['Predicted_Rank'])
rmse = np.sqrt(mse) # RMSE is useful because it is in the same units as our target variable
r2 = r2_score(test['Actual_Rank'], test['Predicted_Rank'])

# 2. Define the name of our log file
# This file will live in your current project directory
log_file = "model_accuracy_log.txt"

# 3. Create a Timestamped Report String
# Adding the date and time is crucial for tracking which version of your model performed best
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

report_entry = f"""
=========================================
NBA MVP MODEL REPORT: {timestamp}
=========================================
Model Architecture: Linear Regression
Features Used: {features}
Test Season: {test_season}
Training Window; {train_window_seasons}
All Seasons: {seasons}
Other Comments: 
-----------------------------------------
Mean Absolute Error (MAE): {mae:.4f}
Mean Squared Error (MSE):  {mse:.4f}
Root Mean Squared Error (RMSE): {rmse:.4f}
R-squared Score (R2): {r2:.4f}
=========================================\n\n"""

# 4. Open the file in 'Append' mode ('a')
# This ensures we don't overwrite previous tests. We want a history of our progress!
with open(log_file, "a") as f:
    f.write(report_entry)

# 5. Provide immediate feedback to the console
print(f"Success! Model metrics calculated and appended to {log_file}.")
print(f"Current R2 Score: {r2:.4f}")

Success! Model metrics calculated and appended to model_accuracy_log.txt.
Current R2 Score: 0.2000
