In [1]:
# Use NBA API to get the data we need

#'leaguedashplayerstats' gets the box score stats for every player in the league
#'playercareerstats' gives specific player stats
#'time' used to wait before sending another request to the NBA servers
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.endpoints import playercareerstats
import time

In [2]:
# This function gets the season stats based on the season string it is given.
# returns a clean table (Dataframe)
# 'Season' column is added to the table so we don't forget which year the stats belong to

def get_season_stats(curr_season):
    stats = leaguedashplayerstats.LeagueDashPlayerStats(season=curr_season)
    df = stats.get_data_frames()[0]
    df['Season'] = curr_season
    return df

In [3]:
# Here, we define what Seasons we want to use for our model.
# After adding to our 'seasons' list, we call our function get_season_stats()
# for each season and append it to a temp list called 'all_seasons_data'

all_seasons_data = []
seasons = ['2020-21', '2021-22', '2022-23']

for year in seasons:
    df = get_season_stats(year)
    all_seasons_data.append(df)
    time.sleep(2)

In [5]:
# We clean up our data's characters in their names by stripping out special accents.
# Example: "Nikola Jokić" to "Nikola Jokic"
# This is to ensure that the names will match when we merge with the voting data.
import pandas as pd
import unicodedata

stats_df = pd.concat(all_seasons_data, ignore_index=True)

def clean_name(name):
    return ''.join(c for c in unicodedata.normalize('NFD', name)
                  if unicodedata.category(c) != 'Mn')

stats_df['PLAYER_NAME'] = stats_df['PLAYER_NAME'].apply(clean_name)
stats_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,TEAM_COUNT,Season
0,203932,Aaron Gordon,Aaron,1610612743,DEN,25.0,50,29,21,0.580,...,314,95,156,124,150,115,17,152,2,2020-21
1,1628988,Aaron Holiday,Aaron,1610612754,IND,24.0,66,30,36,0.455,...,325,201,203,218,244,178,29,232,1,2020-21
2,1630174,Aaron Nesmith,Aaron,1610612738,BOS,21.0,46,22,24,0.478,...,301,354,343,258,347,245,29,344,1,2020-21
3,1627846,Abdel Nader,Abdel,1610612756,PHX,27.0,24,16,8,0.667,...,152,328,367,162,380,245,29,377,1,2020-21
4,1629690,Adam Mokoka,Adam,1610612741,CHI,22.0,14,3,11,0.214,...,34,519,500,262,510,245,29,509,1,2020-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,1628380,Zach Collins,Zach,1610612759,SAS,25.0,63,20,43,0.317,...,503,114,144,516,141,70,39,140,1,2022-23
1680,203897,Zach LaVine,Zach,1610612741,CHI,28.0,77,38,39,0.494,...,444,30,10,182,19,155,39,14,1,2022-23
1681,1630192,Zeke Nnaji,Zeke,1610612743,DEN,22.0,53,34,19,0.642,...,319,257,311,440,330,192,39,331,1,2022-23
1682,1630533,Ziaire Williams,Ziaire,1610612763,MEM,21.0,37,21,16,0.568,...,190,339,346,414,370,253,39,366,1,2022-23


In [6]:
# Clean up players that are not possible contendors for MVP.
# Players must have played 65 or more games.
print(f"Players before filtering: {len(stats_df)}")
stats_df = stats_df[stats_df['GP'] >= 65]
print(f"Players after filtering: {len(stats_df)}")

# Clean up players that are not in top winning teams.
print(f"Players before filtering: {len(stats_df)}")
stats_df = stats_df[stats_df['W_PCT'] >= 0.600]
print(f"Players after filtering: {len(stats_df)}")

Players before filtering: 1684
Players after filtering: 443
Players before filtering: 443
Players after filtering: 138


In [7]:
# show certain players stats
specific_stats = stats_df[stats_df['PLAYER_NAME'] == 'Nikola Jokic']

print(specific_stats)


      PLAYER_ID   PLAYER_NAME NICKNAME     TEAM_ID TEAM_ABBREVIATION   AGE  \
399      203999  Nikola Jokic   Nikola  1610612743               DEN  26.0   
989      203999  Nikola Jokic   Nikola  1610612743               DEN  27.0   
1544     203999  Nikola Jokic   Nikola  1610612743               DEN  28.0   

      GP   W   L  W_PCT  ...  PF_RANK  PFD_RANK  PTS_RANK  PLUS_MINUS_RANK  \
399   72  47  25  0.653  ...      529         4         3               10   
989   74  46  28  0.622  ...      577         3         5                7   
1544  69  48  21  0.696  ...      470        11        17                1   

      NBA_FANTASY_PTS_RANK  DD2_RANK  TD3_RANK  WNBA_FANTASY_PTS_RANK  \
399                      1         1         2                      1   
989                      1         1         1                      1   
1544                     1         2         1                      4   

      TEAM_COUNT   Season  
399            1  2020-21  
989            1  2021-22

In [8]:
specific_stats = stats_df[stats_df['PLAYER_NAME'] == 'Stephen Curry']

print(specific_stats)

Empty DataFrame
Columns: [PLAYER_ID, PLAYER_NAME, NICKNAME, TEAM_ID, TEAM_ABBREVIATION, AGE, GP, W, L, W_PCT, MIN, FGM, FGA, FG_PCT, FG3M, FG3A, FG3_PCT, FTM, FTA, FT_PCT, OREB, DREB, REB, AST, TOV, STL, BLK, BLKA, PF, PFD, PTS, PLUS_MINUS, NBA_FANTASY_PTS, DD2, TD3, WNBA_FANTASY_PTS, GP_RANK, W_RANK, L_RANK, W_PCT_RANK, MIN_RANK, FGM_RANK, FGA_RANK, FG_PCT_RANK, FG3M_RANK, FG3A_RANK, FG3_PCT_RANK, FTM_RANK, FTA_RANK, FT_PCT_RANK, OREB_RANK, DREB_RANK, REB_RANK, AST_RANK, TOV_RANK, STL_RANK, BLK_RANK, BLKA_RANK, PF_RANK, PFD_RANK, PTS_RANK, PLUS_MINUS_RANK, NBA_FANTASY_PTS_RANK, DD2_RANK, TD3_RANK, WNBA_FANTASY_PTS_RANK, TEAM_COUNT, Season]
Index: []

[0 rows x 68 columns]


In [9]:
# Now, we are loading the historical records of who actually got MVP votes.
# These files were pulled from Kaggle --> kagglehub.dataset_download("parthdande/nba-mvp-voting-dataset-2000-2021")
# The files were downloaded using mvp_voting_download.ipynb.

# We load in the files and clean up the names as well so they match up.

files = ["./data/2001-2010MVPData.csv", "./data/2010-2021MVPData.csv", "./data/2022-2023MVPData.csv"]

mvp_list = []
for file in files:
    temp_df = pd.read_csv(file)
    mvp_list.append(temp_df)

total_mvp_df = pd.concat(mvp_list, ignore_index=True)
total_mvp_df['Player'] = total_mvp_df['Player'].apply(clean_name)
total_mvp_df

Unnamed: 0.1,Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,year
0,0,1,Allen Iverson,25,PHI,93.0,1121.0,1240,0.904,71,...,3.8,4.6,2.5,0.3,0.420,0.320,0.814,11.8,0.190,2001
1,1,2,Tim Duncan,24,SAS,18.0,706.0,1240,0.569,82,...,12.2,3.0,0.9,2.3,0.499,0.259,0.618,13.2,0.200,2001
2,2,3,Shaquille O'Neal,28,LAL,7.0,578.0,1240,0.466,74,...,12.7,3.7,0.6,2.8,0.572,0.000,0.513,14.9,0.245,2001
3,3,4,Chris Webber,27,SAC,5.0,521.0,1240,0.420,70,...,11.1,4.2,1.3,1.7,0.481,0.071,0.703,11.0,0.186,2001
4,4,5,Kevin Garnett,24,MIN,1.0,151.0,1240,0.122,81,...,11.4,5.0,1.4,1.8,0.477,0.288,0.764,11.8,0.176,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,8,9,Stephen Curry,34,GSW,0.0,5.0,1000,0.005,56,...,6.1,6.3,0.9,0.4,0.493,0.427,0.915,7.8,0.192,2023
334,9,10,Jimmy Butler,33,MIA,0.0,3.0,1000,0.003,64,...,5.9,5.3,1.8,0.3,0.539,0.350,0.850,12.3,0.277,2023
335,10,11,De'Aaron Fox,25,SAC,0.0,2.0,1000,0.002,73,...,4.2,6.1,1.1,0.3,0.512,0.324,0.780,7.4,0.146,2023
336,11,12T,Jalen Brunson,26,NYK,0.0,1.0,1000,0.001,68,...,3.5,6.2,0.9,0.2,0.491,0.416,0.829,8.7,0.175,2023


In [10]:
# The NBA API uses '2022-23' while our CSV files use '2023'
# We need to convert the seasons to match up
def convert_year_to_season(year):
    # Converts 2023 to "2022-23"
    prev_year = year - 1
    # Get the last two digits of the current year (e.g., 23)
    last_two = str(year)[-2:]
    return f"{prev_year}-{last_two}"

total_mvp_df['Season'] = total_mvp_df['year'].apply(convert_year_to_season)
total_mvp_df

Unnamed: 0.1,Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,...,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,year,Season
0,0,1,Allen Iverson,25,PHI,93.0,1121.0,1240,0.904,71,...,4.6,2.5,0.3,0.420,0.320,0.814,11.8,0.190,2001,2000-01
1,1,2,Tim Duncan,24,SAS,18.0,706.0,1240,0.569,82,...,3.0,0.9,2.3,0.499,0.259,0.618,13.2,0.200,2001,2000-01
2,2,3,Shaquille O'Neal,28,LAL,7.0,578.0,1240,0.466,74,...,3.7,0.6,2.8,0.572,0.000,0.513,14.9,0.245,2001,2000-01
3,3,4,Chris Webber,27,SAC,5.0,521.0,1240,0.420,70,...,4.2,1.3,1.7,0.481,0.071,0.703,11.0,0.186,2001,2000-01
4,4,5,Kevin Garnett,24,MIN,1.0,151.0,1240,0.122,81,...,5.0,1.4,1.8,0.477,0.288,0.764,11.8,0.176,2001,2000-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,8,9,Stephen Curry,34,GSW,0.0,5.0,1000,0.005,56,...,6.3,0.9,0.4,0.493,0.427,0.915,7.8,0.192,2023,2022-23
334,9,10,Jimmy Butler,33,MIA,0.0,3.0,1000,0.003,64,...,5.3,1.8,0.3,0.539,0.350,0.850,12.3,0.277,2023,2022-23
335,10,11,De'Aaron Fox,25,SAC,0.0,2.0,1000,0.002,73,...,6.1,1.1,0.3,0.512,0.324,0.780,7.4,0.146,2023,2022-23
336,11,12T,Jalen Brunson,26,NYK,0.0,1.0,1000,0.001,68,...,6.2,0.9,0.2,0.491,0.416,0.829,8.7,0.175,2023,2022-23


In [11]:
# We are performing a "Left Join."
# We keep all player stats from 'stats_df' and attach the MVP 'Share' column where a match is found.
# If a player didn't receive any votes, the share will be 'NaN' (empty).
# We fill those empties with 0.0 because it's important for the model to know what a non-MVP looks like.
mvp_subset = total_mvp_df[['Player', 'Season', 'Share']]

merged_df = pd.merge(
    stats_df, 
    mvp_subset, 
    left_on=['PLAYER_NAME', 'Season'], 
    right_on=['Player', 'Season'], 
    how='left'
)

merged_df['Share'] = merged_df['Share'].fillna(0)


In [68]:
stats_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,TEAM_COUNT,Season
1,1628988,Aaron Holiday,Aaron,1610612754,IND,24.0,66,30,36,0.455,...,325,201,203,218,244,178,29,232,1,2020-21
11,203458,Alex Len,Alex,1610612764,WAS,28.0,64,30,34,0.469,...,442,198,224,445,208,146,29,222,2,2020-21
18,2738,Andre Iguodala,Andre,1610612748,MIA,37.0,63,37,26,0.587,...,318,264,307,424,218,245,29,229,1,2020-21
20,203952,Andrew Wiggins,Andrew,1610612744,GSW,26.0,71,38,33,0.535,...,492,38,25,230,36,115,29,29,1,2020-21
21,1629014,Anfernee Simons,Anfernee,1610612757,POR,22.0,64,37,27,0.578,...,333,312,191,139,255,245,29,223,1,2020-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1665,1628427,Vlatko Cancar,Vlatko,1610612743,DEN,26.0,60,38,22,0.633,...,265,299,297,400,311,192,39,305,1,2022-23
1667,1631117,Walker Kessler,Walker,1610612762,UTA,21.0,74,34,40,0.459,...,462,141,165,120,86,30,39,118,1,2022-23
1670,1629117,Wenyen Gabriel,Wenyen,1610612747,LAL,26.0,68,37,31,0.544,...,412,199,271,343,258,253,39,263,1,2022-23
1677,1630214,Xavier Tillman,Xavier,1610612763,MEM,24.0,61,40,21,0.656,...,290,236,251,237,208,118,39,233,1,2022-23


In [29]:
mvp_subset

Unnamed: 0,Player,Season,Share
0,Allen Iverson,2000-01,0.904
1,Tim Duncan,2000-01,0.569
2,Shaquille O'Neal,2000-01,0.466
3,Chris Webber,2000-01,0.420
4,Kevin Garnett,2000-01,0.122
...,...,...,...
333,Stephen Curry,2022-23,0.005
334,Jimmy Butler,2022-23,0.003
335,De'Aaron Fox,2022-23,0.002
336,Jalen Brunson,2022-23,0.001


In [30]:
merged_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK,TEAM_COUNT,Season,Player,Share
0,1628988,Aaron Holiday,Aaron,1610612754,IND,24.0,66,30,36,0.455,...,203,218,244,178,29,232,1,2020-21,,0.0
1,203952,Andrew Wiggins,Andrew,1610612744,GSW,26.0,71,38,33,0.535,...,25,230,36,115,29,29,1,2020-21,,0.0
2,1630162,Anthony Edwards,Anthony,1610612750,MIN,19.0,72,23,49,0.319,...,23,518,31,115,29,24,1,2020-21,,0.0
3,202687,Bismack Biyombo,Bismack,1610612766,CHA,28.0,66,28,38,0.424,...,279,525,197,115,29,215,1,2020-21,,0.0
4,1626171,Bobby Portis,Bobby,1610612749,MIL,26.0,66,44,22,0.667,...,114,70,100,44,29,104,1,2020-21,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,1626145,Tyus Jones,Tyus,1610612763,MEM,27.0,80,51,29,0.638,...,122,104,105,70,17,121,1,2022-23,,0.0
439,1630586,Usman Garuba,Usman,1610612745,HOU,21.0,75,21,54,0.280,...,339,518,265,253,39,274,1,2022-23,,0.0
440,1631117,Walker Kessler,Walker,1610612762,UTA,21.0,74,34,40,0.459,...,165,120,86,30,39,118,1,2022-23,,0.0
441,1629117,Wenyen Gabriel,Wenyen,1610612747,LAL,26.0,68,37,31,0.544,...,271,343,258,253,39,263,1,2022-23,,0.0


In [15]:
# We define our inputs (X) and our desired output (y).
# 'features' are the specific stats we think influence voters (The "Features").
# 'Share' is the target we want the model to learn to predict (The "Target").

# 1. Select the features we want the model to learn from
features = ['DD2_RANK', 'TD3_RANK', 'W_PCT', 'PLUS_MINUS', 'NBA_FANTASY_PTS', 'FGM_RANK', 'WNBA_FANTASY_PTS']

# 2. Define our X (the stats) and our y (the answer we want to predict)
X = merged_df[features]
y = merged_df['Share']

In [16]:
# We use historical data (everything EXCEPT 2022-23) to "teach" the model.
# Then we will ask it to "guess" the results for the 2022-23 season.
# LinearRegression finds the mathematical weights for each stat that best predict the MVP share.

from sklearn.linear_model import LinearRegression

# Split data by season
train = merged_df[merged_df['Season'] != '2022-23']
test = merged_df[merged_df['Season'] == '2022-23']

# Initialize and Train the Model
model = LinearRegression()
model.fit(train[features], train['Share'])

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [17]:
# 1. Generate the predictions
predictions = model.predict(test[features])
test['Predicted_Share'] = predictions

# 2. Calculate the ACTUAL Rank 
# We sort by the real 'Share' and assign numbers 1, 2, 3...
test = test.sort_values('Share', ascending=False)
test['Actual_Rank'] = range(1, len(test) + 1)

# 3. Calculate the PREDICTED Rank
# We sort by our model's 'Predicted_Share' and assign numbers 1, 2, 3...
test = test.sort_values('Predicted_Share', ascending=False)
test['Predicted_Rank'] = range(1, len(test) + 1)

# 4. Compare
# top 10 candidates our model identified versus their actual rank
print("--- MVP Prediction vs Reality (2022-23 Season) ---")
display_cols = ['PLAYER_NAME', 'Share', 'Predicted_Share', 'Actual_Rank', 'Predicted_Rank']
print(test[display_cols].head(10))

--- MVP Prediction vs Reality (2022-23 Season) ---
           PLAYER_NAME  Share  Predicted_Share  Actual_Rank  Predicted_Rank
130       Nikola Jokic  0.674         0.441694            2               1
123        Joel Embiid  0.915         0.339946            1               2
121       Jayson Tatum  0.280         0.313579            3               3
102        Bruce Brown  0.000         0.150414           11               4
108       De'Aaron Fox  0.002         0.139593            5               5
113        Evan Mobley  0.000         0.118528           16               6
118       Jamal Murray  0.000         0.113797           21               7
109  De'Anthony Melton  0.000         0.107301           14               8
112   Donovan Mitchell  0.030         0.104159            4               9
137         Tyus Jones  0.000         0.100020           39              10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_Share'] = predictions


In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import os
from datetime import datetime

# 1. Calculate the core Regression Metrics
# We compare the 'Share' (the Truth) against the 'Predicted_Share' (our Model's Guess)
mae = mean_absolute_error(test['Share'], test['Predicted_Share'])
mse = mean_squared_error(test['Share'], test['Predicted_Share'])
rmse = np.sqrt(mse) # RMSE is useful because it is in the same units as our target variable
r2 = r2_score(test['Share'], test['Predicted_Share'])

# 2. Define the name of our log file
# This file will live in your current project directory
log_file = "model_accuracy_log.txt"

# 3. Create a Timestamped Report String
# Adding the date and time is crucial for tracking which version of your model performed best
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

report_entry = f"""
=========================================
NBA MVP MODEL REPORT: {timestamp}
=========================================
Model Architecture: Linear Regression
Features Used: {features}
Other Comments: 
-----------------------------------------
Mean Absolute Error (MAE): {mae:.4f}
Mean Squared Error (MSE):  {mse:.4f}
Root Mean Squared Error (RMSE): {rmse:.4f}
R-squared Score (R2): {r2:.4f}
=========================================\n\n"""

# 4. Open the file in 'Append' mode ('a')
# This ensures we don't overwrite previous tests. We want a history of our progress!
with open(log_file, "a") as f:
    f.write(report_entry)

# 5. Provide immediate feedback to the console
print(f"Success! Model metrics calculated and appended to {log_file}.")
print(f"Current R2 Score: {r2:.4f}")

Success! Model metrics calculated and appended to model_accuracy_log.txt.
Current R2 Score: 0.5936
