In [None]:
# Use NBA API to get the data we need

#'leaguedashplayerstats' gets the box score stats for every player in the league
#'playercareerstats' gives specific player stats
#'time' used to wait before sending another request to the NBA servers
from nba_api.stats.endpoints import leaguedashplayerstats
from nba_api.stats.endpoints import playercareerstats
import time

In [None]:
# This function gets the season stats based on the season string it is given.
# returns a clean table (Dataframe)
# 'Season' column is added to the table so we don't forget which year the stats belong to

def get_season_stats(curr_season):
    stats = leaguedashplayerstats.LeagueDashPlayerStats(season=curr_season)
    df = stats.get_data_frames()[0]
    df['Season'] = curr_season
    return df

In [None]:
# Here, we define what Seasons we want to use for our model.
# After adding to our 'seasons' list, we call our function get_season_stats()
# for each season and append it to a temp list called 'all_seasons_data'

all_seasons_data = []
seasons = ['2020-21', '2021-22', '2022-23']

for year in seasons:
    df = get_season_stats(year)
    all_seasons_data.append(df)
    time.sleep(2)

In [None]:
# testing

print(all_seasons_data[0]["NICKNAME"])

0      Aaron
1      Aaron
2      Aaron
3      Abdel
4       Adam
       ...  
535     Yogi
536     Yuta
537     Zach
538     Zeke
539     Zion
Name: NICKNAME, Length: 540, dtype: object


In [None]:
# We clean up our data's characters in their names by stripping out special accents.
# Example: "Nikola Jokić" to "Nikola Jokic"
# This is to ensure that the names will match when we merge with the voting data.
import pandas as pd
import unicodedata

stats_df = pd.concat(all_seasons_data, ignore_index=True)

def clean_name(name):
    return ''.join(c for c in unicodedata.normalize('NFD', name)
                  if unicodedata.category(c) != 'Mn')

stats_df['PLAYER_NAME'] = stats_df['PLAYER_NAME'].apply(clean_name)

In [None]:
# Now, we are loading the historical records of who actually got MVP votes.
# These files were pulled from Kaggle --> kagglehub.dataset_download("parthdande/nba-mvp-voting-dataset-2000-2021")
# The files were downloaded using mvp_voting_download.ipynb.

# We load in the files and clean up the names as well so they match up.

files = ["./data/2001-2010MVPData.csv", "./data/2010-2021MVPData.csv", "./data/2022-2023MVPData.csv"]

mvp_list = []
for file in files:
    temp_df = pd.read_csv(file)
    mvp_list.append(temp_df)

total_mvp_df = pd.concat(mvp_list, ignore_index=True)
total_mvp_df['Player'] = total_mvp_df['Player'].apply(clean_name)

In [None]:
# The NBA API uses '2022-23' while our CSV files use '2023'
# We need to convert the seasons to match up
def convert_year_to_season(year):
    # Converts 2023 to "2022-23"
    prev_year = year - 1
    # Get the last two digits of the current year (e.g., 23)
    last_two = str(year)[-2:]
    return f"{prev_year}-{last_two}"

total_mvp_df['Season'] = total_mvp_df['year'].apply(convert_year_to_season)

In [None]:
# We are performing a "Left Join."
# We keep all player stats from 'stats_df' and attach the MVP 'Share' column where a match is found.
# If a player didn't receive any votes, the share will be 'NaN' (empty).
# We fill those empties with 0.0 because it's important for the model to know what a non-MVP looks like.
mvp_subset = total_mvp_df[['Player', 'Season', 'Share']]

merged_df = pd.merge(
    stats_df, 
    mvp_subset, 
    left_on=['PLAYER_NAME', 'Season'], 
    right_on=['Player', 'Season'], 
    how='left'
)

merged_df['Share'] = merged_df['Share'].fillna(0)

In [18]:
# We define our inputs (X) and our desired output (y).
# 'features' are the specific stats we think influence voters (The "Features").
# 'Share' is the target we want the model to learn to predict (The "Target").

# 1. Select the features we want the model to learn from
features = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'FG_PCT', 'W_PCT']

# 2. Define our X (the stats) and our y (the answer we want to predict)
X = merged_df[features]
y = merged_df['Share']

In [19]:
# We use historical data (everything EXCEPT 2022-23) to "teach" the model.
# Then we will ask it to "guess" the results for the 2022-23 season.
# LinearRegression finds the mathematical weights for each stat that best predict the MVP share.

from sklearn.linear_model import LinearRegression

# Split data by season
train = merged_df[merged_df['Season'] != '2022-23']
test = merged_df[merged_df['Season'] == '2022-23']

# Initialize and Train the Model
model = LinearRegression()
model.fit(train[features], train['Share'])

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [20]:
# 1. Generate the predictions
predictions = model.predict(test[features])
test['Predicted_Share'] = predictions

# 2. Calculate the ACTUAL Rank 
# We sort by the real 'Share' and assign numbers 1, 2, 3...
test = test.sort_values('Share', ascending=False)
test['Actual_Rank'] = range(1, len(test) + 1)

# 3. Calculate the PREDICTED Rank
# We sort by our model's 'Predicted_Share' and assign numbers 1, 2, 3...
test = test.sort_values('Predicted_Share', ascending=False)
test['Predicted_Rank'] = range(1, len(test) + 1)

# 4. Compare
# top 10 candidates our model identified versus their actual rank
print("--- MVP Prediction vs Reality (2022-23 Season) ---")
display_cols = ['PLAYER_NAME', 'Share', 'Predicted_Share', 'Actual_Rank', 'Predicted_Rank']
print(test[display_cols].head(10))

--- MVP Prediction vs Reality (2022-23 Season) ---
                PLAYER_NAME  Share  Predicted_Share  Actual_Rank  \
1275       Domantas Sabonis  0.027         0.097456            7   
1544           Nikola Jokic  0.674         0.087991            2   
1426          Julius Randle  0.000         0.086326          282   
1311  Giannis Antetokounmpo  0.606         0.085465            3   
1383           Jayson Tatum  0.280         0.072490            4   
1546         Nikola Vucevic  0.000         0.071588          402   
1485            Luka Doncic  0.010         0.069960            8   
1396            Joel Embiid  0.915         0.067829            1   
1403      Jonas Valanciunas  0.000         0.067545          259   
1563          Pascal Siakam  0.000         0.063341          419   

      Predicted_Rank  
1275               1  
1544               2  
1426               3  
1311               4  
1383               5  
1546               6  
1485               7  
1396            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicted_Share'] = predictions
