In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date 
from tqdm import tqdm
import pickle
import json

import requests
from urllib.error import HTTPError
import urllib.request
from bs4 import BeautifulSoup

from basketball_reference_web_scraper import client

import warnings
warnings.filterwarnings('ignore')

#pandas show all columns
pd.set_option('display.max_columns', None)

## extract historical master table

In [2]:
max_year = date.today().year
years = [year for year in range(1980, max_year)]

In [3]:
os.path.dirname(os.getcwd()) + '/scripts' + '/team_to_abbreviations.json'

'/Users/dbtjdals/Desktop/nba_mvp_project/scripts/team_to_abbreviations.json'

In [4]:
#load team to abbreviation mapping
with open(os.path.dirname(os.getcwd()) + '/scripts' + '/team_to_abbreviations.json') as f:
      team_to_abbreviations = json.load(f)

In [5]:
def extract_mvp_candidates(year):
    url = f"https://www.basketball-reference.com/awards/awards_{year}.html#mvp"
    try:
        mvp_candidate_table = pd.read_html(url)[0].droplevel(level=0, axis = 1)
        mvp_candidate_table['year'] = year
    except HTTPError as err:
        print(f'no mvp race has been found for year {year}')
    return mvp_candidate_table

In [6]:
def extract_team_stats(year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html#all_confs_standings_E%22"

    #east
    team_east_standing_table = pd.read_html(url)[0]
    team_east_standing_table = team_east_standing_table.rename({'Eastern Conference': 'team'}, axis=1)
    #remove 'Division' in team column (e.g. Atlantic Division)
    team_east_standing_table = team_east_standing_table[team_east_standing_table['team'].str.contains('Division')==False]
    team_east_standing_table['seed'] = team_east_standing_table['W'].rank(ascending=False)

    #west
    team_west_standing_table = pd.read_html(url)[1]
    team_west_standing_table = team_west_standing_table.rename({'Western Conference': 'team'}, axis=1)
    #remove 'Division' in team column (e.g. Atlantic Division)
    team_west_standing_table = team_west_standing_table[team_west_standing_table['team'].str.contains('Division')==False]
    team_west_standing_table['seed'] = team_west_standing_table['W'].rank(ascending=False)

    #combine east and west 
    team_standing_table = pd.concat([team_east_standing_table, team_west_standing_table])
    
    #remove * in team column
    team_standing_table.team = team_standing_table.team.str.replace('*', '')
    
    #change player name string if current year (different formatting)
    if year == max_year:
        team_name_lst = []
        seeds = team_standing_table['seed']
        for seed, team in zip(list(seeds), list(team_standing_table['team'])): 
            if len(str(seed)) != 3:
                team_name = team[:-5]
            else:
                team_name = team[:-4]
            team_name_lst.append(team_name)
        
        team_standing_table['team'] = team_name_lst
    
    #map abbreviation to full team name
    team_standing_table['Tm'] = team_standing_table['team'].map(team_to_abbreviations)
    
    #filter only needed columns
    team_standing_table_sub = team_standing_table[['Tm', 'team', 'W', 'W/L%', 'seed']]
    
    return team_standing_table_sub

In [7]:
filter_advanced = [
        'name',
        'player_efficiency_rating',
         'true_shooting_percentage',
         'three_point_attempt_rate',
         'free_throw_attempt_rate',
         'offensive_rebound_percentage',
         'defensive_rebound_percentage',
         'total_rebound_percentage',
         'assist_percentage',
         'steal_percentage',
         'block_percentage',
         'turnover_percentage',
         'usage_percentage',
         'offensive_win_shares',
         'defensive_win_shares',
         'win_shares',
         'win_shares_per_48_minutes',
         'offensive_box_plus_minus',
         'defensive_box_plus_minus',
         'box_plus_minus',
         'value_over_replacement_player'
    ]

def extract_advanced_stats(year):
    advanced_stats_df = pd.DataFrame(client.players_advanced_season_totals(season_end_year=year))
    advanced_stats_df['year'] = year
    
    advanced_stats_df = advanced_stats_df[filter_advanced]
    advanced_stats_df = advanced_stats_df.rename(columns={'name':'Player'})
    return advanced_stats_df

In [8]:
tables = []

print('extracting historical data of NBA MVP candidates..')
for year in tqdm(years):
    mvp_candidate_table = extract_mvp_candidates(year)
    team_standing_table_sub = extract_team_stats(year)
    
    #left merge mvp candidate with team standings table on team abbreviation
    table = pd.merge(mvp_candidate_table, team_standing_table_sub, how='left', on='Tm')
    
    #add advanced stats
    advanced_stats_df = extract_advanced_stats(year)
    
    #left merge mvp candidate with team standings table on team abbreviation
    table = pd.merge(table, advanced_stats_df, how='left', on='Player')

    #append to list of tables
    tables.append(table)
print('complete')

extracting historical data of NBA MVP candidates..


100%|██████████| 42/42 [01:20<00:00,  1.92s/it]

complete





In [9]:
master_table = pd.concat(tables)

In [10]:
master_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 693 entries, 0 to 16
Data columns (total 45 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Rank                           693 non-null    object 
 1   Player                         693 non-null    object 
 2   Age                            693 non-null    int64  
 3   Tm                             693 non-null    object 
 4   First                          693 non-null    float64
 5   Pts Won                        693 non-null    float64
 6   Pts Max                        693 non-null    int64  
 7   Share                          693 non-null    float64
 8   G                              693 non-null    int64  
 9   MP                             693 non-null    float64
 10  PTS                            693 non-null    float64
 11  TRB                            693 non-null    float64
 12  AST                            693 non-null    floa

In [11]:
#check rows of NaN seeding stats
master_table[master_table['team'].isna()]

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,year,team,W,W/L%,seed,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_minutes,offensive_box_plus_minus,defensive_box_plus_minus,box_plus_minus,value_over_replacement_player
15,11T,Dominique Wilkins,34,TOT,0.0,1.0,1010,0.001,74,35.6,26.0,6.5,2.3,1.2,0.4,0.44,0.288,0.847,8.4,0.153,1994,,,,,21.1,0.526,0.199,0.323,7.7,12.2,9.9,11.9,1.9,0.8,9.5,32.0,3.5,2.3,5.8,0.166,3.9,-1.1,2.7,2.0
16,11T,Dominique Wilkins,34,TOT,0.0,1.0,1010,0.001,74,35.6,26.0,6.5,2.3,1.2,0.4,0.44,0.288,0.847,8.4,0.153,1994,,,,,21.8,0.535,0.164,0.338,7.1,12.5,9.9,10.3,1.5,0.5,7.1,31.9,2.1,0.5,2.6,0.13,4.7,-2.5,2.2,1.0
13,14,Clyde Drexler,32,TOT,0.0,3.0,1050,0.003,76,35.9,21.8,6.3,4.8,1.8,0.6,0.461,0.36,0.824,11.7,0.206,1995,,,,,22.7,0.55,0.337,0.348,6.4,12.4,9.3,24.2,2.7,1.2,10.6,27.2,4.4,2.1,6.5,0.217,6.0,1.8,7.9,3.6
14,14,Clyde Drexler,32,TOT,0.0,3.0,1050,0.003,76,35.9,21.8,6.3,4.8,1.8,0.6,0.461,0.36,0.824,11.7,0.206,1995,,,,,22.1,0.613,0.319,0.369,6.3,15.0,10.9,19.9,2.4,1.2,12.7,24.2,3.6,1.7,5.2,0.193,4.5,0.5,5.0,2.3
12,13,Vince Carter,28,TOT,0.0,3.0,1270,0.002,77,36.7,24.5,5.2,4.2,1.4,0.6,0.452,0.406,0.798,9.4,0.159,2005,,,,,17.0,0.475,0.195,0.238,5.1,7.2,6.1,18.5,2.1,1.8,6.2,26.7,0.5,0.4,0.9,0.068,1.9,0.4,2.3,0.7
13,13,Vince Carter,28,TOT,0.0,3.0,1270,0.002,77,36.7,24.5,5.2,4.2,1.4,0.6,0.452,0.406,0.798,9.4,0.159,2005,,,,,24.5,0.556,0.205,0.313,4.2,14.1,9.1,27.6,2.0,1.2,9.4,32.9,5.4,3.1,8.5,0.184,6.2,0.8,7.0,5.1
5,6,Chauncey Billups,32,TOT,0.0,33.0,1210,0.027,79,35.3,17.7,3.0,6.4,1.2,0.2,0.418,0.408,0.913,10.1,0.174,2009,,,,,17.5,0.492,0.333,0.476,1.7,15.5,8.5,33.0,2.4,1.1,13.6,20.0,0.1,0.1,0.2,0.124,0.4,0.5,1.0,0.1
6,6,Chauncey Billups,32,TOT,0.0,33.0,1210,0.027,79,35.3,17.7,3.0,6.4,1.2,0.2,0.418,0.408,0.913,10.1,0.174,2009,,,,,18.8,0.595,0.407,0.467,1.4,8.2,4.9,28.8,1.7,0.5,13.0,21.8,7.8,2.1,9.9,0.176,3.9,-0.5,3.4,3.7
13,12T,Stephen Jackson,31,TOT,0.0,1.0,1230,0.001,81,38.6,20.6,5.0,3.7,1.6,0.5,0.423,0.328,0.779,5.0,0.077,2010,,,,,14.5,0.499,0.301,0.278,2.2,10.9,6.5,21.3,2.2,1.4,14.8,25.0,-0.1,0.2,0.0,0.008,0.2,-1.5,-1.3,0.1
14,12T,Stephen Jackson,31,TOT,0.0,1.0,1230,0.001,81,38.6,20.6,5.0,3.7,1.6,0.5,0.423,0.328,0.779,5.0,0.077,2010,,,,,15.7,0.52,0.275,0.309,3.1,12.3,7.8,17.3,2.2,1.1,13.8,27.8,0.6,4.4,5.0,0.085,0.3,0.3,0.6,1.8


players who have been traded mid-season tend to not receive many MVP votes; they can be removed from the dataset

In [8]:
#drop rows of players who were traded mid season
master_table = master_table[master_table['Tm'].str.contains('TOT')==False]

#fill na in 3P%
master_table['3P%'] = master_table['3P%'].fillna(0)

NameError: name 'master_table' is not defined

In [9]:
data_path = os.path.dirname(os.getcwd()) + '/data' + '/master_table.csv'
print(data_path)
master_table.to_csv(data_path, index=False)

/Users/dbtjdals/Desktop/nba_mvp_project/data/master_table.csv


NameError: name 'master_table' is not defined

## 2022 mvp candidates forecasting

In [10]:
year = 2022
basic_stats_df = pd.DataFrame(client.players_season_totals(season_end_year=year))

advanced_stats_df = pd.DataFrame(client.players_advanced_season_totals(season_end_year=year))
advanced_stats_df = advanced_stats_df[filter_advanced]
advanced_stats_df = advanced_stats_df.rename(columns={'name':'Player'})

In [11]:
filter_basic = [
    'name',
    'games_played',
    'team',
    'points',
    'assists',
    'offensive_rebounds',
    'defensive_rebounds',
    'steals',
    'blocks',
    'made_field_goals',
    'attempted_field_goals',
    'made_three_point_field_goals',
    'attempted_three_point_field_goals'
]
basic_stats_df = basic_stats_df[filter_basic]
basic_stats_df = basic_stats_df.rename(columns={'name':'Player',
                               'points':'PTS',
                               'assists':'AST',
                               'steals':'STL',
                               'blocks':'BLK',})

basic_stats_df['PTS'] = basic_stats_df['PTS'] / basic_stats_df['games_played']
basic_stats_df['AST'] = basic_stats_df['AST'] / basic_stats_df['games_played']
basic_stats_df['STL'] = basic_stats_df['STL'] / basic_stats_df['games_played']
basic_stats_df['BLK'] = basic_stats_df['BLK'] / basic_stats_df['games_played']
basic_stats_df['FG%'] = basic_stats_df['made_field_goals'] / basic_stats_df['attempted_field_goals']
basic_stats_df['3P%'] = basic_stats_df['made_three_point_field_goals'] / basic_stats_df['attempted_three_point_field_goals']
basic_stats_df['TRB'] = (basic_stats_df['offensive_rebounds'] + basic_stats_df['defensive_rebounds']) / basic_stats_df['games_played']

basic_stats_df = basic_stats_df[['Player', 'games_played', 'team', 'PTS', 'AST', 'STL', 'BLK', 'FG%' ,'3P%' , 'TRB']]
basic_stats_df['team'] = basic_stats_df['team'].astype(str).str.slice(5,)
basic_stats_df['team'] = basic_stats_df['team'].astype(str).str.replace('_', ' ').str.lower()

In [12]:
######## team stats are formatted differently for ongoing seasons; use extract_team_stats_new to extract team stats for the current season

In [13]:
def extract_team_stats_new(year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html#all_confs_standings_E%22"

    #east
    team_east_standing_table = pd.read_html(url)[0]
    team_east_standing_table = team_east_standing_table.rename({'Eastern Conference': 'team'}, axis=1)
    #remove 'Division' in team column (e.g. Atlantic Division)
    team_east_standing_table = team_east_standing_table[team_east_standing_table['team'].str.contains('Division')==False]
    new_east = team_east_standing_table["team"].str.split("(", n = 1, expand = True)
    team_east_standing_table["team"] = [i[:-1] for i in new_east[0]]
    team_east_standing_table["seed"] = [i[:-1] for i in new_east[1]]

    #west
    team_west_standing_table = pd.read_html(url)[1]
    team_west_standing_table = team_west_standing_table.rename({'Western Conference': 'team'}, axis=1)
    #remove 'Division' in team column (e.g. Atlantic Division)
    team_west_standing_table = team_west_standing_table[team_west_standing_table['team'].str.contains('Division')==False]
    new_west = team_west_standing_table["team"].str.split("(", n = 1, expand = True)
    team_west_standing_table["team"] = [i[:-1] for i in new_west[0]]
    team_west_standing_table["seed"] = [i[:-1] for i in new_west[1]]

    #combine east and west 
    team_standing_table = pd.concat([team_east_standing_table, team_west_standing_table])
    
    #remove * in team column
    team_standing_table.team = team_standing_table.team.str.replace('*', '')
        
    #map abbreviation to full team name
    team_standing_table['Tm'] = team_standing_table['team'].map(team_to_abbreviations)
    
    #filter only needed columns
    team_standing_table_sub = team_standing_table[['Tm', 'team', 'W', 'W/L%', 'seed']]
    
    return team_standing_table_sub

In [14]:
#left merge mvp candidate with team standings table on team abbreviation
team_standing_table_sub = extract_team_stats_new(2022)
team_standing_table_sub['team'] = team_standing_table_sub['team'].str.lower()
joined_table_2022 = pd.merge(basic_stats_df, team_standing_table_sub, how='left', on='team')
joined_table_2022 = pd.merge(joined_table_2022, advanced_stats_df, how='left', on='Player')

joined_table_2022.loc[joined_table_2022['Player'] == 'Nikola Jokić', 'Player'] = 'Nikola Jokic'

In [24]:
#2022 candidate table
url = 'https://www.nba.com/news/kia-mvp-ladder-jan-21-2022-edition'
html = requests.get(url).content

soup = BeautifulSoup(html)
remove_line = 'Last week’s ranking'
remove_line2 = 'ET'
remove_line3 = 'The Next Five'

top_five = []
next_five = [] 

for line in soup.find_all("strong")[1:-1]:
    if remove_line not in str(line):
        if remove_line2 not in str(line):
            if remove_line3 not in str(line):
                name_raw = str(line).split(',')[0]
                name_raw = name_raw.split('.')[1]
                name = name_raw[1:]
                top_five.append(name)

for line in soup.find_all("p"):
    if 'week: ' in str(line):
        name_raw = str(line).split(',')[0]
        name_raw = name_raw.split('.')[1]
        name = name_raw[1:]
        #name = name.split('>')[1][1:]
        next_five.append(name)
    
top_ten = top_five + next_five

In [25]:
top_ten

['Nikola Jokic',
 'Giannis Antetokounmpo',
 'Joel Embiid',
 'Kevin Durant',
 'Chris Paul',
 'Stephen Curry',
 'DeMar DeRozan',
 'Ja Morant',
 'Rudy Gobert',
 'LeBron James']

In [26]:
#manual 2/11/2022
top_ten = [
    'Nikola Jokic',
    'Joel Embiid',
    'Giannis Antetokounmpo',
    'Chris Paul',
    'Stephen Curry',
    'Ja Morant',
    'Luka Doncic',
    'Devin Booker',
    'DeMar DeRozan',
    'Jimmy Butler'
]

In [27]:
joined_table_2022 = joined_table_2022[joined_table_2022['Player'].isin(top_ten)]

In [28]:
joined_table_2022.columns

Index(['Player', 'games_played', 'team', 'PTS', 'AST', 'STL', 'BLK', 'FG%',
       '3P%', 'TRB', 'Tm', 'W', 'W/L%', 'seed', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'turnover_percentage', 'usage_percentage', 'offensive_win_shares',
       'defensive_win_shares', 'win_shares', 'win_shares_per_48_minutes',
       'offensive_box_plus_minus', 'defensive_box_plus_minus',
       'box_plus_minus', 'value_over_replacement_player'],
      dtype='object')

In [29]:
# adjust VORP at the current pace and project to rest of the season
joined_table_2022_sub = joined_table_2022[['Player', 'games_played', 'value_over_replacement_player']]
joined_table_2022_sub['games'] = 82
joined_table_2022_sub['games_left'] = (joined_table_2022_sub['games'] - joined_table_2022_sub['games_played'])
joined_table_2022_sub['vorp/games_played'] = (joined_table_2022_sub['value_over_replacement_player'] / joined_table_2022_sub['games_played'])
joined_table_2022_sub['adjusted_vorp'] = (joined_table_2022_sub['vorp/games_played'] * joined_table_2022_sub['games_left']) + joined_table_2022_sub['value_over_replacement_player']


In [30]:
joined_table_2022['value_over_replacement_player'] = joined_table_2022_sub['adjusted_vorp']

In [31]:
data_path_2022 = os.path.dirname(os.getcwd()) + '/data' + '/data_2022.csv'
joined_table_2022.to_csv(data_path_2022, index=False)

In [32]:
data_path_2022

'/Users/dbtjdals/Desktop/nba_mvp_project/data/data_2022.csv'

In [33]:
joined_table_2022

Unnamed: 0,Player,games_played,team,PTS,AST,STL,BLK,FG%,3P%,TRB,Tm,W,W/L%,seed,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,total_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_minutes,offensive_box_plus_minus,defensive_box_plus_minus,box_plus_minus,value_over_replacement_player
17,Giannis Antetokounmpo,47,milwaukee bucks,28.957447,6.021277,0.957447,1.425532,0.540603,0.298343,11.191489,MIL,35,0.603,5,31.8,0.623,0.21,0.606,6.1,29.7,18.1,34.1,1.4,4.3,12.5,35.1,6.4,2.7,9.0,0.284,7.5,3.5,11.0,8.723404
66,Devin Booker,49,phoenix suns,25.510204,4.387755,0.959184,0.326531,0.446516,0.369318,5.285714,PHO,46,0.821,1,20.0,0.555,0.345,0.239,2.6,13.7,8.3,20.7,1.4,0.9,9.2,32.2,2.5,2.3,4.8,0.139,3.1,-0.3,2.8,3.346939
100,Jimmy Butler,38,miami heat,21.736842,6.026316,1.815789,0.447368,0.491007,0.205882,6.236842,MIA,37,0.649,1,25.8,0.601,0.122,0.536,6.4,14.3,10.4,29.7,2.7,1.4,10.5,26.5,5.1,2.0,7.1,0.264,5.7,2.4,8.1,6.905263
153,Stephen Curry,53,golden state warriors,25.811321,6.339623,1.396226,0.396226,0.425409,0.381098,5.339623,GSW,42,0.724,2,21.7,0.594,0.631,0.245,1.9,14.3,8.3,30.7,2.0,1.1,13.1,31.3,4.1,3.0,7.1,0.185,5.7,0.5,6.2,5.879245
161,DeMar DeRozan,54,chicago bulls,27.944444,5.12963,0.851852,0.314815,0.515464,0.336735,5.203704,CHI,37,0.638,2,24.7,0.6,0.092,0.408,2.6,13.5,8.1,24.6,1.2,0.8,9.0,31.7,6.3,1.3,7.6,0.19,4.3,-1.0,3.3,3.948148
196,Joel Embiid,44,philadelphia 76ers,29.545455,4.454545,1.045455,1.477273,0.492415,0.361963,11.159091,PHI,34,0.607,4,31.8,0.607,0.19,0.569,6.9,30.2,18.7,25.6,1.6,4.0,10.7,37.3,5.3,2.7,8.0,0.264,7.6,2.3,9.9,8.2
399,Nikola Jokic,51,denver nuggets,25.823529,7.882353,1.352941,0.745098,0.570295,0.366516,13.745098,DEN,32,0.561,6,32.6,0.654,0.251,0.321,9.7,36.0,23.1,43.4,2.0,2.1,16.0,31.4,7.4,3.1,10.4,0.297,9.3,4.6,14.0,10.933333
527,Ja Morant,45,memphis grizzlies,26.422222,6.777778,1.244444,0.377778,0.493407,0.334975,5.866667,MEM,40,0.69,3,24.6,0.571,0.223,0.327,4.3,14.5,9.3,34.8,1.8,1.0,13.0,33.2,3.7,1.6,5.3,0.17,5.9,-0.2,5.7,5.284444
577,Chris Paul,56,phoenix suns,14.964286,10.696429,1.875,0.321429,0.486111,0.335227,4.535714,PHO,46,0.821,1,21.1,0.577,0.272,0.275,0.9,13.2,7.2,44.2,2.7,0.9,15.4,19.9,5.3,3.2,8.5,0.219,3.1,2.2,5.3,4.978571
