
## Get historical MVP voting and stats dataset


In [78]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import requests
import re
# from selenium import webdriver
from collections import defaultdict

# data sourced from https://www.basketball-reference.com
# web scraper guide: https://towardsdatascience.com/predicting-2018-19-nbas-most-valuable-player-using-machine-learning-512e577032e3

def _get_team_win_pct(url_team):
    """Return a team's win fraction scraped from its season page.

    Scans the page's paragraphs for the first one that contains the word
    "Record" together with a ``wins-losses`` pair and returns
    ``wins / (wins + losses)``.

    Args:
        url_team: Absolute URL of the team's season page.

    Returns:
        The win fraction as a float, or 0.0 when no usable record is found
        or the record is 0-0.

    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    res = requests.get(url_team)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    for p in soup.find_all("p"):
        if "Record" not in p.text:
            continue
        # Record is written as "XX-XX" (wins-losses).
        match = re.search(r"(\d+)-(\d+)", p.text)
        if match is None:
            continue
        wins, losses = float(match.group(1)), float(match.group(2))
        games = wins + losses
        return wins / games if games else 0.0

    return 0.0


def get_voting_and_team_stats(url, season):
    """Scrape one season's MVP voting table into column lists.

    For every ``td`` cell of the table with id ``mvp`` the cell's
    ``data-stat`` attribute selects the output column:

    * ``age`` is skipped.
    * ``team_id`` is replaced by a ``win_pct`` value read from the linked
      team page (win percentage is used because lockout seasons did not
      have the standard 82 games).
    * ``player`` stores the player name and also appends every key returned
      by ``get_advanced_stats`` for the linked player page.
    * every other cell is stored as a float, with blank cells read as 0.

    Args:
        url: URL of the awards page to scrape.
        season: Season ending year as a string (e.g. ``"2020"``); passed on
            to ``get_advanced_stats``.

    Returns:
        A ``defaultdict(list)`` mapping column name to a list of values.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        ValueError: If the page has no element with id ``mvp``.
    """
    base_url = "https://www.basketball-reference.com"

    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    table = soup.find(attrs={"id": "mvp"})
    if table is None:
        raise ValueError(f"no MVP voting table found at {url}")

    players = defaultdict(list)

    for row in table.find_all("tr"):
        for td in row.find_all("td"):
            stat = td.attrs.get("data-stat")

            # Cells without a data-stat cannot be mapped to a column, and
            # the age column is not wanted.
            if stat is None or stat == "age":
                continue

            if stat == "team_id":
                link = td.find("a")
                if link is None or not link.get("href"):
                    # The player played for two or more teams, so there is
                    # no link to a single team's season page.
                    players["win_pct"].append(0.0)
                else:
                    # Always append exactly one value so that every column
                    # keeps one entry per player row.
                    players["win_pct"].append(
                        _get_team_win_pct(base_url + link["href"])
                    )
                continue

            if stat == "player":
                # Advanced stats come from the player's own page.
                url_player = base_url + td.find("a")["href"]
                advanced = get_advanced_stats(url_player, season)
                for key in advanced:
                    players[key].append(advanced[key])
                players[stat].append(td.get_text())
                continue

            # MVP voting data and general stats (ppg, apg, spg, etc.).
            players[stat].append(float(td.get_text() or "0"))

    return players


def get_advanced_stats(url, season):
    """Scrape a player's advanced stats for one season.

    Looks inside the element with id ``all_advanced`` for a string/comment
    child containing ``table_container``, parses that child as HTML and
    reads the row whose id is ``"advanced." + season``. If no such child
    exists, the rows directly under the element are searched instead.

    Args:
        url: URL of the player page.
        season: Season ending year as a string (e.g. ``"2020"``).

    Returns:
        A dict with ``season`` (formatted like ``"2019-20"``) and the float
        values ``per``, ``ts_pct``, ``usg_pct``, ``bpm`` and ``vorp``.
        Blank cells are read as 0. The dict is empty when no row matches
        the requested season.

    Raises:
        requests.HTTPError: If the request returns an error status.
        ValueError: If the page has no element with id ``all_advanced``.
    """
    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    table = soup.find(attrs={"id": "all_advanced"})
    if table is None:
        raise ValueError(f"no 'all_advanced' section found at {url}")

    id_year = "advanced." + season

    rows = []
    for child in table.children:
        # Only string-like children (text/comment nodes) can carry the
        # table markup as text; tags are handled by the fallback below.
        if isinstance(child, str) and "table_container" in child:
            rows = BeautifulSoup(child, "html.parser").find_all("tr")

    if not rows:
        # Fallback for pages where the table is regular markup.
        rows = table.find_all("tr")

    data = {}
    stat_names = ("per", "ts_pct", "usg_pct", "bpm", "vorp")

    for row in rows:
        # Only the row for the requested season is of interest.
        if row.attrs.get("id") != id_year:
            continue

        data["season"] = str(int(season) - 1) + "-" + season[-2:]
        for stat in stat_names:
            cell = row.find("td", attrs={"data-stat": stat})
            # Blank cells are treated as 0, like the voting-table scraper.
            data[stat] = float(cell.text or "0")

    return data



# Seasons are identified by their ending year: 1981 (1980-81) up to and
# including 2020 (2019-20).
seasons = range(1981, 2021)

# Column name -> values accumulated over every scraped season.
historical = defaultdict(list)

for season in seasons:
    season_label = str(season)
    url = f"https://www.basketball-reference.com/awards/awards_{season_label}.html"
    temp = get_voting_and_team_stats(url, season_label)
    for key, values in temp.items():
        historical[key].extend(values)

# Save to a csv file, with the player name moved to the first column.
df = pd.DataFrame(historical)
player_column = df.pop("player")
df.insert(0, "player", player_column)
df.to_csv("historical.csv")

## Get current stats dataset based on the most recent NBA MVP ladder

In [201]:
# similar to structure above, but need general and advanced stats

def get_general_and_advanced_stats(url, season):
    """Scrape a player's per-game, team and advanced stats for one season.

    Reads the row with id ``"per_game." + season`` from the element with id
    ``all_per_game``, follows that row's team link to compute ``win_pct``,
    then reads the row with id ``"advanced." + season`` from the element
    with id ``all_advanced``. The player page is downloaded once and used
    for both tables.

    Args:
        url: URL of the player page. It is printed before being fetched.
        season: Season ending year as a string (e.g. ``"2021"``).

    Returns:
        A ``defaultdict(list)`` holding scalar values: the per-game stats,
        ``win_pct``, ``season`` (formatted like ``"2020-21"``) and the
        advanced stats. Blank cells are read as 0. Keys for a table are
        absent when that table has no row for the requested season.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        ValueError: If the page lacks the ``all_per_game`` or
            ``all_advanced`` element.
    """
    print(url)

    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    player = defaultdict(list)

    per_game_stats = (
        "g", "mp_per_g", "pts_per_g", "trb_per_g", "ast_per_g",
        "stl_per_g", "blk_per_g", "fg_pct", "fg3_pct", "ft_pct",
    )
    advanced_stats = (
        "per", "ts_pct", "usg_pct", "bpm", "vorp", "ws", "ws_per_48",
    )

    # --- general (per-game) stats ---
    table = soup.find(attrs={"id": "all_per_game"})
    if table is None:
        raise ValueError(f"no 'all_per_game' section found at {url}")
    id_year = "per_game." + season

    for row in table.find_all("tr"):
        if row.attrs.get("id") != id_year:
            continue

        for stat in per_game_stats:
            cell = row.find("td", attrs={"data-stat": stat})
            # Blank cells (e.g. no three-point attempts) are read as 0.
            player[stat] = float(cell.text or "0")

        # Default used when there is no team link or no record is found.
        win_pct = 0.0
        team_cell = row.find("td", attrs={"data-stat": "team_id"})
        team_link = team_cell.find("a") if team_cell is not None else None
        if team_link is not None and team_link.get("href"):
            url_team = "https://www.basketball-reference.com" + team_link["href"]
            # Separate names keep the player page's soup intact for the
            # advanced-stats section below.
            team_res = requests.get(url_team)
            team_res.raise_for_status()
            team_soup = BeautifulSoup(team_res.text, "html.parser")

            for p in team_soup.find_all("p"):
                if "Record" not in p.text:
                    continue
                # Record is written as "XX-XX" (wins-losses).
                match = re.search(r"(\d+)-(\d+)", p.text)
                if match is None:
                    continue
                wins, losses = float(match.group(1)), float(match.group(2))
                if wins + losses:
                    win_pct = wins / (wins + losses)
                break
        player["win_pct"] = win_pct

    # --- advanced stats ---
    table = soup.find(attrs={"id": "all_advanced"})
    if table is None:
        raise ValueError(f"no 'all_advanced' section found at {url}")
    id_year = "advanced." + season

    rows = []
    for child in table.children:
        # Only string-like children (text/comment nodes) can carry the
        # table markup as text.
        if isinstance(child, str) and "table_container" in child:
            rows = BeautifulSoup(child, "html.parser").find_all("tr")

    if not rows:
        # Fallback for pages where the table is regular markup.
        rows = table.find_all("tr")

    for row in rows:
        if row.attrs.get("id") != id_year:
            continue

        player["season"] = str(int(season) - 1) + "-" + season[-2:]
        for stat in advanced_stats:
            cell = row.find("td", attrs={"data-stat": stat})
            player[stat] = float(cell.text or "0")

    return player


def get_param(name):
    """Build the URL pieces for a player's basketball-reference page.

    The slug is the first five letters of the surname followed by the first
    two letters of the given name, all lower-cased. Non-letter characters
    (apostrophes, hyphens, periods) are dropped, every word after the given
    name is treated as part of the surname, and a trailing generational
    suffix (Jr, Sr, II, III, IV) is ignored. Accented letters are kept
    as-is.

    This is a heuristic: the caller still has to append the numeric
    suffix, and the resulting URL is not guaranteed to exist.

    Args:
        name: Full player name, e.g. ``"Nikola Jokic"``.

    Returns:
        A dict with ``"first"`` (the surname's lower-cased initial) and
        ``"second"`` (the lower-cased slug, e.g. ``"jokicni"``).

    Raises:
        ValueError: If ``name`` has no given name plus surname.
    """
    suffixes = {"jr", "sr", "ii", "iii", "iv"}

    # split() without an argument tolerates repeated or surrounding spaces.
    words = ["".join(ch for ch in word if ch.isalpha()) for word in name.split()]
    words = [word for word in words if word]

    # Drop generational suffixes, but never the surname itself.
    while len(words) > 2 and words[-1].lower() in suffixes:
        words.pop()

    if len(words) < 2:
        raise ValueError(
            f"expected a given name and a surname, got {name!r}"
        )

    given = words[0]
    surname = "".join(words[1:])

    return {
        "first": surname[0].lower(),
        "second": (surname[:5] + given[:2]).lower(),
    }


# Players on NBA.com's MVP ladder as of April 2, 2021.
players = ["Nikola Jokic", "Damian Lillard", "James Harden", "Giannis Antetokounmpo", "Luka Doncic",
           "LeBron James", "Joel Embiid", "Kawhi Leonard", "Kyrie Irving", "Devin Booker"]
season = "2021"

current = defaultdict(list)
frames = []

# Scrape each player into a one-row frame; the frames are concatenated
# afterwards so that a single csv can be written.
for player in players:
    params = get_param(player)
    url = "https://www.basketball-reference.com/players/{first}/{second}01.html".format(**params)
    stats = get_general_and_advanced_stats(url, str(season))
    stats["player"] = player
    frames.append(pd.DataFrame(stats, index=[0]))

# Move the player name to the first column before saving.
df = pd.concat(frames)
player_column = df.pop("player")
df.insert(0, "player", player_column)
df.to_csv("current.csv")

https://www.basketball-reference.com/players/j/jokicni01.html
https://www.basketball-reference.com/players/l/lillada01.html
https://www.basketball-reference.com/players/h/hardeja01.html
https://www.basketball-reference.com/players/a/antetgi01.html
https://www.basketball-reference.com/players/d/doncilu01.html
https://www.basketball-reference.com/players/j/jamesle01.html
https://www.basketball-reference.com/players/e/embiijo01.html
https://www.basketball-reference.com/players/l/leonaka01.html
https://www.basketball-reference.com/players/i/irvinky01.html
https://www.basketball-reference.com/players/b/bookede01.html
