# Scraping Data from [Basketball Reference](https://www.basketball-reference.com/)

This part of the project will be scraping data from the Basketball Reference website. I'll be scraping data from the past 20 years. The order of my scraping will be:
- MVP Data: I'm scraping this data because past finishes in the MVP table would be useful in building a machine learning algorithm to predict the 2023 MVP. 
- Player Statistics: Individual player statistics over time are also a good indicator of the level of player and their standing in the MVP race. 
- Team Standings data: Overall team performance is also a great indicator of who the MVP is. The MVP is usally on a winning team or playoff team. 

## Scraping the MVP standings from the past 20 years

In [29]:
# Import Packages for Scraping

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [8]:
years = list(range(1992, 2023))

In [11]:
# Source url for scraping data from basketball reference
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [17]:
# Scraping the MVP page
for year in years:
    url = url_start.format(year)  # grabs url for each year
    
    data = requests.get(url)  # grabbing data
    
    with open("mvps/{}.html".format(year), "w+") as f:
        f.write(data.text)  #

In [22]:
# Parsing the pages scraped
with open("mvps/1992.html") as f:
    page = f.read()


In [23]:
# Parsing the data
soup = BeautifulSoup(page, "html.parser")

In [26]:
soup.find("tr", class_ = "over_header").decompose()  # this removes the extra header row from the element that we are using. 

In [28]:
mvp_table = soup.find(id = "mvp")

In [32]:
# read the data into pandas 

mvp_1992 = pd.read_html(str(mvp_table))[0]

In [33]:
mvp_1992

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Michael Jordan,28,CHI,80.0,900.0,960,0.938,80,38.8,30.1,6.4,6.1,2.3,0.9,0.519,0.27,0.832,17.7,0.274
1,2,Clyde Drexler,29,POR,12.0,561.0,960,0.584,76,36.2,25.0,6.6,6.7,1.8,0.9,0.47,0.337,0.794,12.8,0.223
2,3,David Robinson,26,SAS,2.0,337.0,960,0.351,68,37.7,23.2,12.2,2.7,2.3,4.5,0.551,0.125,0.701,13.9,0.26
3,4,Karl Malone,28,UTA,1.0,262.0,960,0.273,81,37.7,28.0,11.2,3.0,1.3,0.6,0.526,0.176,0.778,15.1,0.237
4,5,Patrick Ewing,29,NYK,0.0,100.0,960,0.104,82,38.4,24.0,11.2,1.9,1.1,3.0,0.522,0.167,0.738,13.0,0.198
5,6,Chris Mullin,28,GSW,0.0,81.0,960,0.084,81,41.3,25.6,5.6,3.5,2.1,0.8,0.524,0.366,0.833,10.8,0.155
6,7,Mark Price,27,CLE,0.0,66.0,960,0.069,72,29.7,17.3,2.4,7.4,1.3,0.2,0.488,0.387,0.947,9.1,0.205
7,8,Tim Hardaway,25,GSW,0.0,64.0,960,0.067,81,41.1,23.4,3.8,10.0,2.0,0.2,0.461,0.338,0.766,9.2,0.132
8,9,Scottie Pippen,26,CHI,1.0,32.0,960,0.033,82,38.6,21.0,7.7,7.0,1.9,1.1,0.506,0.2,0.76,12.7,0.192
9,10,Dennis Rodman,30,DET,0.0,26.0,960,0.027,82,40.3,9.8,18.7,2.3,0.8,0.9,0.539,0.317,0.6,12.6,0.183


In [39]:
# Pulling in all of the seasons

dfs = []
for year in years:
    with open("mvps/{}.html".format(year)) as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="over_header").decompose()
    mvp_table = soup.find_all(id="mvp")[0]
    mvp_df = pd.read_html(str(mvp_table))[0]
    mvp_df["Year"] = year
    
    # append the df
    dfs.append(mvp_df)

In [46]:
# checking to see if the dfs were populated into the list
dfs[0]

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,28,CHI,80.0,900.0,960,0.938,80,38.8,...,6.4,6.1,2.3,0.9,0.519,0.27,0.832,17.7,0.274,1992
1,2,Clyde Drexler,29,POR,12.0,561.0,960,0.584,76,36.2,...,6.6,6.7,1.8,0.9,0.47,0.337,0.794,12.8,0.223,1992
2,3,David Robinson,26,SAS,2.0,337.0,960,0.351,68,37.7,...,12.2,2.7,2.3,4.5,0.551,0.125,0.701,13.9,0.26,1992
3,4,Karl Malone,28,UTA,1.0,262.0,960,0.273,81,37.7,...,11.2,3.0,1.3,0.6,0.526,0.176,0.778,15.1,0.237,1992
4,5,Patrick Ewing,29,NYK,0.0,100.0,960,0.104,82,38.4,...,11.2,1.9,1.1,3.0,0.522,0.167,0.738,13.0,0.198,1992
5,6,Chris Mullin,28,GSW,0.0,81.0,960,0.084,81,41.3,...,5.6,3.5,2.1,0.8,0.524,0.366,0.833,10.8,0.155,1992
6,7,Mark Price,27,CLE,0.0,66.0,960,0.069,72,29.7,...,2.4,7.4,1.3,0.2,0.488,0.387,0.947,9.1,0.205,1992
7,8,Tim Hardaway,25,GSW,0.0,64.0,960,0.067,81,41.1,...,3.8,10.0,2.0,0.2,0.461,0.338,0.766,9.2,0.132,1992
8,9,Scottie Pippen,26,CHI,1.0,32.0,960,0.033,82,38.6,...,7.7,7.0,1.9,1.1,0.506,0.2,0.76,12.7,0.192,1992
9,10,Dennis Rodman,30,DET,0.0,26.0,960,0.027,82,40.3,...,18.7,2.3,0.8,0.9,0.539,0.317,0.6,12.6,0.183,1992


In [47]:
# Combining mvp votes
mvps = pd.concat(dfs)

In [52]:
# checking to see if combining dfs worked
mvps.tail()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
7,8,Stephen Curry,33,GSW,0.0,4.0,1000,0.004,64,34.5,...,5.2,6.3,1.3,0.4,0.437,0.38,0.923,8.0,0.173,2022
8,9,Chris Paul,36,PHO,0.0,2.0,1000,0.002,65,32.9,...,4.4,10.8,1.9,0.3,0.493,0.317,0.837,9.4,0.21,2022
9,10T,DeMar DeRozan,32,CHI,0.0,1.0,1000,0.001,76,36.1,...,5.2,4.9,0.9,0.3,0.504,0.352,0.877,8.8,0.154,2022
10,10T,Kevin Durant,33,BRK,0.0,1.0,1000,0.001,55,37.2,...,7.4,6.4,0.9,0.9,0.518,0.383,0.91,8.4,0.198,2022
11,10T,LeBron James,37,LAL,0.0,1.0,1000,0.001,56,37.2,...,8.2,6.2,1.3,1.1,0.524,0.359,0.756,7.5,0.172,2022


In [53]:
# Creating a csv file with our mvps data
mvps.to_csv("mvps.csv")

# Using Selenium to scrape player statistics from the past 20 years.
Selenium is needed because the basketball reference uses Javascript to render various parts of the table. 

In [55]:
# Gathering statistics for each player
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

# Downloading stats for a single year, 1992
url = player_stats_url.format(1992)
data = requests.get(url)
with open(("player/1992.html"), "w+") as f:
    f.write(data.text)

In [68]:
# Import Webdriver to render javascript pages
from selenium import webdriver
driver = webdriver.Chrome(executable_path="/Users/drewpy/Downloads/chromedriver3")

  driver = webdriver.Chrome(executable_path="/Users/drewpy/Downloads/chromedriver3")


In [69]:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# install selenium chrome driver from https://chromedriver.chromium.org/downloads
# xattr -d com.apple.quarantine chromedriver

In [70]:
import time 
year = 1992

url = player_stats_url.format(year)
driver.get(url)
driver.execute_script("window.scrollTo(1, 10000)")

time.sleep(2)

html = driver.page_source

In [72]:
with open("player/{}.html".format(year), "w+") as f:
    f.write(html)

In [79]:
# Write all of this information to a file
for year in years:

    url = player_stats_url.format(year)
    driver.get(url)
    driver.execute_script("window.scrollTo(1, 10000)")
    time.sleep(2)
    
    html = driver.page_source
    
    with open("player/{}.html".format(year), "w+") as f:
        f.write(html)

In [81]:
# Parsing statistics with BS
dfs =[]

for year in years:
    with open("player/{}.html".format(year)) as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose()
    player_table = soup.find_all(id="per_game_stats")[0]
    player_df = pd.read_html(str(player_table))[0]
    player_df["Year"] = year
    
    dfs.append(player_df)

In [91]:
# Combining df's usihng pandas
players = pd.concat(dfs)
players.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
836,601,Thaddeus Young,PF,33,TOR,26,0,18.3,2.6,5.5,...,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,2022
837,602,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2022
838,603,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,2022
839,604,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,2022
840,605,Ivica Zubac,C,24,LAC,76,76,24.4,4.1,6.5,...,2.9,5.6,8.5,1.6,0.5,1.0,1.5,2.7,10.3,2022


In [92]:
# Write the players file to csv
players.to_csv("players.csv")

## Scraping Team Standings from the Past 20 years from the various divisions in the NBA


In [94]:
# Demo scrape for teams
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"
year = 1992
url = team_stats_url.format(year)

In [95]:
data = requests.get(url)

In [97]:
with open("team/{}.html".format(year), "w+") as f:
    f.write(data.text)

In [98]:
# Loop through the data to grab the division standings tables

for year in years:
    url = team_stats_url.format(year)
    data = requests.get(url)
    
    with open("team/{}.html".format(year), "w+") as f:
        f.write(data.text)

In [112]:
# Parsing team data with BS4
dfs = []

for year in years:
    
    with open("team/{}.html".format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose()
    e_table = soup.find_all(id="divs_standings_E")
    e_df = pd.read_html(str(e_table))[0]
    e_df["Year"] = year
    e_df["Team"] = e_df["Eastern Conference"]
    del e_df["Eastern Conference"]
    dfs.append(e_df)

    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose()
    w_table = soup.find_all(id="divs_standings_W")
    w_df = pd.read_html(str(w_table))[0]
    w_df["Year"] = year
    w_df["Team"] = w_df["Western Conference"]
    del w_df["Western Conference"]
    dfs.append(w_df)
    
    
    

In [113]:
# Checking the output
dfs[0]

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,51,31,.622,—,106.6,103.0,3.41,1992,Boston Celtics*
1,51,31,.622,—,101.6,97.7,3.67,1992,New York Knicks*
2,40,42,.488,11.0,105.4,107.1,-1.54,1992,New Jersey Nets*
3,38,44,.463,13.0,105.0,109.2,-3.94,1992,Miami Heat*
4,35,47,.427,16.0,101.9,103.2,-1.34,1992,Philadelphia 76ers
5,25,57,.305,26.0,102.4,106.8,-4.35,1992,Washington Bullets
6,21,61,.256,30.0,101.6,108.5,-6.52,1992,Orlando Magic
7,Central Division,Central Division,Central Division,Central Division,Central Division,Central Division,Central Division,1992,Central Division
8,67,15,.817,—,109.9,99.5,10.07,1992,Chicago Bulls*
9,57,25,.695,10.0,108.9,103.4,5.34,1992,Cleveland Cavaliers*


In [114]:
# Concatenate the df's into one table
teams = pd.concat(dfs)
teams.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,51,31,0.622,—,106.6,103.0,3.41,1992,Boston Celtics*
1,51,31,0.622,—,101.6,97.7,3.67,1992,New York Knicks*
2,40,42,0.488,11.0,105.4,107.1,-1.54,1992,New Jersey Nets*
3,38,44,0.463,13.0,105.0,109.2,-3.94,1992,Miami Heat*
4,35,47,0.427,16.0,101.9,103.2,-1.34,1992,Philadelphia 76ers


In [115]:
# Write to csv

teams.to_csv("teams.csv")