### Install Libraries

In [1]:
!pip install requests
!pip install beautifulsoup4
!pip install pandas
!pip install selenium

import requests
from bs4 import BeautifulSoup
import pandas as pd
from os import stat
from time import sleep
from selenium import webdriver



### Iterate through NBA seasons

In [2]:
years = range(1991, 2022) # Years 1991-2021

In [3]:
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

In [4]:
for year in years:
    url = url_start.format(year)
    
    # Anti-timeout
    if stat("mvp/{}.html".format(year)).st_size != 0:
        print(year, "exists and is not empty.")
        continue
    else: 
        data = requests.get(url)

        with open("mvp/{}.html".format(year), "w+", encoding="utf-8") as f:
                f.write(data.text)
        
        # Anti-timeout
        sleep(2)
        

1991 exists and is not empty.
1992 exists and is not empty.
1993 exists and is not empty.
1994 exists and is not empty.
1995 exists and is not empty.
1996 exists and is not empty.
1997 exists and is not empty.
1998 exists and is not empty.
1999 exists and is not empty.
2000 exists and is not empty.
2001 exists and is not empty.
2002 exists and is not empty.
2003 exists and is not empty.
2004 exists and is not empty.
2005 exists and is not empty.
2006 exists and is not empty.
2007 exists and is not empty.
2008 exists and is not empty.
2009 exists and is not empty.
2010 exists and is not empty.
2011 exists and is not empty.
2012 exists and is not empty.
2013 exists and is not empty.
2014 exists and is not empty.
2015 exists and is not empty.
2016 exists and is not empty.
2017 exists and is not empty.
2018 exists and is not empty.
2019 exists and is not empty.
2020 exists and is not empty.
2021 exists and is not empty.


### Parse with BeautifulSoup

In [5]:
dfs = []

for year in years:
    with open("mvp/{}.html".format(year), "r", encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Remove extraneous info
    try:
        soup.find('tr', class_="over_header").decompose()
    except:
        print(year, 'skipped.')
        continue

    # Populate list of dataframes
    mvp_table = soup.find(id="mvp")
    mvp = pd.read_html(str(mvp_table))[0]
    mvp["Year"] = year

    dfs.append(mvp)

### Concat into one DataFrame

In [6]:
mvps = pd.concat(dfs)

In [7]:
mvps.head()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225,1991


### Save to CSV

In [8]:
mvps.to_csv("mvps.csv")

### Initialize Selenium WebDriver

In [9]:
driver = webdriver.Chrome(executable_path="/Users/CEdmu/Jupyter/NBA Stats/chromedriver")

  driver = webdriver.Chrome(executable_path="/Users/CEdmu/Jupyter/NBA Stats/chromedriver")


### Iterate through NBA Players

In [10]:
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

for year in years:
    url = player_stats_url.format(year)

    driver.get(url)
    
    # Javascript handling
    driver.execute_script("window.scrollTo(1,10000)")
    sleep(2)

    html = driver.page_source
    
    with open("player/{}.html".format(year), "w+", encoding="utf-8") as f:
        f.write(html)
    
    # Anti-timeout
    sleep(2)

# Close Webdriver
driver.quit()

### Parse with BeautifulSoup

In [11]:
dfs = []
for year in years:
    with open("player/{}.html".format(year), "r", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Remove extraneous info
    try:
        soup.find('tr', class_="thead").decompose()
    except:
        print(year, 'skipped.')
        continue

    # Populate list of dataframes
    player_table = soup.find(id="per_game_stats")
    player = pd.read_html(str(player_table))[0]
    player["Year"] = year 
    dfs.append(player)

In [12]:
players = pd.concat(dfs)

In [13]:
players

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,536,Delon Wright,PG,28,SAC,27,8,25.8,3.9,8.3,...,1.0,2.9,3.9,3.6,1.6,0.4,1.3,1.1,10.0,2021
726,537,Thaddeus Young,PF,32,CHI,68,23,24.3,5.4,9.7,...,2.5,3.8,6.2,4.3,1.1,0.6,2.0,2.2,12.1,2021
727,538,Trae Young,PG,22,ATL,63,63,33.7,7.7,17.7,...,0.6,3.3,3.9,9.4,0.8,0.2,4.1,1.8,25.3,2021
728,539,Cody Zeller,C,28,CHO,48,21,20.9,3.8,6.8,...,2.5,4.4,6.8,1.8,0.6,0.4,1.1,2.5,9.4,2021


### Save to CSV

In [14]:
players.to_csv("players.csv")

### Iterate through NBA teams

In [15]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [16]:
for year in years:
    url = team_stats_url.format(year)

    data = requests.get(url)

    with open("team/{}.html".format(year), "w+", encoding="utf-8") as f:
        f.write(data.text)
        
    # Anti-timeout
    sleep(2)

### Parse with BeautifulSoup

In [17]:
dfs = []
for year in years:
    with open("team/{}.html".format(year), "r", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # Remove extraneous info
    try:
        soup.find('tr', class_="thead").decompose()
    except:
        print(year, 'skipped.')
        continue

    # Populate list of dataframes
    team_table = soup.find(id="divs_standings_E")
    team = pd.read_html(str(team_table))[0]
    team["Year"] = year 
    team["Team"] = team["Eastern Conference"]
    del team ["Eastern Conference"]
    dfs.append(team)

    # Remove extraneous info
    try:
        soup.find('tr', class_="thead").decompose()
    except:
        print(year, 'skipped.')
        continue

    # Populate list of dataframes
    team_table = soup.find(id="divs_standings_W")
    team = pd.read_html(str(team_table))[0]
    team["Year"] = year 
    team["Team"] = team["Western Conference"]
    del team ["Western Conference"]
    dfs.append(team)

In [18]:
teams = pd.concat(dfs)

In [19]:
teams

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets
...,...,...,...,...,...,...,...,...,...
13,42,30,.583,—,112.4,110.2,2.26,2021,Dallas Mavericks*
14,38,34,.528,4.0,113.3,112.3,1.07,2021,Memphis Grizzlies*
15,33,39,.458,9.0,111.1,112.8,-1.58,2021,San Antonio Spurs
16,31,41,.431,11.0,114.6,114.9,-0.20,2021,New Orleans Pelicans


### Save to CSV

In [20]:
teams.to_csv("teams.csv")