This notebook scrapes advanced NBA player stats from https://www.nba.com/stats/players/advanced?PerMode=Totals&Season=2022-23 using a combination of Beautiful Soup and Selenium.

In [1]:
import random
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from sqlalchemy import create_engine

from Keyfile import key_fun  # Custom file to hide PostgreSQL server information

In [2]:
# Load the PostgreSQL connection settings from the local key file.
key = key_fun()

# Percent-encode the credentials so that reserved URL characters in them
# (such as '@', ':' or '/') cannot corrupt the connection string.
engine = create_engine(
    'postgresql://' + quote(key['username'], safe='') + ':' + quote(key['pwd'], safe='')
    + '@localhost:' + key['port_id'] + '/' + key['database']
)

In [3]:
# Start the Selenium-controlled Chrome session used for every page load below.
driver = webdriver.Chrome()

In [4]:
# Open the 2022-23 advanced player stats page in the browser; the next cell
# reads the column headers from the page this loads.
url = "https://www.nba.com/stats/players/advanced?PerMode=Totals&Season=2022-23"
driver.get(url)

In [5]:
# Parse the page currently loaded in the browser and locate the stats table.
src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find("table", attrs={"class": "Crom_table__p1iZz"})
if table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise RuntimeError(
        "Stats table not found; the page may not have finished rendering "
        "or its markup may have changed."
    )

headers = table.find_all('th')
# Skip the first header cell, drop every header containing 'RANK', turn
# non-breaking spaces into underscores and keep the first 23 names.
headerlist = [
    text.replace('\xa0', '_')
    for text in (h.text.strip() for h in headers[1:])
    if 'RANK' not in text
][:23]
# Map each column position to its header name for the scraping loop.
header_index = dict(enumerate(headerlist))
header_index

{0: 'PLAYER',
 1: 'TEAM',
 2: 'AGE',
 3: 'GP',
 4: 'W',
 5: 'L',
 6: 'MIN',
 7: 'OFFRTG',
 8: 'DEFRTG',
 9: 'NETRTG',
 10: 'AST%',
 11: 'AST/TO',
 12: 'AST_RATIO',
 13: 'OREB%',
 14: 'DREB%',
 15: 'REB%',
 16: 'TO_RATIO',
 17: 'EFG%',
 18: 'TS%',
 19: 'USG%',
 20: 'PACE',
 21: 'PIE',
 22: 'POSS'}

In [6]:
# One empty list per scraped column, with a trailing SEASON column that
# records which season each row came from.
advanced_data = {column: [] for column in [*headerlist, 'SEASON']}

In [7]:
# Scrape the advanced player stats table for every season from 1996-97 to
# 2022-23 and append each table row to the column lists in advanced_data.

# Upper bound on refresh-and-retry attempts per season, so that a changed
# page layout ends in an error instead of an endless loop.
MAX_LOAD_ATTEMPTS = 20
# Absolute XPath of the dropdown (<select>) located on each season page.
DROPDOWN_XPATH = "/html/body/div[1]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select"

# Number of stat columns kept per table row; each row additionally carries
# one leading cell that is not stored.
n_columns = len(header_index)

seasons = [str(year) + '-' + str(year + 1)[2:] for year in range(1996, 2023)]
for season in seasons:
    # Random pause between page loads (presumably to be polite to the server).
    time.sleep(random.randint(1, 4) + random.random())

    if season == '2014-15':
        # NOTE(review): 2014-15 alone is requested with PerMode=MinutesPer
        # instead of Totals; kept unchanged -- confirm this is intentional.
        url = "https://www.nba.com/stats/players/advanced?PerMode=MinutesPer&Season=2014-15&SeasonType=Regular+Season"
    else:
        url = "https://www.nba.com/stats/players/advanced?PerMode=Totals&Season=" + season

    driver.get(url)

    # The dropdown may not be present yet (presumably while the page is
    # still rendering): refresh and retry, but only a bounded number of times.
    for _ in range(MAX_LOAD_ATTEMPTS):
        try:
            select = Select(driver.find_element(By.XPATH, DROPDOWN_XPATH))
            break
        except NoSuchElementException:
            time.sleep(random.randint(1, 5) + random.random())
            driver.refresh()
    else:
        raise RuntimeError(
            "Dropdown not found for season " + season + " after "
            + str(MAX_LOAD_ATTEMPTS) + " attempts"
        )

    # Pick the first dropdown option -- presumably "All", so that every
    # player is rendered on a single page; TODO confirm.
    select.select_by_index(0)
    src = driver.page_source
    parser = BeautifulSoup(src, 'lxml')
    table = parser.find("table", attrs={"class": "Crom_table__p1iZz"})
    if table is None:
        raise RuntimeError("Stats table not found for season " + season)

    # Walk the table row by row so that one malformed row cannot silently
    # shift every later value into the wrong column.
    for table_row in table.find_all('tr'):
        cells = table_row.find_all('td')
        if not cells:
            # Rows without <td> cells (e.g. the header row) carry no data.
            continue
        if len(cells) != n_columns + 1:
            raise ValueError(
                "Unexpected number of cells (" + str(len(cells))
                + ") in a table row for season " + season
            )
        # cells[0] is the leading cell that is not stored (presumably the
        # row number); the remaining cells line up with header_index.
        for position, cell in enumerate(cells[1:]):
            advanced_data[header_index[position]].append(cell.text.strip())
        advanced_data['SEASON'].append(season)

In [8]:
# Assemble the per-column lists into a DataFrame and preview its first rows.
# NOTE(review): every value is the stripped cell text, so the columns hold
# strings rather than numbers.
df_advanced = pd.DataFrame(advanced_data)
df_advanced.head(n=5)

Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,OFFRTG,DEFRTG,NETRTG,...,DREB%,REB%,TO_RATIO,EFG%,TS%,USG%,PACE,PIE,POSS,SEASON
0,A.C. Green,DAL,33,83,23,60,30.1,97.4,104.8,-7.4,...,18.4,13.5,10.3,48.5,52.3,11.8,90.46,9.8,4699,1996-97
1,Aaron McKie,DET,24,83,48,35,19.6,101.5,97.8,3.7,...,11.3,7.1,13.5,46.7,52.4,14.2,90.88,9.5,3084,1996-97
2,Michael Finley,DAL,24,83,23,60,33.6,99.2,104.7,-5.4,...,10.6,6.8,10.5,49.1,53.0,21.8,91.53,10.9,5306,1996-97
3,Antoine Carr,UTA,35,82,64,18,17.8,106.5,103.2,3.3,...,9.8,7.4,10.4,48.3,52.2,20.7,90.43,7.1,2755,1996-97
4,Antoine Walker,BOS,20,82,15,67,36.2,103.0,111.4,-8.4,...,16.7,12.7,11.4,44.5,47.4,24.9,97.85,9.4,6037,1996-97


In [9]:
# Write the DataFrame to the 'advanced_player_data' table through the
# SQLAlchemy engine; if_exists='replace' replaces the table when it already exists.
df_advanced.to_sql('advanced_player_data', con = engine, if_exists = 'replace')

846