# Load Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import re

# Scraping Functions

In [2]:
def getPerPossURL(season):
    return f"https://www.basketball-reference.com/leagues/NBA_{season}_per_poss.html"

def getAdvancedURL(season):
    return f"https://www.basketball-reference.com/leagues/NBA_{season}_advanced.html"

def getDraftURL(season):
    return f"https://www.basketball-reference.com/draft/NBA_{season}.html"

def getSoupFromURL(url):
    """
    Get soup from given url
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

def getHeaders(table):
    lst = table.find('thead').find_all('th')
    return [x['data-stat'] for x in lst]

def getData(table):
    body = table.find('tbody')
    data_rows = body.find_all('tr', attrs = {'class' : 'full_table'})
    labs = [x['data-stat'] for x in data_rows[0].find_all('td')]
    labs.append('player_id')
    dct = {l : [] for l in labs}
    for row in data_rows:
        d = row.find_all('td')
        for entry in d:
            key = entry['data-stat']
            val = entry.text
            dct[key].append(val)
            if key == 'player':
                dct['player_id'].append(entry['data-append-csv'])
    return dct

def getDataDraft(table):
    body = table.find('tbody')
    data_rows = body.find_all('tr')
    labs = [x['data-stat'] for x in data_rows[0].find_all('td')]
    labs.append('player_id')
    dct = {l : [] for l in labs}
    for row in data_rows:
        d = row.find_all('td')
        if len(d) == 1:
            break
        for entry in d:
            key = entry['data-stat']
            val = entry.text
            dct[key].append(val)
            if key == 'player':
                ref = entry.find('a')['href']
                pid = re.match(r"/players/[a-z]/([a-z0-9]*)\.html", ref).group(1)
                dct['player_id'].append(pid)
    return dct

def cleanTable(table):
    char_cols = ['player', 'pos', 'team_id', 'player_id', 'college_name']
    int_cols = ['g', 'gs', 'mp', 'season', 'pick_overall']
    for col in table.columns:
        if col in char_cols:
            continue
        elif col in int_cols:
            table[col] = table[col].replace('', '0')
            table[col] = table[col].astype(int)
        else:
            table[col] = table[col].replace('', '0')
            table[col] = table[col].astype(float)
    return table

In [3]:
def scrapeTable(season, url_funct, data_funct):
    url = url_funct(season)
    s = getSoupFromURL(url)
    table = s.find('table')
    data = data_funct(table)
    for key in ['', 'DUMMY']:
        if key in data.keys():
            del data[key]
    df = pd.DataFrame(data)
    df['season'] = season
    df = cleanTable(df)
    return df

def scrapeManyTables(start, end, url_funct, data_funct):
    df_list = []
    for season in tqdm(range(start, end + 1)):
        df_list.append(scrapeTable(season, url_funct, data_funct))
        time.sleep(3)
    big_df = pd.concat(df_list)
    return big_df

# Actual Scraping

In [4]:
perposs_df = scrapeManyTables(1990, 2024, getPerPossURL, getData)
advanced_df = scrapeManyTables(1990, 2024, getAdvancedURL, getData)

100%|███████████████████████████████████████████| 35/35 [02:15<00:00,  3.86s/it]
100%|███████████████████████████████████████████| 35/35 [02:13<00:00,  3.81s/it]


In [5]:
draft_df = scrapeManyTables(1990, 2023, getDraftURL, getDataDraft)

100%|███████████████████████████████████████████| 34/34 [01:52<00:00,  3.31s/it]


In [6]:
perposs_df.head()

Unnamed: 0,player,pos,age,team_id,g,gs,mp,fg_per_poss,fga_per_poss,fg_pct,...,ast_per_poss,stl_per_poss,blk_per_poss,tov_per_poss,pf_per_poss,pts_per_poss,off_rtg,def_rtg,player_id,season
0,Mark Acres,C,27.0,ORL,80,50,1691,3.8,7.8,0.484,...,1.8,1.0,0.7,1.9,6.8,9.9,110.0,114.0,acresma01,1990
1,Michael Adams,PG,27.0,DEN,79,74,2690,6.7,16.7,0.402,...,8.4,2.0,0.1,2.4,2.3,20.7,114.0,109.0,adamsmi01,1990
2,Mark Aguirre,SF,30.0,DET,78,40,2005,11.1,22.8,0.488,...,3.7,0.9,0.5,3.1,5.1,27.9,111.0,106.0,aguirma01,1990
3,Danny Ainge,PG,30.0,SAC,75,68,2727,9.2,21.0,0.438,...,8.2,2.1,0.3,3.4,4.3,24.4,107.0,110.0,aingeda01,1990
4,Mark Alarie,PF,26.0,WSB,82,10,1893,9.5,20.0,0.473,...,3.6,1.5,1.0,2.6,5.6,21.9,106.0,110.0,alarima01,1990


In [7]:
advanced_df.head()

Unnamed: 0,player,pos,age,team_id,g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,...,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,player_id,season
0,Mark Acres,C,27.0,ORL,80,1691,8.3,0.536,0.014,0.421,...,1.1,0.6,1.6,0.047,-3.1,-0.6,-3.7,-0.7,acresma01,1990
1,Michael Adams,PG,27.0,DEN,79,2690,15.4,0.542,0.437,0.317,...,4.4,2.5,6.9,0.124,1.8,-0.1,1.8,2.6,adamsmi01,1990
2,Mark Aguirre,SF,30.0,DET,78,2005,15.8,0.544,0.104,0.283,...,3.1,2.5,5.7,0.136,1.0,0.0,1.0,1.5,aguirma01,1990
3,Danny Ainge,PG,30.0,SAC,75,2727,16.1,0.528,0.25,0.231,...,2.7,2.1,4.8,0.085,1.2,-0.1,1.1,2.1,aingeda01,1990
4,Mark Alarie,PF,26.0,WSB,82,1893,14.1,0.51,0.062,0.169,...,1.5,1.6,3.1,0.079,-0.6,-0.7,-1.3,0.3,alarima01,1990


In [8]:
draft_df.head()

Unnamed: 0,pick_overall,team_id,player,college_name,seasons,g,mp,pts,trb,ast,...,mp_per_g,pts_per_g,trb_per_g,ast_per_g,ws,ws_per_48,bpm,vorp,player_id,season
0,1,NJN,Derrick Coleman,Syracuse,15.0,781,25903,12884.0,7232.0,1985.0,...,33.2,16.5,9.3,2.5,64.3,0.119,1.4,22.3,colemde01,1990
1,2,SEA,Gary Payton,Oregon State,17.0,1335,47117,21813.0,5269.0,8966.0,...,35.3,16.3,3.9,6.7,145.5,0.148,3.3,62.5,paytoga01,1990
2,3,DEN,Mahmoud Abdul-Rauf,LSU,9.0,586,15628,8553.0,1087.0,2079.0,...,26.7,14.6,1.9,3.5,25.2,0.077,-0.8,4.5,abdulma02,1990
3,4,ORL,Dennis Scott,Georgia Tech,10.0,629,17983,8094.0,1774.0,1296.0,...,28.6,12.9,2.8,2.1,33.4,0.089,0.2,9.9,scottde01,1990
4,5,CHH,Kendall Gill,Illinois,15.0,966,29481,12914.0,4002.0,2945.0,...,30.5,13.4,4.1,3.0,47.8,0.078,0.1,15.8,gillke01,1990


In [9]:
print(perposs_df.shape)
print(advanced_df.shape)
print(draft_df.shape)

(16201, 32)
(16201, 28)
(1867, 23)


In [10]:
perposs_df.to_csv('data/perposs.csv', index = False)
advanced_df.to_csv('data/advanced.csv', index = False)
draft_df.to_csv('data/draft.csv', index = False)