In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, Comment
import time
from tqdm import tqdm
import re

# Functions

In [2]:
def getSoupFromURL(url, timeout = 30):
    """Download `url` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to `requests.get`.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup
        The page parsed with the built-in 'html.parser'.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within `timeout`.
    """
    response = requests.get(url, timeout = timeout)
    # Fail here with the real HTTP status instead of parsing an error page and
    # reporting a confusing "table not found" further down the pipeline.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

def extractTable(soup, table_id):
    """Return the <table> element of `soup` whose id attribute is `table_id`.

    Raises ValueError when no such table is present in the parsed page.
    """
    match = soup.find('table', attrs = {'id' : table_id})
    if match is not None:
        return match
    raise ValueError(f"Could not find table with id {table_id}")

def tableHeader(table_soup):
    """Map each column's `data-stat` key to its `aria-label` text.

    Only the last <tr> inside the table's <thead> is read; every <th> in that
    row must carry both attributes.
    """
    header_rows = table_soup.find('thead').find_all('tr')
    bottom_row = header_rows[-1]
    labels = {}
    for cell in bottom_row.find_all('th'):
        stat = cell['data-stat']
        labels[stat] = cell['aria-label']
    return labels

def tableData(table_soup, use_labels = False):
    """Convert the <tbody> of a table into a DataFrame of raw cell text.

    Parameters
    ----------
    table_soup : bs4 element
        A <table> element containing a <thead> and a <tbody>.
    use_labels : bool, optional
        When True, rename the columns from their `data-stat` keys to the
        `aria-label` text returned by `tableHeader`.

    Returns
    -------
    pandas.DataFrame
        One row per <tr> that has no `class` attribute. Columns are the
        header's `data-stat` keys followed by `id`, which holds the first
        `data-append-csv` attribute found in the row (None if there is none).
        A cell that is absent from a row is stored as '' exactly like an empty
        cell, so every column always has one value per row.

    Raises
    ------
    KeyError
        If a body cell uses a `data-stat` key that is not in the header.
    """
    header = tableHeader(table_soup)
    # dict.fromkeys keeps order and avoids a duplicate column should the
    # header itself contain a stat named 'id'.
    columns = list(dict.fromkeys([*header, 'id']))
    records = []
    body = table_soup.find('tbody')
    for row in body.find_all('tr'):
        # Rows that carry any class attribute are skipped.
        if row.get('class') is not None:
            continue
        # Start from a fully populated record so a row with a missing cell or
        # no data-append-csv attribute cannot leave the columns misaligned.
        record = {stat : '' for stat in header}
        record['id'] = None
        for entry in row.find_all('th') + row.find_all('td'):
            stat = entry['data-stat']
            if stat not in record:
                raise KeyError(stat)
            record[stat] = entry.text
            csv_id = entry.get('data-append-csv')
            if csv_id and record['id'] is None:
                record['id'] = csv_id
        records.append(record)
    data = pd.DataFrame(records, columns = columns)
    if use_labels:
        data = data.rename(columns = header)
    return data

def scrapePage(url, table_id, use_labels = False, sleep = 3):
    """Fetch `url` and return the table with id `table_id` as a DataFrame.

    The function pauses for `sleep` seconds before the request is made.
    `use_labels` is forwarded to `tableData`.
    """
    time.sleep(sleep)
    table_soup = extractTable(getSoupFromURL(url), table_id)
    return tableData(table_soup, use_labels)

# Cleaning and Aggregating

In [6]:
def cleanSalary(salary):
    """Turn a dollar string such as '$1,234,567' into millions as a float.

    An empty string yields None.
    """
    if salary == '':
        return None
    digits = salary.replace(',', '').replace('$', '')
    return int(digits) / 1000000

def cleanSalaryTable(salary_table):
    """Apply `cleanSalary` to every column except 'Rk', 'Player', 'Tm' and 'id'.

    The frame is modified in place and the same object is returned.
    """
    text_columns = ('Rk', 'Player', 'Tm', 'id')
    money_columns = [name for name in salary_table.columns if name not in text_columns]
    for name in money_columns:
        salary_table[name] = salary_table[name].apply(cleanSalary)
    return salary_table

def gatherSalary():
    """Scrape the 'player-contracts' table and return it with cleaned salaries.

    Columns use the header labels (`use_labels = True`), and the scraped frame
    is passed through `cleanSalaryTable`.
    """
    raw_contracts = scrapePage(
        "https://www.basketball-reference.com/contracts/players.html",
        'player-contracts',
        use_labels = True,
    )
    return cleanSalaryTable(raw_contracts)

In [7]:
def cleanAdvanced(adv_table):
    """Convert the text columns of an advanced-stats table to numeric dtypes.

    Every column except 'name_display', 'team_name_abbr', 'pos', 'awards' and
    'id' is converted:

    * float32 when any value contains a decimal point;
    * int32 when all values are whole numbers and none is blank;
    * nullable 'Int32' when values are whole numbers but some are blank.

    Blank cells ('' or missing) become NaN / <NA>. Columns that are already
    numeric are left untouched, so the function can be re-run on its own
    output.

    Parameters
    ----------
    adv_table : pandas.DataFrame
        Table of raw cell text. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `adv_table` object, with converted columns.

    Raises
    ------
    ValueError
        If a column to convert holds text that is not a number.
    """
    non_numerics = ['name_display', 'team_name_abbr', 'pos', 'awards', 'id']
    for col in adv_table.columns:
        if col in non_numerics: continue
        raw = adv_table[col]
        if pd.api.types.is_numeric_dtype(raw):
            continue
        blank = raw.isna() | (raw == '')
        # Literal match: avoids the invalid "\." escape and the regex engine.
        has_decimal = raw.str.contains('.', regex = False, na = False).any()
        # mask() sets blanks to NaN explicitly; Series.replace('', None) is
        # version dependent and forward-fills on older pandas releases.
        numeric = pd.to_numeric(raw.mask(blank))
        if has_decimal:
            adv_table[col] = numeric.astype(np.float32)
        elif blank.any():
            # Plain int32 cannot hold missing values; use the nullable dtype.
            adv_table[col] = numeric.astype('Int32')
        else:
            adv_table[col] = numeric.astype(np.int32)
    return adv_table

def gatherAdvanced(start_yr, end_yr):
    """Scrape and clean the 'advanced' table for each year in [start_yr, end_yr].

    Each year's frame gets a `year` column, and the frames are concatenated
    row-wise in year order. A tqdm progress bar tracks the loop.
    """
    seasons = range(start_yr, end_yr + 1)
    yearly_frames = []
    for yr in tqdm(seasons):
        url = f"https://www.basketball-reference.com/leagues/NBA_{yr}_advanced.html"
        season_df = cleanAdvanced(scrapePage(url, "advanced"))
        season_df['year'] = yr
        yearly_frames.append(season_df)
    return pd.concat(yearly_frames, axis = 0)

# Collect

In [8]:
# Scrape the advanced-stats table for each year from 2018 through 2025
# (inclusive), then the player contracts table. Each page request goes through
# scrapePage, which sleeps 3 seconds by default before fetching.
advanced = gatherAdvanced(2018, 2025)
salary = gatherSalary()

100%|█████████████████████████████████████████████| 8/8 [00:35<00:00,  4.39s/it]


In [9]:
# Write both tables to CSV without the DataFrame index column.
# NOTE(review): assumes a `data/` directory already exists relative to the
# working directory — confirm, or create it before running this cell.
advanced.to_csv('data/advanced.csv', index = False)
salary.to_csv('data/salary.csv', index = False)