In [1]:
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from tqdm import tqdm

# Functions

In [2]:
def getSoupFromURL(url, sleep = 3, timeout = 30):
    """Fetch a page and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Address of the page to download.
    sleep : int or float, default 3
        Seconds to pause *before* issuing the request, so repeated calls
        are spaced out.
    timeout : int, float or tuple, default 30
        Passed to ``requests.get``; without it a stalled connection would
        block the notebook indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status, instead of silently
        parsing the error page as if it were data.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    time.sleep(sleep)
    # A desktop Firefox User-Agent is sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0"}
    response = requests.get(url, headers = headers, timeout = timeout)
    # Fail fast on HTTP errors rather than returning a soup with no tables.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def hClean(text):
    """Normalise one raw header string into a column name.

    The substitutions run in this order:
      1. drop every newline,
      2. drop each parenthesised group together with the whitespace
         directly in front of it,
      3. drop everything from the first ``$`` to the end of the string.

    A header that ends up empty is returned as ``"Placeholder"``.
    """
    for pattern in (r'\n', r"\s*\([^)]*\)", r"\$.*"):
        text = re.sub(pattern, "", text)
    return text if text != "" else "Placeholder"

def cleanHeader(header):
    """Return a new list with ``hClean`` applied to every entry of ``header``.

    The order of the entries is preserved and ``header`` itself is not
    modified.
    """
    return list(map(hClean, header))

def cleanSalary(salary):
    """Convert a dollar string such as ``'$1,250,000'`` to millions (``1.25``).

    Parameters
    ----------
    salary : str or None
        Cell text. ``$`` signs, thousands separators and surrounding
        whitespace are ignored.

    Returns
    -------
    float or None
        The amount divided by 1,000,000, or ``None`` when the cell is
        ``None``, empty, or not a number (e.g. ``'-'`` or ``'TBD'``).
        Returning ``None`` keeps one odd cell from aborting the conversion
        of an entire column.
    """
    if salary is None:
        return None
    digits = salary.strip().replace('$', '').replace(',', '')
    if digits == '':
        return None
    try:
        # Whole-dollar amounts are the normal case; keep exact int parsing.
        amount = int(digits)
    except ValueError:
        try:
            # Fall back for amounts that carry cents.
            amount = float(digits)
        except ValueError:
            return None
    return amount / 1000000

def cleanTable(data):
    """Convert the money and numeric columns of a scraped table in place.

    * Every text column in which at least one cell contains ``$`` is passed
      cell-by-cell through ``cleanSalary``.
    * The columns ``'Yr'``, ``'Age'`` and ``'YOE'`` (when present) have empty
      strings replaced by ``'0'`` and are cast to ``float32``.

    Columns that are already numeric are skipped by the ``$`` scan, so the
    function can safely be called again on its own output.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient chaining.

    Raises
    ------
    ValueError
        If one of the numeric columns holds text that cannot be parsed as
        a number.
    """
    for col in data.columns:
        # The .str accessor raises on numeric columns; they cannot hold '$'.
        if pd.api.types.is_numeric_dtype(data[col]):
            continue
        # Literal match: no regex escape needed, missing cells count as False.
        if data[col].str.contains('$', regex = False, na = False).any():
            data[col] = data[col].apply(cleanSalary)
    num_cols = ['Yr', 'Age', 'YOE']
    for col in num_cols:
        if col not in data.columns: continue
        data[col] = data[col].replace('', '0')
        data[col] = data[col].astype(np.float32)
    return data

def getTable(table):
    """Turn one parsed HTML ``<table>`` element into a DataFrame of strings.

    Column names are the ``<th>`` texts inside ``<thead>``, cleaned with
    ``cleanHeader``. Each ``<tr>`` inside ``<tbody>`` that has at least one
    ``<td>`` becomes a row; cell text is whitespace-stripped.

    Compared with a plain name-keyed build, this guards three cases that
    would otherwise raise:

    * Cleaned header names that repeat (every blank header cleans to the
      same name) get ``_2``, ``_3``, ... suffixes so each stays a
      separate column.
    * Rows with fewer cells than columns are right-padded with ``''``;
      cells beyond the last column are dropped.
    * A table with no ``<thead>`` or no header cells yields an empty
      DataFrame.

    Parameters
    ----------
    table : bs4.element.Tag
        The ``<table>`` element to convert.

    Returns
    -------
    pandas.DataFrame
        One column per header cell, one row per data row.
    """
    thead = table.find('thead')
    if thead is None:
        return pd.DataFrame()
    header = cleanHeader([h.text for h in thead.find_all('th')])
    if not header:
        return pd.DataFrame()

    # Make column names unique while leaving first occurrences untouched.
    columns = []
    taken = set()
    for name in header:
        candidate = name
        suffix = 1
        while candidate in taken:
            suffix += 1
            candidate = f"{name}_{suffix}"
        taken.add(candidate)
        columns.append(candidate)

    width = len(columns)
    tbody = table.find('tbody')
    body_rows = tbody.find_all('tr') if tbody is not None else []
    records = []
    for row in body_rows:
        cells = [d.text.strip() for d in row.find_all('td')]
        if not cells:
            # Rows without <td> cells contribute no data.
            continue
        # Force every record to exactly `width` cells.
        cells = cells[:width] + [''] * (width - len(cells))
        records.append(cells)
    return pd.DataFrame(records, columns = columns)

def scrapeTables(url):
    """Download ``url`` and return every HTML table on it as a cleaned DataFrame.

    Each ``<table>`` element is converted with ``getTable`` and then passed
    through ``cleanTable``; the resulting list follows document order.
    """
    soup = getSoupFromURL(url)
    return [cleanTable(getTable(node)) for node in soup.find_all('table')]

In [3]:
# Spotrac NBA free-agents page for 2025; scrapeTables returns one cleaned
# DataFrame per <table> element found on the page, in document order.
url = "https://www.spotrac.com/nba/free-agents/_/year/2025"
tables = scrapeTables(url)

In [4]:
# Pick the two tables of interest by their position on the page.
# NOTE(review): assumes the first table lists free agents and the second the
# unsigned ones, as the variable names suggest — confirm against the live page.
fa_table = tables[0]
unsigned_table = tables[1]

In [5]:
# Write both tables to CSV without the index column. The output directory is
# created first so the export also works on a fresh checkout where `data/`
# does not exist yet (to_csv does not create missing directories).
os.makedirs('data', exist_ok = True)
fa_table.to_csv('data/free_agents25.csv', index = False)
unsigned_table.to_csv('data/unsigned25.csv', index = False)