In [1]:
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Advanced Stats Scraping

In [None]:
def getURL(year):
    """Build the basketball-reference league summary URL for one season.

    Args:
        year: Value interpolated into the ``NBA_{year}.html`` page name.

    Returns:
        str: The full URL of that season's league page.
    """
    season_page = f"https://www.basketball-reference.com/leagues/NBA_{year}.html"
    return season_page

def getSoup(year):
    """Download and parse the league summary page for one season.

    Args:
        year: Season identifier passed to ``getURL`` to build the address.

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (e.g. a rate-limit reply).
        requests.Timeout: If no response arrives within 30 seconds.
    """
    url = getURL(year)
    # Bound the wait so one stalled connection cannot hang the whole scrape.
    response = requests.get(url, timeout=30)
    # Fail here on an error page; otherwise the table lookup downstream
    # returns None and surfaces as an opaque AttributeError.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def headers(table):
    """Read the column labels from the second header row of a table.

    A label that has already been collected is stored again with an
    ``"Opp "`` prefix, so repeated stat names can be told apart.

    Args:
        table: Parsed table element exposing ``find`` / ``find_all``.

    Returns:
        list[str]: Labels in the order the ``<th>`` cells appear.
    """
    # Index 1 picks the second <tr> inside <thead>.
    header_row = table.find('thead').find_all('tr')[1]
    labels = []
    for cell in header_row.find_all('th'):
        name = cell.text
        labels.append(f"Opp {name}" if name in labels else name)
    return labels

def tableData(table):
    """Convert the body rows of a parsed table into a DataFrame of strings.

    Args:
        table: Parsed table element; its labels come from ``headers``.

    Returns:
        pandas.DataFrame: One column per label holding the cell text of
        every kept row.
    """
    labels = headers(table)
    # One accumulator list per label, in header order.
    columns = [[] for _ in labels]
    for row in table.find('tbody').find_all('tr'):
        # Rows that carry an id attribute are skipped.
        if row.get('id') is not None:
            continue
        # The leading <th> cell feeds the first column; the <td> cells feed
        # the columns after it, position by position.
        lead_text = row.find('th').text
        columns[0].append(lead_text)
        for position, cell in enumerate(row.find_all('td'), start=1):
            columns[position].append(cell.text)
    # When a label repeats, the values of its last occurrence are kept.
    return pd.DataFrame(dict(zip(labels, columns)))

def removeStar(text):
    """Return ``text`` with every asterisk character removed."""
    pieces = text.split("*")
    return "".join(pieces)

def cleanCols(table):
    """Drop unused columns and convert the stat columns to numbers.

    Args:
        table: DataFrame of strings containing the dropped columns and a
            ``Team`` column.

    Returns:
        pandas.DataFrame: A new frame without the dropped columns, with
        every column except ``Team`` passed through ``pd.to_numeric`` and
        asterisks removed from ``Team``.
    """
    unwanted = ['Rk', '\xa0', 'Opp \xa0', 'Arena', 'Attend.', 'Attend./G']
    trimmed = table.drop(columns = unwanted)
    for name in trimmed.columns.tolist():
        # Team is the only column kept as text.
        if name != 'Team':
            trimmed[name] = pd.to_numeric(trimmed[name])
    trimmed['Team'] = trimmed['Team'].apply(removeStar)
    return trimmed

def advancedTable(year):
    """Fetch one season's page and return its cleaned advanced team table.

    Args:
        year: Season identifier forwarded to ``getSoup``.

    Returns:
        pandas.DataFrame: The table with id ``advanced-team`` after
        ``tableData`` and ``cleanCols`` have been applied.
    """
    page = getSoup(year)
    raw_stats = tableData(page.find('table', {'id' : 'advanced-team'}))
    return cleanCols(raw_stats)

def gatherStats(start, end):
    """Collect the advanced team table for every season in a range.

    Args:
        start: First season to fetch.
        end: Last season to fetch (inclusive).

    Returns:
        pandas.DataFrame: All season tables concatenated, with a ``Season``
        column placed at position 1.
    """
    season_frames = []
    for season in tqdm(range(start, end + 1)):
        season_df = advancedTable(season)
        # Pause for two seconds after each page request.
        time.sleep(2)
        season_df['Season'] = season
        season_frames.append(season_df)
    combined = pd.concat(season_frames)
    # Move Season from the last position to index 1.
    combined.insert(1, 'Season', combined.pop('Season'))
    return combined

In [None]:
# Scrape the advanced team table for every season from 1980 through 2023
# (both inclusive); one page is requested per season.
# NOTE: `gatherStats` is redefined further down for the All-NBA scrape, so
# run this cell right after the definitions in the cell above.
advanced_df = gatherStats(1980, 2023)

In [None]:
# Persist the scraped advanced stats without the row index. The target
# folder is created first because DataFrame.to_csv does not create
# missing directories and would otherwise raise an OSError.
os.makedirs('DATA', exist_ok=True)
advanced_df.to_csv('DATA/advanced.csv', index = False)

# All-NBA Players Scraping

In [None]:
def getURL(year):
    """Build the basketball-reference awards page URL for one season.

    Args:
        year: Value interpolated into the ``awards_{year}.html`` page name.

    Returns:
        str: The full URL of that season's awards page.
    """
    awards_page = f"https://www.basketball-reference.com/awards/awards_{year}.html"
    return awards_page

def getSoup(year):
    """Download and parse the awards page for one season.

    Args:
        year: Season identifier passed to ``getURL`` to build the address.

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (e.g. a rate-limit reply).
        requests.Timeout: If no response arrives within 30 seconds.
    """
    url = getURL(year)
    # Bound the wait so one stalled connection cannot hang the whole scrape.
    response = requests.get(url, timeout=30)
    # Fail here on an error page; otherwise the table lookup downstream
    # returns None and surfaces as an opaque AttributeError.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def headers(table):
    """Read the column labels from the second header row of a table.

    Args:
        table: Parsed table element exposing ``find`` / ``find_all``.

    Returns:
        list[str]: Text of each ``<th>`` cell, in document order.
    """
    # Index 1 picks the second <tr> inside <thead>.
    label_row = table.find('thead').find_all('tr')[1]
    return [cell.text for cell in label_row.find_all('th')]

def tableData(table):
    """Convert the body rows of a parsed table into a DataFrame of strings.

    Args:
        table: Parsed table element; its labels come from ``headers``.

    Returns:
        pandas.DataFrame: One column per label holding the cell text of
        every kept row.
    """
    labels = headers(table)
    # One bucket of cell texts per label, in header order.
    buckets = [[] for _ in labels]
    body_rows = table.find('tbody').find_all('tr')
    for row in body_rows:
        # Only rows without an id attribute are read.
        if row.get('id') is None:
            first_cell = row.find('th').text
            buckets[0].append(first_cell)
            # The <td> cells fill the buckets after the first one.
            for slot, cell in enumerate(row.find_all('td'), start=1):
                buckets[slot].append(cell.text)
    # When a label repeats, the values of its last occurrence are kept.
    return pd.DataFrame(dict(zip(labels, buckets)))

def cleanCols(table):
    """Convert the stat columns to numbers and fill missing ``3P%`` with 0.

    The frame is modified in place and also returned.

    Args:
        table: DataFrame of strings that includes a ``3P%`` column.

    Returns:
        pandas.DataFrame: The same frame, with every column except
        ``# Tm``, ``Pos``, ``Player`` and ``Tm`` passed through
        ``pd.to_numeric``.
    """
    text_columns = {'# Tm', 'Pos', 'Player', 'Tm'}
    numeric_columns = [name for name in table.columns.tolist() if name not in text_columns]
    for name in numeric_columns:
        table[name] = pd.to_numeric(table[name])
    # Missing three-point percentages are stored as 0.
    table['3P%'] = table['3P%'].fillna(0)
    return table

def allNBATable(year):
    """Fetch one season's awards page and return its cleaned All-NBA table.

    Args:
        year: Season identifier forwarded to ``getSoup``.

    Returns:
        pandas.DataFrame: The table with id ``leading_all_nba`` after
        ``tableData`` and ``cleanCols`` have been applied.
    """
    page = getSoup(year)
    raw_selection = tableData(page.find('table', {'id' : 'leading_all_nba'}))
    return cleanCols(raw_selection)

def gatherStats(start, end):
    """Collect the All-NBA table for every season in a range.

    Args:
        start: First season to fetch.
        end: Last season to fetch (inclusive).

    Returns:
        pandas.DataFrame: All season tables concatenated, with a ``Season``
        column placed at position 3.
    """
    season_frames = []
    for season in tqdm(range(start, end + 1)):
        season_df = allNBATable(season)
        # Pause for two seconds after each page request.
        time.sleep(2)
        season_df['Season'] = season
        season_frames.append(season_df)
    combined = pd.concat(season_frames)
    # Move Season from the last position to index 3.
    combined.insert(3, 'Season', combined.pop('Season'))
    return combined

In [None]:
# Scrape the All-NBA table for every season from 1980 through 2023 (both
# inclusive); one page is requested per season.
# NOTE: this must use the `gatherStats` defined in the All-NBA cell above,
# which replaces the earlier advanced-stats function of the same name.
allnba = gatherStats(1980, 2023)

In [None]:
# Persist the scraped All-NBA data without the row index. The target
# folder is created first because DataFrame.to_csv does not create
# missing directories and would otherwise raise an OSError.
os.makedirs('DATA', exist_ok=True)
allnba.to_csv('DATA/allnba.csv', index = False)

# Playoff Scraping

In [None]:
def getURL(year):
    """Build the basketball-reference playoffs page URL for one season.

    Args:
        year: Value interpolated into the ``NBA_{year}.html`` page name.

    Returns:
        str: The full URL of that season's playoffs page.
    """
    playoffs_page = f"https://www.basketball-reference.com/playoffs/NBA_{year}.html"
    return playoffs_page

def getSoup(year):
    """Download and parse the playoffs page for one season.

    Args:
        year: Season identifier passed to ``getURL`` to build the address.

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (e.g. a rate-limit reply).
        requests.Timeout: If no response arrives within 30 seconds.
    """
    url = getURL(year)
    # Bound the wait so one stalled connection cannot hang the whole scrape.
    response = requests.get(url, timeout=30)
    # Fail here on an error page; otherwise the table lookup downstream
    # returns None and surfaces as an opaque AttributeError.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def deleteConf(text):
    """Strip the ``"Eastern "`` and ``"Western "`` prefixes from a round name.

    Args:
        text: Round label such as a conference series title.

    Returns:
        str: ``text`` with every occurrence of both substrings removed.
    """
    return text.replace("Eastern ", "").replace("Western ", "")

def resultsToDF(results, year):
    """Turn a team-to-result mapping into a three-column DataFrame.

    Args:
        results: Mapping of team name to playoff result label.
        year: Season value repeated on every row.

    Returns:
        pandas.DataFrame: Columns ``Team``, ``Season`` and ``Result``, one
        row per entry of ``results`` in iteration order.
    """
    pairs = list(results.items())
    teams = [team for team, _ in pairs]
    outcomes = [outcome for _, outcome in pairs]
    return pd.DataFrame({'Team' : teams,
                         'Season' : [year] * len(pairs),
                         'Result' : outcomes})

def playoffResults(year):
    """Scrape how each playoff team's run ended in one season.

    Only rows of the ``all_playoffs`` table that contain a ``<strong>``
    tag are read. In the second cell of such a row, the first link is taken
    as the series winner and the second link as the loser.

    Args:
        year: Season identifier forwarded to ``getSoup``.

    Returns:
        pandas.DataFrame: Output of ``resultsToDF``. The Finals winner is
        labelled ``Won Finals``, the Finals loser ``Lost Finals``, and every
        other series loser gets the round name with the conference prefix
        removed.
    """
    page = getSoup(year)
    series_rows = page.find('table', {'id' : 'all_playoffs'}).find('tbody').find_all('tr')
    outcomes = {}
    for row in series_rows:
        round_tag = row.find('strong')
        if round_tag is None:
            continue
        team_links = row.find_all('td')[1].find_all('a')
        round_name = round_tag.text
        if round_name == 'Finals':
            champion = team_links[0].text
            runner_up = team_links[1].text
            outcomes[champion] = 'Won Finals'
            outcomes[runner_up] = 'Lost Finals'
        else:
            # Only the loser is recorded for rounds before the Finals.
            eliminated = team_links[1].text
            outcomes[eliminated] = deleteConf(round_name)
    return resultsToDF(outcomes, year)

def allPlayoffs(start, end):
    """Collect playoff results for every season in a range.

    Args:
        start: First season to fetch.
        end: Last season to fetch (inclusive).

    Returns:
        pandas.DataFrame: The per-season result frames concatenated.
    """
    yearly_results = []
    for season in tqdm(range(start, end + 1)):
        yearly_results.append(playoffResults(season))
        # Pause for two seconds after each page request.
        time.sleep(2)
    return pd.concat(yearly_results)

In [None]:
# Scrape playoff results for every season from 1980 through 2023 (both
# inclusive); one page is requested per season.
playoff_df = allPlayoffs(1980, 2023)

In [None]:
# Persist the scraped playoff results without the row index. The target
# folder is created first because DataFrame.to_csv does not create
# missing directories and would otherwise raise an OSError.
os.makedirs('DATA', exist_ok=True)
playoff_df.to_csv('DATA/playoff_results.csv', index = False)