In [49]:
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

from scraping_utils import years, make_http_req

In [50]:
teams_stats_url_template = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'

# Download each season's standings page once and cache it on disk; seasons that
# already have a cached page are skipped so re-running the cell does not
# request them again.
for year in years:
    file_path = Path(f'../data/yearly_team_data/{year}.html')

    # Only a non-empty file counts as cached. An empty file can be left behind
    # by an interrupted earlier run and must be fetched again rather than
    # silently treated as valid data.
    if file_path.is_file() and file_path.stat().st_size > 0:
        continue

    # Fetch BEFORE touching the file system: if the request raises, nothing is
    # written, so the season is retried on the next run instead of being
    # skipped forever because a placeholder file exists.
    url = teams_stats_url_template.format(year)
    response_html = make_http_req(url).text

    file_path.write_text(str(response_html), encoding='utf-8')

In [88]:
def clean_standings_tables(page_html : str):
    """
    Extract the Eastern and Western Conference standings tables from a page.

    Parses the page, looks up the elements with ids ``divs_standings_E`` and
    ``divs_standings_W``, and removes every ``<tr class="thead">`` row inside
    them (those rows hold division names rather than team records).

    :param page_html: HTML content of the web page.
    :return: a 2-tuple of beautiful soup objects corresponding to the Eastern & Western conf.
    standings table, respectively.
    :raises ValueError: if either standings table cannot be found in the page.
    """
    soup = BeautifulSoup(page_html, 'html.parser')

    cleaned_tables = []
    for table_id in ('divs_standings_E', 'divs_standings_W'):
        table = soup.find(id=table_id)
        if table is None:
            # ``find`` returns None when nothing matches; fail with a message that
            # names the missing table instead of an AttributeError on NoneType.
            raise ValueError(
                f"Standings table with id '{table_id}' was not found in the page HTML"
            )

        # Remove all table rows with the class thead - they contain unnecessary
        # Division name data
        for tr in table.find_all('tr', class_='thead'):
            tr.decompose()

        cleaned_tables.append(table)

    east_standings_table, west_standings_table = cleaned_tables
    return east_standings_table, west_standings_table

In [89]:
def _standings_table_to_df(standings_table, conference_col : str, year) -> pd.DataFrame:
    """
    Convert one cleaned conference standings table into a DataFrame.

    :param standings_table: table object whose ``str()`` is the table's HTML.
    :param conference_col: name of the column that holds the team names
        (the table labels it with the conference name).
    :param year: value stored in the added ``Year`` column.
    :return: the parsed table with ``Year`` and ``Team`` appended as the last two
        columns and the conference-named column removed.
    """
    # Wrap the raw HTML in StringIO before parsing for error-free pandas reading
    standings_df = pd.read_html(StringIO(str(standings_table)))[0]
    standings_df['Year'] = year
    # The team names sit under the conference-named column; move them to 'Team'.
    standings_df['Team'] = standings_df.pop(conference_col)
    return standings_df


dfs = []

for year in years:
    file_path = f'../data/yearly_team_data/{year}.html'
    # Read the cached page, then release the file handle before the parsing work.
    with open(file_path, 'r', encoding='utf-8') as f:
        page_html = f.read()

    east_standings_table, west_standings_table = clean_standings_tables(page_html)

    east_standings_df = _standings_table_to_df(east_standings_table, 'Eastern Conference', year)
    west_standings_df = _standings_table_to_df(west_standings_table, 'Western Conference', year)

    dfs.extend([east_standings_df, west_standings_df])


# Each per-conference frame carries its own 0..k index; ignore_index gives the
# combined frame one unique, continuous row index instead of repeated labels.
combined_yearly_team_df : pd.DataFrame = pd.concat(dfs, ignore_index=True)


In [95]:
# Persist the combined standings. The DataFrame index is written as the first,
# unnamed CSV column (pandas' default for to_csv).
# NOTE(review): the file name hard-codes 1991-2022 - confirm it matches `years`.
combined_yearly_team_df.to_csv('../data/team_data_1991-2022.csv')

# Keep the preview as the LAST expression of the cell: a notebook only renders
# the final expression, so placed before to_csv its result was discarded.
combined_yearly_team_df.tail(35)