In [2]:
import re
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [3]:
def get_html(url, label, clase, id='all_schedule', timeout=30):
    """Download a page and return the matching elements rendered as a string.

    The page title is printed as a side effect.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    label : str
        Tag name to search for. When it is the empty string, the search is
        done by element id instead.
    clase : str
        CSS class the ``label`` tags must carry (unused when ``label`` is '').
    id : str, optional
        Element id used when ``label`` is ''. The name shadows the builtin
        but is kept so existing keyword callers keep working.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    tuple of (str, str)
        ``str()`` of the list of matching elements, and the page title
        ('' when the page has no ``<title>`` element).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Without a timeout a stalled connection would hang the notebook forever.
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing the error page and
    # returning an empty match list.
    page.raise_for_status()
    soup = bs(page.content, 'html.parser')

    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else ''
    print(title)

    if label == '':
        html = soup.find_all(id=id)
    else:
        html = soup.find_all(label, class_=clase)

    return str(html), title

In [4]:
def scrape_season(season):
    """Return absolute URLs for every link inside a season's ``div.filter`` blocks.

    ``season`` is interpolated into the league schedule URL, the page is
    fetched through ``get_html`` and each ``<a href=...>`` found in the
    returned markup is prefixed with the site root.
    """
    schedule_url = f'https://www.basketball-reference.com/leagues/NBA_{season}_games.html'
    # get_html returns (markup, title); only the markup is needed here.
    filter_markup = get_html(schedule_url, 'div', 'filter')[0]
    anchors = bs(filter_markup, 'lxml').find_all('a', href=True)

    standing_pages = []
    for anchor in anchors:
        path = anchor['href']
        standing_pages.append(f'https://www.basketball-reference.com{path}')

    return standing_pages

In [5]:
def box_scores(stand_page: str, timeout: float = 30):
    """Return the absolute box-score URLs linked from a schedule page.

    Parameters
    ----------
    stand_page : str
        URL of the page to scan for links.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of str
        One URL per anchor whose ``href`` contains ``'boxscores/'`` and ends
        with ``'html'``, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(stand_page, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly: with none given BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = bs(response.text, 'html.parser')

    box_score_urls = []
    # href=True skips anchors that carry no href attribute at all.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if 'boxscores/' in href and href.endswith('html'):
            box_score_urls.append(f'https://www.basketball-reference.com{href}')
    return box_score_urls

In [6]:
def parse_html(box_score, timeout=30):
    """Fetch a page and return it parsed, with HTML comment openers removed.

    Parameters
    ----------
    box_score : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    BeautifulSoup
        The parsed document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(box_score, timeout=timeout)
    # Surface HTTP errors here instead of failing later with an obscure
    # "table not found" style error while parsing an error page.
    response.raise_for_status()
    # Every '<!--' is deleted before parsing, so markup that sat inside a
    # comment is seen by the parser (presumably some tables are delivered
    # commented out - confirm against the site). The closing '-->' is left.
    html_ = re.sub(r'<!--', '', response.content.decode())
    return bs(html_, 'html.parser')

In [7]:
def read_line_score(soup):
    """Extract the team / total columns of the line-score table.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page; must contain an element with id ``div_line_score``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``team`` and ``total``, taken from the last table found
        inside ``#div_line_score``.

    Raises
    ------
    ValueError
        If the page has no ``#div_line_score`` element.
    """
    container = soup.select_one('#div_line_score')
    if container is None:
        # str(None) would otherwise be handed to read_html as the text 'None'.
        raise ValueError("page has no '#div_line_score' element")

    # read_html expects a path or file-like object; passing literal HTML is
    # deprecated in recent pandas, so wrap the markup in StringIO.
    tables = pd.read_html(StringIO(str(container)))
    # Rename on the lower header level, then drop the upper level.
    table = tables[-1].rename(columns={'Unnamed: 0_level_1': 'team', 'T': 'total'})
    table.columns = table.columns.droplevel(0)
    return table[['team', 'total']]

In [8]:
def read_stats(soup, team, stat) -> pd.DataFrame:
    """Read one box-score table of a team into a DataFrame.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page containing the table.
    team : str
        Team code used in the table id.
    stat : str
        Table flavour used in the table id (the notebook passes 'basic' and
        'advanced').

    Returns
    -------
    pandas.DataFrame
        The table with id ``box-{team}-game-{stat}``, indexed by its first
        column, with the upper header level dropped, rows whose ``MP`` cell
        is the literal ``'MP'`` removed and NaN replaced by 0.
    """
    # read_html expects a path or file-like object; passing literal HTML is
    # deprecated in recent pandas, so wrap the markup in StringIO.
    stats = pd.read_html(
        StringIO(str(soup)),
        attrs={'id': f'box-{team}-game-{stat}'},
        index_col=0,
    )[0]
    stats.columns = stats.columns.droplevel(0)
    # A row whose MP value is the text 'MP' is a header repeated inside the body.
    stats = stats.drop(stats[stats['MP'] == 'MP'].index)
    # Same replacement as before, without mutating in place.
    stats = stats.replace(np.nan, 0)
    return stats

In [9]:
def read_season_info(soup):
    """Return the first run of digits found in the first team link of the bottom nav.

    Looks inside ``#bottom_nav_container`` for anchors whose ``href`` contains
    both ``'.html'`` and ``'teams'`` and extracts the digits from the first one.
    """
    bottom_nav = soup.select('#bottom_nav_container')[0]

    team_links = []
    for anchor in bottom_nav.find_all('a'):
        target = anchor['href']
        if '.html' in target and 'teams' in target:
            team_links.append(target)

    return re.search(r'\d+', team_links[0]).group()

In [10]:
# Scratch check: printing a range shows its repr, not the values it yields.
print(range(1999, 2000))

range(1999, 2000)


In [11]:
# Flatten the page URLs returned by scrape_season for each season in the range
# into a single list.
stand_pages = [
    month_url
    for year in range(2023, 2024)
    for month_url in scrape_season(year)
]

2022-23 NBA Schedule | Basketball-Reference.com


In [22]:
# Rebinds `stand_pages` to the schedule pages of the 1998 season only; the
# cells below work on this list.
stand_pages = scrape_season(1998)

1997-98 NBA Schedule | Basketball-Reference.com


In [23]:
# Visit every schedule page and accumulate the box-score URLs it links to.
box_scores_list = []
# Use a real name for the loop variable: `_` conventionally means "unused" and,
# in IPython, assigning to it clobbers the last-output variable.
for stand_page in stand_pages:
    print(stand_page)
    box_scores_list.extend(box_scores(stand_page))
    # Pause between requests (presumably to stay under the site's rate limit).
    time.sleep(3)
len(box_scores_list)

https://www.basketball-reference.com/leagues/NBA_1998_games-october.html
https://www.basketball-reference.com/leagues/NBA_1998_games-november.html
https://www.basketball-reference.com/leagues/NBA_1998_games-december.html
https://www.basketball-reference.com/leagues/NBA_1998_games-january.html
https://www.basketball-reference.com/leagues/NBA_1998_games-february.html
https://www.basketball-reference.com/leagues/NBA_1998_games-march.html
https://www.basketball-reference.com/leagues/NBA_1998_games-april.html
https://www.basketball-reference.com/leagues/NBA_1998_games-may.html
https://www.basketball-reference.com/leagues/NBA_1998_games-june.html


1260

In [24]:
# Build one two-row frame per box score (one row per team, with the other
# team's values mirrored into *_opp columns) and collect the frames in `games`.
base_cols = None
games = []
for i, box_score in enumerate(box_scores_list, start=1):
    start = time.time()
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = list(line_score['team'])

    summaries = []
    for team in teams:
        basic = read_stats(soup, team, 'basic')
        advanced = read_stats(soup, team, 'advanced')

        # The last row of each table is used as the team totals.
        totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
        totals.index = totals.index.str.lower()

        # Column-wise maximum over every row except the last one.
        # NOTE(review): if read_stats returns text (object) columns,
        # select_dtypes(include='number') selects nothing and no *_max
        # values are produced - confirm the dtypes.
        maxes = pd.concat([
            basic.select_dtypes(include='number').iloc[:-1].max(),
            advanced.select_dtypes(include='number').iloc[:-1].max(skipna=False),
        ])
        maxes.index = maxes.index.str.lower() + '_max'

        summary = pd.concat([totals, maxes])

        if base_cols is None:
            # Freeze the column set from the first team processed, leaving out
            # anything that mentions 'bpm'. The labels are already unique and
            # keep their first-seen order, so the column order is the same on
            # every run (a set round-trip here would make it hash-dependent).
            base_cols = [
                col
                for col in summary.index.drop_duplicates(keep='first')
                if 'bpm' not in col
            ]

        try:
            summary = summary[base_cols]
        except KeyError:
            # Best-effort fallback kept from the original: when this game
            # lacks one of the frozen columns, the unfiltered summary is used.
            pass
        if summary.index.duplicated().any():
            summary = summary[~summary.index.duplicated()]
        summaries.append(summary)

    summary = pd.concat(summaries, axis=1).T
    game = pd.concat([summary, line_score], axis=1)
    # Flags the second line-score row as the home team (assumes the visiting
    # team is listed first - TODO confirm).
    game['home'] = [0, 1]
    # Reverse the two rows so each team lines up with its opponent.
    # reset_index() without drop=True also carries the old index over as an
    # 'index' column, which ends up as 'index_opp'.
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += '_opp'

    full_game = pd.concat([game, game_opp], axis=1)
    full_game['season'] = read_season_info(soup)

    # The first eight characters of the page's file name are parsed as YYYYMMDD.
    full_game['date'] = (box_score.split('/')[-1].split('.')[0])[:8]
    full_game['date'] = pd.to_datetime(full_game['date'], format='%Y%m%d')
    full_game['won'] = full_game['total'] > full_game['total_opp']

    games.append(full_game)

    # Progress output: six games per line, with a running counter at line end.
    if len(games) % 6 == 0:
        print(teams, end=f' {str(i).zfill(4)}/{len(box_scores_list)}\n')
    else:
        print(teams, end=' ')

    # Throttle: an iteration that finished in under two seconds waits two more.
    duration = time.time() - start
    if duration < 2:
        time.sleep(2)

['CHI', 'BOS'] ['NYK', 'CHH'] ['SAS', 'DEN'] ['WAS', 'DET'] ['CLE', 'HOU'] ['UTA', 'LAL'] 00006/1260
['TOR', 'MIA'] ['GSW', 'MIN'] ['IND', 'NJN'] ['ATL', 'ORL'] ['MIL', 'PHI'] ['LAC', 'PHO'] 00012/1260
['SEA', 'POR'] ['DAL', 'VAN'] ['TOR', 'ATL'] ['PHI', 'CHI'] ['GSW', 'IND'] ['POR', 'LAC'] 00018/1260
['NJN', 'MIL'] ['CHH', 'MIN'] ['CLE', 'SAS'] ['DAL', 'SEA'] ['DEN', 'UTA'] ['SAC', 'VAN'] 00024/1260
['MIA', 'WAS'] ['ORL', 'BOS'] ['DET', 'NYK'] ['HOU', 'SAC'] ['SAS', 'CHI'] ['CHH', 'MIA'] 00030/1260
['WAS', 'UTA'] ['DET', 'ATL'] ['IND', 'CLE'] ['VAN', 'DAL'] ['WAS', 'DEN'] ['ORL', 'MIL'] 00036/1260
['BOS', 'NYK'] ['UTA', 'PHO'] ['MIN', 'POR'] ['LAL', 'SAC'] ['HOU', 'SEA'] ['GSW', 'TOR'] 00042/1260
['MIA', 'BOS'] ['DAL', 'CHH'] ['ORL', 'CHI'] ['IND', 'DET'] ['HOU', 'LAC'] ['GSW', 'NJN'] 00048/1260
['ATL', 'PHI'] ['VAN', 'SAS'] ['PHI', 'MIL'] ['NYK', 'PHO'] ['SEA', 'TOR'] ['CHI', 'ATL'] 00054/1260
['CLE', 'BOS'] ['UTA', 'DEN'] ['ORL', 'DET'] ['POR', 'HOU'] ['SEA', 'IND'] ['NYK', 'LAL'] 0

In [None]:
# Inspect the frame built for the first scraped game.
games[0]

In [None]:
# Inspect the frame built for the second scraped game.
games[1]

In [25]:
# Stack the per-game frames into one table with a fresh 0..n-1 index.
games_df = pd.concat(games, ignore_index=True)

In [26]:
# Persist the scraped games; the index is written as the first CSV column.
games_df.to_csv('NBA_data/nba_games_1998.csv')

In [19]:
# Load the per-season game files for 1999 through 2022 with one comprehension
# instead of 24 copy-pasted read_csv calls. The first CSV column is used as
# the index.
season_frames = [
    pd.read_csv(f'NBA_data//nba_games_{year}.csv', index_col=0)
    for year in range(1999, 2023)
]

# Keep the original per-season names available for the cells below; the
# unpacking also fails fast if the number of frames ever stops matching.
(df_99, df_00, df_01, df_02, df_03, df_04, df_05, df_06,
 df_07, df_08, df_09, df_10, df_11, df_12, df_13, df_14,
 df_15, df_16, df_17, df_18, df_19, df_20, df_21, df_22) = season_frames

In [20]:
# Stack every season, oldest first, into one table with a fresh index.
df = pd.concat(
    [
        df_99, df_00, df_01, df_02, df_03, df_04, df_05, df_06,
        df_07, df_08, df_09, df_10, df_11, df_12, df_13, df_14,
        df_15, df_16, df_17, df_18, df_19, df_20, df_21, df_22,
    ],
    ignore_index=True,
)

In [21]:
# Write the combined multi-season table; the index is written as the first CSV column.
df.to_csv('NBA_data/nba_games_1999_2022.csv')