In [2]:
import pandas as pd
import os
from bs4 import BeautifulSoup
from io import StringIO
from lists import team_mapping

In [3]:
SCORES_DIR = 'data/scores'
box_scores = os.listdir(SCORES_DIR)

In [36]:
SCORES_DIR = 'data/scores'
box_scores = os.listdir(SCORES_DIR)

def parse_html(box_score):
    file_path = os.path.join(SCORES_DIR, box_score)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html = f.read()
    except UnicodeDecodeError:
        print(f"Skipping file {file_path} due to encoding error.")
        return None
    
    soup = BeautifulSoup(html, 'html.parser')
    [s.decompose() for s in soup.select("tr.over_header")]
    [s.decompose() for s in soup.select("tr.thead")]
    return soup


def lsc(soup):
    ts = []
    scs = []
    teams_table = soup.find(class_ = 'scorebox')
    if teams_table:
        sub_table = teams_table.find_all('strong')
        for teams in sub_table:
            team = teams.find_all('a')
            for a in team:
                cleaned_team = a.text.strip()
                abbrev = team_mapping.get(cleaned_team)                  
                ts.append(abbrev)
                break
    scores_table = soup.find_all(class_ = 'score')
    for scores in scores_table:
        score = scores.text.strip()
        scs.append(score)

    df = pd.DataFrame(list(zip(ts, scs)), columns=['team', 'score'])
    return df

def read_stats(soup, team, stat):
    html_str = str(soup)
    html_io = StringIO(html_str)
    df = pd.read_html(html_io, attrs={'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
    df = df.apply(pd.to_numeric, errors="coerce")
    return df    

def season(soup):
    seas = []
    teams_table = soup.find(class_ = 'scorebox')
    if teams_table:
        sub_table = teams_table.find_all('strong')
        for teams in sub_table:
            team = teams.find_all('a')
            for a in team:
                href = a.get('href').split('/')[3]
                year = href.split('.')[0]
                seas.append(year)
        return seas

In [37]:
one = parse_html(box_scores[3])
two = lsc(one)
team = two['team'][0]
team
season(one)

['2024', '2024']

In [38]:
games = []
base_cols = None

for box_score in box_scores:
    soup = parse_html(box_score)
    if soup is None:
        continue 

    line_score = lsc(soup)
    if line_score.empty:
        print(f"No line score data for file {box_score}")
        continue

    teams = list(line_score['team'])
    summaries = []
    
    for team in teams:
        print(team)
        try:
            basic_stats = read_stats(soup, team, 'basic')
            advanced_stats = read_stats(soup, team, 'advanced')
            
            totals = pd.concat([basic_stats.iloc[-1, :], advanced_stats.iloc[-1, :]])
            totals.index = totals.index.str.lower()
                
            maxes = pd.concat([basic_stats.max(), advanced_stats.max()])
            maxes.index = maxes.index.str.lower() + "_MAX"

            summary = pd.concat([totals, maxes])
            print(summary)

            if base_cols is None:
                base_cols = list(summary.index.drop_duplicates(keep="first"))
                base_cols = [b for b in base_cols if "bpm" not in b]
            summary = summary[base_cols] 
            summaries.append(summary)
        except Exception as e:
            print(f"Error processing team {team}: {e}")

    if summaries:  # Check if summaries is not empty
        summary_df = pd.concat(summaries, axis=1).T
        game = pd.concat([summary_df, line_score], axis=1)
        game["home"] = [0, 1]
        game_opp = game.iloc[::-1].reset_index()
        game_opp.columns += "_OPP"

        full_game = pd.concat([game, game_opp], axis=1)
        full_game["date"] = os.path.basename(box_score)[:8]
        full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")
        full_game['season'] = season(soup)
        
        full_game["won"] = full_game["score"] > full_game["score_OPP"]
        games.append(full_game)
    else:
        print(f"No valid data for file {box_score}")


ORL
mp          240.000
fg           27.000
fga          86.000
fg%           0.314
3p            6.000
             ...   
tov%_MAX     33.900
usg%_MAX    100.000
ortg_MAX    106.000
drtg_MAX    106.000
bpm_MAX      14.400
Length: 74, dtype: float64
MEM
mp          240.000
fg           34.000
fga          99.000
fg%           0.343
3p           11.000
             ...   
tov%_MAX     16.800
usg%_MAX    100.000
ortg_MAX    157.000
drtg_MAX     90.000
bpm_MAX      14.900
Length: 74, dtype: float64
BOS
mp          240.000
fg           43.000
fga          83.000
fg%           0.518
3p           15.000
             ...   
tov%_MAX     25.800
usg%_MAX    100.000
ortg_MAX    236.000
drtg_MAX    122.000
bpm_MAX      15.900
Length: 74, dtype: float64
BRK
mp          240.000
fg           39.000
fga          79.000
fg%           0.494
3p           15.000
             ...   
tov%_MAX     50.000
usg%_MAX    100.000
ortg_MAX    209.000
drtg_MAX    129.000
bpm_MAX      24.800
Length: 74, dtype: floa

In [39]:
games_df = pd.concat(games, ignore_index=True)
games_df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_MAX_OPP,usg%_MAX_OPP,ortg_MAX_OPP,drtg_MAX_OPP,team_OPP,score_OPP,home_OPP,date,season,won
0,240.0,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,...,16.8,100.0,157.0,90.0,MEM,92,1,2021-04-30,2021,False
1,240.0,240.0,34.0,99.0,0.343,11.0,39.0,0.282,13.0,16.0,...,33.9,100.0,106.0,106.0,ORL,75,0,2021-04-30,2021,True
2,240.0,240.0,43.0,83.0,0.518,15.0,37.0,0.405,17.0,23.0,...,50.0,100.0,209.0,129.0,BRK,110,1,2024-02-13,2024,True
3,240.0,240.0,39.0,79.0,0.494,15.0,33.0,0.455,17.0,26.0,...,25.8,100.0,236.0,122.0,BOS,118,0,2024-02-13,2024,False
4,240.0,240.0,41.0,84.0,0.488,12.0,38.0,0.316,23.0,25.0,...,30.8,100.0,165.0,122.0,ATL,109,1,2023-11-11,2024,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9335,240.0,240.0,43.0,92.0,0.467,12.0,34.0,0.353,12.0,19.0,...,51.5,100.0,200.0,112.0,MIN,117,0,2023-11-10,2024,False
9336,240.0,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,...,27.7,100.0,150.0,126.0,MIA,106,1,2020-09-19,2020,True
9337,240.0,240.0,33.0,85.0,0.388,12.0,44.0,0.273,28.0,34.0,...,51.5,100.0,141.0,114.0,BOS,117,0,2020-09-19,2020,False
9338,240.0,240.0,42.0,82.0,0.512,12.0,27.0,0.444,12.0,19.0,...,53.2,100.0,146.0,126.0,TOR,121,1,2023-11-24,2024,False


In [40]:
games_df.drop(['gmsc', '+/-', 'gmsc_OPP', '+/-_OPP'], axis=1, inplace=True)

In [41]:
games_df.to_csv("nba_games.csv")