In [2]:
# Import Libraries
import os
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

In [3]:
# Directory Where The Saved Box Score HTML Files Are Stored
SCORE_DIR = "data/scores"

In [4]:
# Lists Every File Name In The Score Directory
# os.listdir Returns Entries In Arbitrary Order, So The Names Are Sorted To Make The Processing Order
# (And Everything Derived From The First File Processed) The Same On Every Run
# The File Names Start With The Game Date (YYYYMMDD), So Sorted Order Is Also Chronological
box_scores = sorted(os.listdir(SCORE_DIR))

In [5]:
# Builds The Full Path To Every .html Box Score
# Joining On os.path.basename Keeps This Cell Safe To Re-Run: Because It Rebinds box_scores From Itself,
# A Second Run Would Otherwise Prefix SCORE_DIR Twice (data/scores/data/scores/...)
box_scores = [
    os.path.join(SCORE_DIR, os.path.basename(f))
    for f in box_scores
    if f.endswith(".html")
]

In [6]:
# Function To Parse The HTML From A Single Box Score
def parse_html(box_score):
    """Load one saved box score page and strip its repeated table header rows.

    Parameters
    ----------
    box_score : str
        Path to the box score HTML file.

    Returns
    -------
    BeautifulSoup
        The parsed document with every ``tr.over_header`` and ``tr.thead``
        row removed, so the tables can be read without header rows mixed
        into the data.
    """
    # Takes In The Path To A Box Score File And Reads Its HTML
    with open(box_score) as f:
        html = f.read()

    # NOTE(review): No Parser Is Named, So Beautiful Soup Picks Whichever One Is Installed - Pin One If Results Must Match Across Machines
    soup = BeautifulSoup(html)

    # Removes The Grouping Header Row That Sits Above The Real Column Names
    for row in soup.select("tr.over_header"):
        row.decompose()

    # Removes The Header Rows Repeated In The Middle Of A Table
    # The Class Is "thead"; The Previous Selector "tr.thread" Was A Typo That Matched Nothing
    for row in soup.select("tr.thead"):
        row.decompose()

    return soup

In [7]:
# Function To Read The Box Score Line Score (Team Names And Final Totals)
def read_line_score(soup):
    """Return the team name and final score for each team in a box score.

    Parameters
    ----------
    soup : object
        Parsed page; it is converted with ``str()`` and handed to
        ``pd.read_html``, which picks the table whose id is ``line_score``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``team`` (the first column of the table) and ``total``
        (the last column of the table), one row per table row.
    """
    tables = pd.read_html(StringIO(str(soup)), attrs={'id': 'line_score'})
    line_score = tables[0]

    # Only The First And Last Columns Get Fixed Names
    renamed = line_score.columns.tolist()
    renamed[0], renamed[-1] = "team", "total"
    line_score.columns = renamed

    # The Per-Period Columns In Between Are Dropped Because Their Number Changes When A Game Goes To Overtime
    return line_score.loc[:, ["team", "total"]]

In [8]:
# Function To Read One Stats Table For One Team
def read_stats(soup, team, stat):
    """Read a single stats table from a box score as numeric data.

    Parameters
    ----------
    soup : object
        Parsed page; it is converted with ``str()`` and handed to
        ``pd.read_html``.
    team : str
        Team identifier used inside the table id.
    stat : str
        Table kind used inside the table id (the caller passes ``"basic"``
        and ``"advanced"``).

    Returns
    -------
    pandas.DataFrame
        The table whose id is ``box-{team}-game-{stat}``, indexed by its
        first column, with every column converted to numbers. Values that
        cannot be converted become NaN.
    """
    table_id = f'box-{team}-game-{stat}'
    tables = pd.read_html(StringIO(str(soup)), attrs={'id': table_id}, index_col=0)

    # errors="coerce" Turns Anything Non-Numeric Into NaN Instead Of Raising
    return tables[0].apply(pd.to_numeric, errors="coerce")

In [9]:
# Function To Determine The Season A Game Belongs To
def read_season_info(soup):
    """Extract the season label from the page's bottom navigation links.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``select``; the first element matching
        ``#bottom_nav_container`` must expose ``find_all``.

    Returns
    -------
    str
        The file name of the second link in the container (index 1), cut
        off at its first underscore.

    Raises
    ------
    IndexError
        If the container is missing or holds fewer than two links.
    """
    nav_container = soup.select("#bottom_nav_container")[0]

    # Every Link Target In The Container, In Document Order
    links = [anchor["href"] for anchor in nav_container.find_all("a")]

    # Keeps The Part Of The Second Link's File Name That Comes Before The First Underscore
    return os.path.basename(links[1]).partition("_")[0]

In [10]:
base_cols = None # Stat Columns Kept For Every Game, Fixed By The First Team Summary That Gets Built
games = [] # One Two-Row Dataframe Per Box Score (One Row Per Team)

for box_score in box_scores:

    # Parses The Page Once, Then Pulls Out The Team Names And Final Totals
    soup = parse_html(box_score)
    line_score = read_line_score(soup)

    # Builds One Summary Series Per Team, In Line Score Order
    summaries = []
    for team in line_score["team"]:

        basic_stats = read_stats(soup, team, "basic")
        advanced_stats = read_stats(soup, team, "advanced")

        # The Last Row Of Each Table Holds The Team Totals
        team_totals = pd.concat([basic_stats.iloc[-1], advanced_stats.iloc[-1]])
        team_totals.index = team_totals.index.str.lower()

        # Every Other Row Is A Player, So The Column-Wise Max Is The Best Individual Value For Each Stat
        player_maxes = pd.concat([table.iloc[:-1].max() for table in (basic_stats, advanced_stats)])
        player_maxes.index = player_maxes.index.str.lower() + "_max"

        # Totals First, Then The Per-Stat Player Maxes
        team_summary = pd.concat([team_totals, player_maxes])

        if base_cols is None:
            # Keeps Each Stat Name Once (Some Appear In Both Tables) And Drops BPM, Which Is Not In Every Box Score
            base_cols = [
                col
                for col in team_summary.index.drop_duplicates(keep="first")
                if "bpm" not in col
            ]

        summaries.append(team_summary[base_cols])

    # Turns The Team Summaries Into Rows And Attaches The Line Score Columns Beside Them
    game = pd.concat([pd.concat(summaries, axis=1).T, line_score], axis=1)

    # First Team Is Always The Away Team So The Home Team Is Indicated With A 1
    game["home"] = [0, 1]

    # Reverses The Row Order So Each Team Lines Up With Its Opponent
    # reset_index Also Adds An "index" Column, Which Ends Up In The Output As "index_opp"
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns = game_opp.columns + "_opp"

    # Own Stats On The Left, Opponent Stats On The Right
    full_game = pd.concat([game, game_opp], axis=1)

    full_game["season"] = read_season_info(soup)

    # The First 8 Characters Of The File Name Are The Game Date (YYYYMMDD)
    file_date = os.path.basename(box_score)[:8]
    full_game["date"] = file_date
    full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")

    # True When The Team Outscored Its Opponent
    full_game["won"] = full_game["total"].gt(full_game["total_opp"])

    games.append(full_game)

    # Prints Progress Every 100 Games
    games_done = len(games)
    if games_done % 100 == 0:
        print(f"{games_done} / {len(box_scores)}")

100 / 12630
200 / 12630
300 / 12630
400 / 12630
500 / 12630
600 / 12630
700 / 12630
800 / 12630
900 / 12630
1000 / 12630
1100 / 12630
1200 / 12630
1300 / 12630
1400 / 12630
1500 / 12630
1600 / 12630
1700 / 12630
1800 / 12630
1900 / 12630
2000 / 12630
2100 / 12630
2200 / 12630
2300 / 12630
2400 / 12630
2500 / 12630
2600 / 12630
2700 / 12630
2800 / 12630
2900 / 12630
3000 / 12630
3100 / 12630
3200 / 12630
3300 / 12630
3400 / 12630
3500 / 12630
3600 / 12630
3700 / 12630
3800 / 12630
3900 / 12630
4000 / 12630
4100 / 12630
4200 / 12630
4300 / 12630
4400 / 12630
4500 / 12630
4600 / 12630
4700 / 12630
4800 / 12630
4900 / 12630
5000 / 12630
5100 / 12630
5200 / 12630
5300 / 12630
5400 / 12630
5500 / 12630
5600 / 12630
5700 / 12630
5800 / 12630
5900 / 12630
6000 / 12630
6100 / 12630
6200 / 12630
6300 / 12630
6400 / 12630
6500 / 12630
6600 / 12630
6700 / 12630
6800 / 12630
6900 / 12630
7000 / 12630
7100 / 12630
7200 / 12630
7300 / 12630
7400 / 12630
7500 / 12630
7600 / 12630
7700 / 12630
7800 / 1

In [12]:
games_df = pd.concat(games, ignore_index=True) # Concatenates All Games Into A Single Dataframe With A Fresh 0..N-1 Row Index

In [17]:
# Column Counts Of Any Games That Do Not Have Exactly 154 Columns (An Empty List Means Every Game Is Consistent)
[width for width in (g.shape[1] for g in games) if width != 154]

[]

In [18]:
games_df.to_csv("nba_games.csv") # Writes The Dataframe To A CSV File In The Working Directory (The Row Index Is Written As The First, Unnamed Column)