In [1]:
import os
from io import StringIO

import html5lib
import pandas as pd
from bs4 import BeautifulSoup

After scraping the website basketball-reference.com, we now need to put the information contained in each HTML file into one big data frame that will be used later by our machine learning algorithm.

# Retrieving the data from our directory

### 1) Defining some functions

First we create a list that contains the paths to all the box score files in our directory.

In [2]:
# Directory (relative to the notebook's working directory) that holds the box-score HTML files
SCORE_DIR = "data2/Scores"
# Raw directory listing: bare file names, in whatever order os.listdir returns them
box_scores = os.listdir(SCORE_DIR)

In [3]:
# Build the full path of every ".html" file directly from the directory listing.
# Reading the listing here (rather than rebinding box_scores from itself) keeps the
# cell idempotent: re-running it no longer joins SCORE_DIR onto already-joined paths.
# os.listdir returns entries in arbitrary order, so sort for a reproducible run order.
box_scores = sorted(
    os.path.join(SCORE_DIR, f)
    for f in os.listdir(SCORE_DIR)
    if f.endswith(".html")
)

We then need to define multiple functions:
 - One function __parse_html__ that will return a soup object to us. 
 - That soup object will then be used by multiple functions:
     - __read_line_score__ : This function extracts the name of the teams and the number of points scored by each one
     - __read_stats__ : This function extracts the table that contains the players' individual stats as well as the teams' stats
     - __read_season_info__: This function returns the year when the game was played
     

In [18]:
def parse_html(box_score):
    """
    Parse an HTML file containing sports game data.

    Parameters:
    - box_score (str): The path to the HTML file to be parsed.

    Returns:
    - soup (BeautifulSoup object or None): The parsed HTML with every
      "tr.over_header" and "tr.thead" row removed, or None when the file
      could not be read or parsed (the error is printed in that case).
    """

    # Bind the result up front so that the failure path returns None instead of
    # raising UnboundLocalError at the final return.
    soup = None

    try:
        # Undecodable bytes are dropped (errors='ignore') rather than aborting the read
        with open(box_score, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()

        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
        # installed; consider pinning one explicitly for reproducible parse trees.
        parsed = BeautifulSoup(html)

        # Drop the extra header rows in place.
        # Presumably these would otherwise confuse pandas.read_html's header
        # detection in the table readers - TODO confirm.
        for selector in ("tr.over_header", "tr.thead"):
            for row in parsed.select(selector):
                row.decompose()

        # Only publish the tree once it has been fully cleaned
        soup = parsed

    except Exception as e:
        # Report the failure; the caller receives None for this file
        print(f"Error processing {box_score}: {str(e)}")

    return soup


In [19]:
def read_line_score(soup):
    """
    Extract and format line score data (team names and total score) from parsed HTML.

    Parameters:
    - soup (BeautifulSoup object): The BeautifulSoup object representing the parsed HTML.

    Returns:
    - line_score (DataFrame): A Pandas DataFrame with exactly two columns,
      "Team" (the table's first column) and "Total" (the table's last column).

    Raises:
    - ValueError: propagated from pandas.read_html when no table with
      id "line_score" is present in the document.
    """

    # Serialise the tree and hand it to pandas as a file-like object: passing a
    # literal HTML string to read_html is deprecated (pandas >= 2.1).
    line_score = pd.read_html(StringIO(str(soup)), attrs={"id": "line_score"})[0]

    # Rename only the first and last columns; the columns in between are dropped below
    cols = list(line_score.columns)
    cols[0] = "Team"
    cols[-1] = "Total"
    line_score.columns = cols

    # Keep only the team names and total scores
    return line_score[["Team", "Total"]]


In [20]:
def read_stats(soup, team, stat):
    """
    Extract and convert statistical data (basic or advanced) for a specific team from parsed HTML.

    Parameters:
    - soup (BeautifulSoup object): The BeautifulSoup object representing the parsed HTML.
    - team (str): The team identifier used in the table id (as found in the line score's "Team" column).
    - stat (str): The type of statistics to extract (e.g., "basic" or "advanced").

    Returns:
    - df (DataFrame): The table with id "box-{team}-game-{stat}", indexed by its
      first column, with every value converted to a number (non-numeric cells become NaN).

    Raises:
    - ValueError: propagated from pandas.read_html when the requested table is
      not present in the document.
    """

    table_id = f"box-{team}-game-{stat}"

    # Serialise the tree and hand it to pandas as a file-like object: passing a
    # literal HTML string to read_html is deprecated (pandas >= 2.1).
    df = pd.read_html(StringIO(str(soup)), attrs={"id": table_id}, index_col=0)[0]

    # Column-wise numeric conversion; anything that cannot be parsed is coerced to NaN
    df = df.apply(pd.to_numeric, errors="coerce")

    return df


In [21]:
def read_season_info(soup):
    """
    Read the season identifier from the page's bottom navigation links.

    Parameters:
    - soup (BeautifulSoup object): The BeautifulSoup object representing the parsed HTML.

    Returns:
    - season (str): The text before the first underscore in the file name of the
      second link inside "#bottom_nav_container" (presumably the season year -
      verify against the scraped pages).
    """

    # First element matching the bottom navigation container
    nav_container = soup.select("#bottom_nav_container")[0]

    # Collect the target of every link in the container, in document order
    links = []
    for anchor in nav_container.find_all("a"):
        links.append(anchor["href"])

    # The second link's file name carries the season as its leading "_"-separated token
    file_name = os.path.basename(links[1])
    season = file_name.split("_", 1)[0]

    return season


### 2) Applying the functions and creating the final dataframe

In [None]:
# Initialize variables
base_cols = None  # Summary columns, fixed from the first team that is processed successfully
games = []  # One DataFrame per processed game (one row per team)

# Iterate through each box score in the collection
for box_score in box_scores:

    # Skip empty files: there is nothing to parse in them
    if os.path.getsize(box_score) == 0:
        continue

    # Parse the HTML content of the box score
    soup = parse_html(box_score)

    # Defensive guard: skip the file when no parsed document is available
    if soup is None:
        continue

    # Read the line score data from the parsed HTML
    line_score = read_line_score(soup)

    # The "Team" values are also used to build the ids of the per-team stats tables
    teams = list(line_score["Team"])

    # One summary Series per team in this game
    summaries = []

    for team in teams:
        try:
            # Read basic and advanced statistics for the team
            basic = read_stats(soup, team, "basic")
            advanced = read_stats(soup, team, "advanced")

            # Last row of each table (presumably the team-totals row), with lowercase labels
            totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
            totals.index = totals.index.str.lower()

            # Column-wise maximum over every row except the last, labelled "<stat>_max"
            maxes = pd.concat([basic.iloc[:-1, :].max(), advanced.iloc[:-1, :].max()])
            maxes.index = maxes.index.str.lower() + "_max"

            # Combine totals and maximums to create a summary
            summary = pd.concat([totals, maxes])

            # Fix the column set once, from the first summary, excluding labels containing "bpm"
            if base_cols is None:
                base_cols = list(summary.index.drop_duplicates(keep='first'))
                base_cols = [b for b in base_cols if "bpm" not in b]

            # Select only the columns present in base_cols for the summary
            summary = summary[base_cols]

            summaries.append(summary)

        except ValueError:
            # A stats table is missing (e.g., data not available). Stop processing
            # this game: a partial set of summaries cannot be lined up with the
            # line score and would yield a row of NaNs (or an empty concat).
            print(f"Missing table at {len(games)}")
            break

    # Only build a game record when every team produced a summary
    if len(summaries) != len(teams):
        continue

    # Combine all team summaries horizontally and transpose: one row per team
    summary = pd.concat(summaries, axis=1).T

    # Combine the summary data with the line score data horizontally
    game = pd.concat([summary, line_score], axis=1)

    # Flag the rows as 0 and 1 in line-score order
    # (presumably away team first, home team second - verify against the source pages)
    game["home"] = [0, 1]

    # Opponent view: same rows in reverse order, every column suffixed with "_opp".
    # reset_index() keeps the old index as a column, which therefore becomes "index_opp".
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += '_opp'

    # Combine the original game data and the "opposite" game data horizontally
    full_game = pd.concat([game, game_opp], axis=1)

    # Read and assign season information from the parsed HTML
    full_game["season"] = read_season_info(soup)

    # The first 8 characters of the file name are parsed as a YYYYMMDD date
    full_game["date"] = os.path.basename(box_score)[:8]
    full_game["date"] = pd.to_datetime(full_game['date'], format="%Y%m%d")

    # A team "won" when its total is strictly greater than its opponent's
    full_game["won"] = full_game["Total"] > full_game["Total_opp"]

    # Append the processed game data to the list of games
    games.append(full_game)

    # Print progress information for every 100 processed games
    if len(games) % 100 == 0:
        print(f"{len(games)}/{len(box_scores)}")


In [28]:
# Stack the per-game frames into a single DataFrame; ignore_index=True renumbers the rows from 0
games_df = pd.concat(games, ignore_index = True)

In [31]:
# Write the combined table to "nba_games.csv" in the working directory.
# The row index is written as well, since index=False is not passed.
games_df.to_csv("nba_games.csv")