In [2]:
import pandas as pd
import requests
from selectolax.parser import HTMLParser
from time import sleep
from tqdm import tqdm

In [218]:
def clean_name(player_text):
    """
    Clean and format a player's name extracted from the draft table.

    Parameters:
    player_text (str): The text representing a player's name, which may include roles like (D), (F), or (G).

    Returns:
    str: The cleaned and formatted player name without role indicators.

    This function takes a player's name as input, which may include roles such as (D) for defense, (F) for forward, or (G) for goalie.
    It removes these role indicators, trims any leading or trailing white spaces, and returns the cleaned and formatted player name.

    Example:
    original_name = "John Smith (F)"
    cleaned_name = clean_name(original_name)
    # cleaned_name will be "John Smith"
    """

    # Initialize the cleaned_name with the original text
    cleaned_name = player_text.text().strip().split(' (')[0]
    
    # Iterate through roles_to_remove and remove them from the name
    
    
    return cleaned_name

def scrape_ep_draft(draft_year):
    """
    Scrape data from Elite Prospects NHL Entry Draft page for a given draft year.

    Parameters:
    draft_year (int or str): The year of the NHL Entry Draft to scrape data for.

    Returns:
    pandas.DataFrame: A DataFrame containing draft data including pick number, team, player name, and links.

    This function sends an HTTP GET request to the Elite Prospects website for the specified draft year.
    It then extracts data from the draft table, including pick numbers, team names, player names, and links.
    The data is organized into a DataFrame and returned for further analysis.

    Example:
    def clean_name(name):
        # Implement your cleaning logic here
        return cleaned_name
    """
    # Construct the URL for Elite Prospects NHL Entry Draft page for the given 'draft_year'
    draft_url = f"https://www.eliteprospects.com/draft/nhl-entry-draft/{draft_year}"
    
    # Send an HTTP GET request to the URL
    resp = requests.get(draft_url)
    
    # Check if the request was successful (status code 200)
    if resp.status_code == 200:
        # Parse the HTML content of the response
        html = HTMLParser(resp.text)
        
        # Extract the draft table element
        draft_table = html.css_first('.players.table')
        
        # Extract data from different columns of the draft table
        pick_number = [clean_name(overall_text) for overall_text in draft_table.css('td.overall')]
        pick_team = [clean_name(team_text) for team_text in draft_table.css('td.team')]
        pick_team_link = [team_node.css_first('a').attributes['href'] for team_node in draft_table.css('td.team')]
        player_name = [clean_name(player_text) for player_text in draft_table.css('td.player')]
        player_link = [player_node.css_first('a').attributes['href'] for player_node in draft_table.css('td.player')]
        
        # Create a DataFrame to store the extracted data
        data = pd.DataFrame({'draft_year': draft_year,
                             'pick_number': pick_number,
                             'pick_team': pick_team,
                             'pick_team_link': pick_team_link,
                             'player_name': player_name,
                             'player_link': player_link})
        
        # Return the DataFrame
        return data
    else:
        # If the request was not successful, print the status code
        print(f"Got status code: {resp.status_code}")

def get_players_by_draft_year(draft_year):
    """
    Scrape player data from Elite Prospects by draft year.

    Args:
        draft_year (int): The year of the NHL draft to retrieve player data for.

    Returns:
        pd.DataFrame: A DataFrame containing player names and their corresponding links.
    """
    draft_year_url = f"https://www.eliteprospects.com/search/player?draft={draft_year}"

    player_names_full_draft_class = []
    player_links_full_draft_class = []

    # Make an HTTP request to the Elite Prospects website
    resp = requests.get(draft_year_url)

    # Check if the request was successful (status code 200)
    if resp.status_code == 200:
        # Parse the HTML response
        html = HTMLParser(resp.text)
        
        # Extract information about the number of pages in the draft class
        last_page_link = html.css_first('div.table-pagination').css('span')[1].css_first('a').attributes['href']
        last_page = int(last_page_link[last_page_link.find('page=')+5:])
        
        # Extract player names and links from the first page
        table = html.css_first(".table.players")
        player_names = [clean_name(name) for name in table.css('td.name')]
        player_links = [name.css_first('a').attributes['href'] for name in table.css('td.name')]
        player_names_full_draft_class.extend(player_names)
        player_links_full_draft_class.extend(player_links)

        # Loop through the remaining pages using tqdm for a progress bar
        for page_number in tqdm(range(2, last_page + 1), desc=f"Scraping {draft_year} draft eligibles"):
            page_ending = f"&page={page_number}"
            resp = requests.get(draft_year_url + page_ending)

            # Handle 403 errors by waiting and retrying
            while resp.status_code == 403:
                print(f"Waiting 100 seconds to resend request for page {page_number}")
                sleep(100)
                resp = requests.get(draft_year_url + page_ending)

            if resp.status_code == 200:
                # Parse and extract data from the current page
                html = HTMLParser(resp.text)
                table = html.css_first(".table.players")
                player_names = [clean_name(name) for name in table.css('td.name')]
                player_links = [name.css_first('a').attributes['href'] for name in table.css('td.name')]
                player_names_full_draft_class.extend(player_names)
                player_links_full_draft_class.extend(player_links)
            else:
                print(f"Request Failed. Status Code: {resp.status_code}. Page: {page_number}")

        # Create a DataFrame from the collected data
        return pd.DataFrame({'player_name': player_names_full_draft_class,
                             'player_link': player_links_full_draft_class})
    else:
        print(f"Request Failed. Status Code: {resp.status_code}")

def scrape_skaters_ep_league(season, league):
    """
    Scrape player data from Elite Prospects for a specific season and league.

    Args:
        season (str): The season for which you want to retrieve player data. Example: 2023-2024
        league (str): The league for which you want to retrieve player data. Most popular leagues: 'NHL', 'AHL', 'ECHL', 'NCAA', 'WHL', 'OHL', 'QMJHL', 'USHL', 'KHL', 'SHL', 'LIIGA', 'NL', and 'CZECHIA'

    Returns:
        pd.DataFrame: A DataFrame containing player names and their corresponding links.
    """
    league_url = f"https://www.eliteprospects.com/league/{league}/stats/{season}"

    player_names_full_league = []
    player_links_full_league = []
    player_games_played_full_league = []
    player_goals_full_league = []
    player_assists_full_league = []
    player_points_full_league = []
    player_pim_full_league = []
    player_plus_minus_full_league = []
    # Make an HTTP request to the Elite Prospects website
    resp = requests.get(league_url)

    # Check if the request was successful (status code 200)
    if resp.status_code == 200:
        # Parse the HTML response
        html = HTMLParser(resp.text)
        
        # Extract information about the number of pages in the league
        last_page_link = html.css_first('div.table-pagination').css('span')[1].css_first('a').attributes['href']
        last_page = int(last_page_link[last_page_link.find('page=')+5:])
        
        # Extract player names and links from the first page
        table = html.css_first(".table.player-stats")
        player_names = [clean_name(name) for name in table.css('td.player')]
        player_links = [name.css_first('a').attributes['href'] if name.css_first('a') is not None else None for name in table.css('td.player')]
        player_games_played = [clean_name(games_played) for games_played in table.css('td.gp')]
        player_goals = [clean_name(goals) for goals in table.css('td.g')]
        player_assists = [clean_name(assists) for assists in table.css('td.a')]
        player_points = [clean_name(points) for points in table.css('td.tp')]
        player_pim = [clean_name(pim) for pim in table.css('td.pim')]
        plus_minus = [clean_name(plus_minus) for plus_minus in table.css('td.pm')]

        player_names_full_league.extend(player_names)
        player_links_full_league.extend(player_links)
        player_games_played_full_league.extend(player_games_played)
        player_goals_full_league.extend(player_goals)
        player_assists_full_league.extend(player_assists)
        player_points_full_league.extend(player_points)
        player_pim_full_league.extend(player_pim)
        player_plus_minus_full_league.extend(plus_minus)

        # Loop through the remaining pages using tqdm for a progress bar
        for page_number in tqdm(range(2, last_page + 1), desc=f"Scraping {league}, {season} players"):
            page_ending = f"&page={page_number}"
            resp = requests.get(league_url + page_ending)

            # Handle 403 errors by waiting and retrying
            while resp.status_code == 403:
                print(f"Waiting 100 seconds to resend request for page {page_number}")
                sleep(100)
                resp = requests.get(league_url + page_ending)

            if resp.status_code == 200:
                # Parse and extract data from the current page
                html = HTMLParser(resp.text)
                table = html.css_first(".table.player-stats")
                player_names = [clean_name(name) for name in table.css('td.player')]
                player_links = [name.css_first('a').attributes['href'] if name.css_first('a') is not None else None for name in table.css('td.player')]
                player_games_played = [clean_name(games_played) for games_played in table.css('td.gp')]
                player_goals = [clean_name(goals) for goals in table.css('td.g')]
                player_assists = [clean_name(assists) for assists in table.css('td.a')]
                player_points = [clean_name(points) for points in table.css('td.tp')]
                player_pim = [clean_name(pim) for pim in table.css('td.pim')]
                plus_minus = [clean_name(plus_minus) for plus_minus in table.css('td.pm')]

                player_names_full_league.extend(player_names)
                player_links_full_league.extend(player_links)
                player_games_played_full_league.extend(player_games_played)
                player_goals_full_league.extend(player_goals)
                player_assists_full_league.extend(player_assists)
                player_points_full_league.extend(player_points)
                player_pim_full_league.extend(player_pim)
                player_plus_minus_full_league.extend(plus_minus)

            else:
                print(f"Request Failed. Status Code: {resp.status_code}. Page: {page_number}")
        
        # Create a DataFrame for all full league data
        league_df = pd.DataFrame({'player_name': player_names_full_league,
                                  'player_link': player_links_full_league,
                                  'games_played':player_games_played_full_league,
                                  'goals': player_goals_full_league,
                                  'assists': player_assists_full_league,
                                  'points': player_points_full_league,
                                  'pim': player_pim_full_league,
                                  'plus_minus': player_plus_minus_full_league})
        
        # Return a DataFrame without rows where 'player_link' is None
        return league_df[league_df['player_link'] != None]
    
    else:
        print(f"Request Failed. Status Code: {resp.status_code}")

def scrape_nhl_draft(draft_year):
    """
    Scrape NHL Draft records for a specific year.

    Parameters:
    - draft_year (int): The year of the NHL Draft records to retrieve.

    Returns:
    - df (pandas.DataFrame): A DataFrame containing the NHL Draft records for the specified year.
    """
    # Create the URL for the NHL Draft records based on the input draft year.
    draft_records_url = f"https://records.nhl.com/site/api/draft?cayenneExp=draftYear={draft_year}"
    
    # Send an HTTP GET request to the URL.
    resp = requests.get(draft_records_url)

    # Check if the request was successful (status code 200).
    if resp.status_code == 200:
        # Parse the JSON response.
        json_data = resp.json()
        
        # Convert the JSON data into a pandas DataFrame for easier manipulation.
        df = pd.DataFrame(json_data['data'])
        
        return df
    else:
        # If the request was not successful, print an error message with the status code.
        print(f"Could not fetch data. Error code: {resp.status_code}")

def scrape_game_play_by_play(game_id):
    """
    Scrape play-by-play data for a specific NHL game.

    Parameters:
    - game_id (int): The ID of the NHL game to retrieve play-by-play data for.

    Returns:
    - df (pandas.DataFrame): A DataFrame containing the play-by-play data for the specified game.
    """
    # Define the base URL for the NHL API.
    base_url = "https://api-web.nhle.com/v1/gamecenter"

    # Construct the play-by-play URL.
    play_by_play_url = f"{base_url}/{game_id}/play-by-play"

    try:
        # Send an HTTP GET request to the URL.
        resp = requests.get(play_by_play_url)
        resp.raise_for_status()  # Raise an exception if there's an HTTP error.

        # Parse the JSON response.
        play_by_play_data = resp.json()

        # Extract the 'plays' data and normalize it into a DataFrame.
        df = pd.json_normalize(play_by_play_data['plays'])
        # Clean 
        df = df.rename(columns=lambda x: x.replace('details.', ''))
        df = df.rename(columns=lambda x: x.replace('periodDescriptor.', 'period_'))
        df['homeTeam'] = play_by_play_data['homeTeam']['abbrev']
        df['awayTeam'] = play_by_play_data['homeTeam']['abbrev']
        return df
    except requests.exceptions.RequestException as e:
        # Handle any HTTP request errors.
        print(f"Failed to fetch play-by-play data: {e}")
        return None

def scrape_game_shift_report(game_id):
    """
    Scrape the shift report for a specific NHL game.

    Parameters:
    - game_id (int): The ID of the NHL game to retrieve the shift report for.

    Returns:
    - df (pandas.DataFrame): A DataFrame containing the shift report data for the specified game.
    """
    # Define the base URL for the NHL API.
    base_url = "https://api.nhle.com/stats/rest/en/shiftcharts"

    # Construct the shift report URL.
    shift_url = f"{base_url}?cayenneExp=gameId={game_id}"

    try:
        # Send an HTTP GET request to the URL.
        resp = requests.get(shift_url)
        resp.raise_for_status()  # Raise an exception if there's an HTTP error.

        # Check if the response is successful (status code 200).
        if resp.status_code == 200:
            # Parse the JSON response.
            shift_data = resp.json()

            # Check if 'data' key is present in the response.
            if 'data' in shift_data:
                # Create a DataFrame from the 'data' key.
                df = pd.DataFrame(shift_data['data'])
                # Return shits that have a duration
                return df[~(df['duration'].isna())]
            else:
                print("No 'data' key found in the response.")
        else:
            print("Couldn't retrieve data. Status code:", resp.status_code)
    except requests.exceptions.RequestException as e:
        # Handle any HTTP request errors.
        print(f"Failed to fetch the shift report data: {e}")

    return None

def get_event_on_ice(df_pbp, df_shifts):
    """
    Add 'homeOnIce' and 'awayOnIce' columns to df_pbp with players on ice for each event.

    Parameters:
    - df_pbp (pandas.DataFrame): DataFrame containing play-by-play data.
    - df_shifts (pandas.DataFrame): DataFrame containing player shifts data.

    Returns:
    - df_pbp (pandas.DataFrame): Updated DataFrame with 'homeOnIce' and 'awayOnIce' columns.
    """
    def get_on_ice(row, team):
        """
        Helper function to get players on ice for a specific event and team.

        Parameters:
        - row (pandas.Series): The row representing an event.
        - team (str): The team abbreviation (e.g., 'homeTeam' or 'awayTeam').

        Returns:
        - on_ice (array): Array of player IDs on ice for the event and team.
        """
        on_ice = df_shifts[
            (df_shifts['period'] == row['period']) &
            (df_shifts['startTime'] < row['timeInPeriod']) &
            (df_shifts['endTime'] >= row['timeInPeriod']) &
            (df_shifts['teamAbbrev'] == team)
        ]['playerId'].unique()
        return on_ice

    # Add 'homeOnIce' and 'awayOnIce' columns to df_pbp using the helper function.
    df_pbp['homeOnIce'] = df_pbp.apply(lambda row: get_on_ice(row, row['homeTeam']), axis=1)
    df_pbp['awayOnIce'] = df_pbp.apply(lambda row: get_on_ice(row, row['awayTeam']), axis=1)

    return df_pbp

def find_game_ids(data):
    """
    Recursively find all instances of "gamePk" in a dictionary.

    Parameters:
    - data (dict): The dictionary to search for "gamePk".

    Returns:
    - game_pks (list): A list of all "gamePk" values found in the dictionary.
    """
    game_pks = []

    if isinstance(data, dict):
        for key, value in data.items():
            if key == "gamePk":
                game_pks.append(value)
            elif isinstance(value, (dict, list)):
                game_pks.extend(find_game_ids(value))
    elif isinstance(data, list):
        for item in data:
            game_pks.extend(find_game_ids(item))

    return game_pks

def scrape_season_game_ids(season, game_type="R"):
    """
    Scrape NHL game IDs for a specific season and game type.

    Parameters:
    - season (str): The season in the format 'YYYYYYYY' (e.g., '20232024').
    - game_type (str, optional): The game type ('R' for regular season by default). 'PR' for pre-season and 'P' for playoffs.
    All game types can be found: https://statsapi.web.nhl.com/api/v1/gameTypes.

    Returns:
    - game_ids (list): A list of game IDs for the specified season and game type.
    """
    # Construct the URL for the NHL schedule API.
    season_url = f"https://statsapi.web.nhl.com/api/v1/schedule?season={season}&gameType={game_type}"

    try:
        # Send an HTTP GET request to the URL.
        resp = requests.get(season_url)
        resp.raise_for_status()  # Raise an exception if there's an HTTP error.

        if resp.status_code == 200:
            # Parse the JSON response.
            data = resp.json()

            # Find game IDs using a recursive function (assuming you have the 'find_game_pks' function).
            game_ids = find_game_ids(data)

            return game_ids
        else:
            print(f"Failed to fetch data. Status code: {resp.status_code}")
    except requests.exceptions.RequestException as e:
        # Handle any HTTP request errors.
        print(f"Failed to fetch data: {e}")

    return []



In [258]:
def scrape_plays_season(season):
    game_ids = scrape_season_game_ids(season)
    game_ids = game_ids[:16]
    # Initialize an empty list to store DataFrames
    combined_dfs = []
    column_names = ['hitteePlayerId', 'typeCode', 'blockingPlayerId', 'period', 'winningPlayerId', 'scoringPlayerId', 'awayScore',
                    'awaySOG', 'servedByPlayerId', 'awayOnIce', 'period_number', 'assist2PlayerId', 'homeOnIce', 'committedByPlayerId',
                    'secondaryReason', 'timeInPeriod', 'homeTeamDefendingSide', 'timeRemaining', 'homeScore', 'awayTeam', 'xCoord',
                    'reason', 'period_periodType', 'sortOrder', 'assist1PlayerId', 'situationCode', 'homeTeam', 'typeDescKey',
                    'drawnByPlayerId', 'zoneCode', 'duration', 'descKey', 'shootingPlayerId', 'eventId', 'losingPlayerId',
                    'eventOwnerTeamId', 'yCoord', 'homeSOG', 'shotType', 'goalieInNetId', 'hittingPlayerId', 'playerId']
    for game in tqdm(game_ids):
        pbp_df = scrape_game_play_by_play(game)
        shifts_df = scrape_game_shift_report(game)
        df = get_event_on_ice(pbp_df, shifts_df)
                
        # Append the DataFrame to the list
        combined_dfs.append(df)

    # Concatenate the list of DataFrames into one large DataFrame
    df = pd.concat(combined_dfs, axis=1, columns=column_names)
    
    return df

# Example usage:
season = '20222023'
combined_dataframe = scrape_plays_season(season)

100%|██████████| 16/16 [00:12<00:00,  1.24it/s]


TypeError: concat() got an unexpected keyword argument 'columns'

In [15]:
# Start work on scraping edge data
import asyncio
from websockets.sync.client import connect
import json

def hello():
    with connect("wss://edge.nhl.com/en/skater/8484153") as websocket:
        payload = {"type":"action","event":{"domain":"edge.nhl.com","uri":"/en/skater/8484153","action":"load","data":{"renderFunction":"renderProfileContent","target":"#skatingdistance-section-content","params":{"sectionName":"skatingdistance","units":"imperial","manpower":"all","season":"20232024","stage":"regular","feed":"skatersProfiles","id":"8484153"},"callbackFunction":"runClientFns"}}}
        websocket.send(json.dumps(payload))
        message = websocket.recv()
        print(f"Received: {message}")

hello()

Received: {"type":"html","target":"#skatingdistance-section-content","html":"<div class=\"col-lg-6 col-md-6\">\n    <div class=\"table-responsive\">\n\n        <table class=\"table table-hover\">\n            <thead>\n                <tr>\n                    <td scope=\"col\"></td>\n                    <td scope=\"col\"></td>\n                    <td scope=\"col\" class=\"text-center\">League average<br>by position (F/D)</td>\n                    <td scope=\"col\" class=\"text-center\">Percentile</td>\n                </tr>\n            </thead>\n            <tbody>\n        \n                        <tr>\n                            <td scope=\"row\">Total (mi)</td>\n                                <th class=\"text-center\">6.36</th>\n                            <th class=\"text-center\">11.11</th>\n                            <th class=\"text-center\">Below 50th</td>\n                        </tr>\n                        <tr>\n                            <td scope=\"row\">Average P