In [2]:
import pandas as pd
import requests
from selectolax.parser import HTMLParser
from time import sleep
from tqdm import tqdm

In [44]:
def clean_name(player_text):
    """
    Return the text of a table cell with any trailing " (...)" suffix removed.

    Parameters:
    player_text (node or str): Either a parsed HTML node exposing a ``text()`` method
        (what ``table.css(...)`` yields in this notebook) or an already-extracted string.
        A player cell may carry a role marker such as (D), (F), or (G) after the name.

    Returns:
    str: The cell text with leading/trailing whitespace stripped and everything from the
        first " (" onward discarded. Text without a " (" is returned stripped but otherwise
        unchanged, which is why the scrapers also use this for numeric stat cells.

    Example:
    clean_name("John Smith (F)")    # -> "John Smith"
    clean_name(player_cell_node)    # same result, using player_cell_node.text()
    """
    # The scrapers pass nodes; plain strings are accepted as well so the documented
    # string usage works instead of raising AttributeError on ``.text()``.
    if isinstance(player_text, str):
        raw_text = player_text
    else:
        raw_text = player_text.text()

    # Strip first so a leading space can never be mistaken for part of a " (" marker,
    # then keep only what precedes the first " (".
    return raw_text.strip().split(' (')[0]

def scrape_ep_draft(draft_year):
    """
    Scrape the Elite Prospects NHL Entry Draft page for a given draft year.

    Parameters:
    draft_year (int or str): The year of the NHL Entry Draft to scrape data for.

    Returns:
    pandas.DataFrame or None: One row per drafted player with the columns
        'draft_year', 'pick_number', 'pick_team', 'pick_team_link', 'player_name'
        and 'player_link'. A link is None when the corresponding cell contains no
        anchor. None is returned (after printing a message) when the request does
        not come back with status 200 or the page contains no draft table.

    Example:
    draft_2016 = scrape_ep_draft(2016)
    """
    # Construct the URL for Elite Prospects NHL Entry Draft page for the given 'draft_year'
    draft_url = f"https://www.eliteprospects.com/draft/nhl-entry-draft/{draft_year}"

    # Send an HTTP GET request to the URL
    resp = requests.get(draft_url)

    # Bail out early on anything other than a successful response
    if resp.status_code != 200:
        print(f"Got status code: {resp.status_code}")
        return None

    # Parse the HTML content of the response and locate the draft table
    html = HTMLParser(resp.text)
    draft_table = html.css_first('.players.table')
    if draft_table is None:
        # css_first returns None when nothing matches; report it instead of
        # failing later with an AttributeError.
        print(f"No draft table found at {draft_url}")
        return None

    def _cell_href(cell):
        # Same guard scrape_skaters_ep_league applies: a cell without an anchor
        # yields None rather than raising.
        anchor = cell.css_first('a')
        return anchor.attributes.get('href') if anchor is not None else None

    # Query each column once and derive both the text and the link from it
    overall_cells = draft_table.css('td.overall')
    team_cells = draft_table.css('td.team')
    player_cells = draft_table.css('td.player')

    # 'draft_year' is a scalar and is broadcast by pandas to every row
    return pd.DataFrame({'draft_year': draft_year,
                         'pick_number': [clean_name(cell) for cell in overall_cells],
                         'pick_team': [clean_name(cell) for cell in team_cells],
                         'pick_team_link': [_cell_href(cell) for cell in team_cells],
                         'player_name': [clean_name(cell) for cell in player_cells],
                         'player_link': [_cell_href(cell) for cell in player_cells]})

def _draft_class_last_page(html):
    """Return the last page number linked from the search pagination, or 1 if none is found."""
    pagination = html.css_first('div.table-pagination')
    if pagination is None:
        # No pagination block: treat the result as a single page.
        return 1

    # The link to the last page is read from the second <span> of the pagination block.
    spans = pagination.css('span')
    last_anchor = spans[1].css_first('a') if len(spans) > 1 else None
    if last_anchor is None:
        return 1

    href = last_anchor.attributes.get('href') or ''
    # Take what follows "page=" and stop at the next query parameter, if any.
    page_value = href.partition('page=')[2].split('&', 1)[0]
    return int(page_value) if page_value.isdigit() else 1


def _draft_class_page_rows(html):
    """Return (names, links) for every 'td.name' cell of the players table on one page."""
    table = html.css_first(".table.players")
    if table is None:
        return [], []

    name_cells = table.css('td.name')
    names = [clean_name(cell) for cell in name_cells]
    links = []
    for cell in name_cells:
        anchor = cell.css_first('a')
        # Keep names and links aligned: a cell without an anchor contributes None.
        links.append(anchor.attributes.get('href') if anchor is not None else None)
    return names, links


def get_players_by_draft_year(draft_year):
    """
    Scrape player data from Elite Prospects by draft year.

    Args:
        draft_year (int): The year of the NHL draft to retrieve player data for.

    Returns:
        pd.DataFrame or None: A DataFrame with the columns 'player_name' and
        'player_link', collected from every result page. 'player_link' is None for
        a name cell that has no anchor. None is returned (after printing a message)
        when the first request does not come back with status 200.

    Notes:
        A 403 on a follow-up page is retried every 100 seconds until it stops
        returning 403; any other non-200 status is printed and that page is skipped.
    """
    draft_year_url = f"https://www.eliteprospects.com/search/player?draft={draft_year}"

    # Make an HTTP request to the Elite Prospects website
    resp = requests.get(draft_year_url)

    if resp.status_code != 200:
        print(f"Request Failed. Status Code: {resp.status_code}")
        return None

    # The first page provides both the page count and the first batch of players
    html = HTMLParser(resp.text)
    last_page = _draft_class_last_page(html)
    player_names_full_draft_class, player_links_full_draft_class = _draft_class_page_rows(html)

    # Loop through the remaining pages using tqdm for a progress bar
    for page_number in tqdm(range(2, last_page + 1), desc=f"Scraping {draft_year} draft eligibles"):
        # Let requests append the page number to the existing "?draft=..." query string
        page_params = {'page': page_number}
        resp = requests.get(draft_year_url, params=page_params)

        # Handle 403 errors by waiting and retrying
        while resp.status_code == 403:
            print(f"Waiting 100 seconds to resend request for page {page_number}")
            sleep(100)
            resp = requests.get(draft_year_url, params=page_params)

        if resp.status_code == 200:
            page_names, page_links = _draft_class_page_rows(HTMLParser(resp.text))
            player_names_full_draft_class.extend(page_names)
            player_links_full_draft_class.extend(page_links)
        else:
            print(f"Request Failed. Status Code: {resp.status_code}. Page: {page_number}")

    # Create a DataFrame from the collected data
    return pd.DataFrame({'player_name': player_names_full_draft_class,
                         'player_link': player_links_full_draft_class})

def _league_stats_last_page(html):
    """Return the last page number linked from the stats pagination, or 1 if none is found."""
    pagination = html.css_first('div.table-pagination')
    if pagination is None:
        # No pagination block: treat the league as a single page.
        return 1

    # The link to the last page is read from the second <span> of the pagination block.
    spans = pagination.css('span')
    last_anchor = spans[1].css_first('a') if len(spans) > 1 else None
    if last_anchor is None:
        return 1

    href = last_anchor.attributes.get('href') or ''
    # Take what follows "page=" and stop at the next query parameter, if any.
    page_value = href.partition('page=')[2].split('&', 1)[0]
    return int(page_value) if page_value.isdigit() else 1


def _league_stats_page_columns(html):
    """Return {column name: list of cell values} for the player-stats table of one page."""
    # Output column -> CSS selector of the cells that feed it
    stat_selectors = {
        'games_played': 'td.gp',
        'goals': 'td.g',
        'assists': 'td.a',
        'points': 'td.tp',
        'pim': 'td.pim',
        'plus_minus': 'td.pm',
    }
    columns = {name: [] for name in ('player_name', 'player_link', *stat_selectors)}

    table = html.css_first(".table.player-stats")
    if table is None:
        return columns

    player_cells = table.css('td.player')
    columns['player_name'] = [clean_name(cell) for cell in player_cells]
    for cell in player_cells:
        anchor = cell.css_first('a')
        # Some player cells carry no anchor; record None so the row can be dropped later.
        columns['player_link'].append(anchor.attributes.get('href') if anchor is not None else None)

    for column_name, selector in stat_selectors.items():
        columns[column_name] = [clean_name(cell) for cell in table.css(selector)]
    return columns


def scrape_skaters_ep_league(season, league):
    """
    Scrape skater statistics from Elite Prospects for a specific season and league.

    Args:
        season (str): The season for which you want to retrieve player data, e.g. "2023-2024".
        league (str): The league for which you want to retrieve player data. Most popular leagues: 'NHL', 'AHL', 'ECHL', 'NCAA', 'WHL', 'OHL', 'QMJHL', 'USHL', 'KHL', 'SHL', 'LIIGA', 'NL', and 'CZECHIA'

    Returns:
        pd.DataFrame or None: A DataFrame with the columns 'player_name', 'player_link',
        'games_played', 'goals', 'assists', 'points', 'pim' and 'plus_minus', holding the
        cell text as scraped (strings). Rows whose player cell has no link are removed.
        None is returned (after printing a message) when the first request does not come
        back with status 200.

    Notes:
        A 403 on a follow-up page is retried every 100 seconds until it stops
        returning 403; any other non-200 status is printed and that page is skipped.
    """
    league_url = f"https://www.eliteprospects.com/league/{league}/stats/{season}"

    # Make an HTTP request to the Elite Prospects website
    resp = requests.get(league_url)

    if resp.status_code != 200:
        print(f"Request Failed. Status Code: {resp.status_code}")
        return None

    # The first page provides both the page count and the first batch of rows
    html = HTMLParser(resp.text)
    last_page = _league_stats_last_page(html)
    collected = _league_stats_page_columns(html)

    # Loop through the remaining pages using tqdm for a progress bar
    for page_number in tqdm(range(2, last_page + 1), desc=f"Scraping {league}, {season} players"):
        # league_url has no query string, so appending "&page=N" would put the page
        # number into the path; pass it as a real query parameter ("?page=N") instead.
        page_params = {'page': page_number}
        resp = requests.get(league_url, params=page_params)

        # Handle 403 errors by waiting and retrying
        while resp.status_code == 403:
            print(f"Waiting 100 seconds to resend request for page {page_number}")
            sleep(100)
            resp = requests.get(league_url, params=page_params)

        if resp.status_code == 200:
            page_columns = _league_stats_page_columns(HTMLParser(resp.text))
            for column_name, values in page_columns.items():
                collected[column_name].extend(values)
        else:
            print(f"Request Failed. Status Code: {resp.status_code}. Page: {page_number}")

    # Create a DataFrame for all full league data
    league_df = pd.DataFrame(collected)

    # Drop rows without a player link. `!= None` compares elementwise and is True for
    # every row in pandas, so it filters nothing; notna() is the working test.
    return league_df[league_df['player_link'].notna()]


In [46]:
# League-season to pull. The bare call stays on the last line so Jupyter renders
# the returned DataFrame as the cell output.
season, league = "2023-2024", "NCAA"

scrape_skaters_ep_league(season=season, league=league)

Scraping NCAA, 2023-2024 players: 100%|██████████| 16/16 [00:10<00:00,  1.54it/s]


Unnamed: 0,player_name,player_link,games_played,goals,assists,points,pim,plus_minus
0,Rutger McGroarty,https://www.eliteprospects.com/player/526095/r...,4,2,7,9,0,2
1,Joey Larson,https://www.eliteprospects.com/player/359529/j...,4,3,4,7,0,6
2,Red Savage,https://www.eliteprospects.com/player/512173/r...,4,3,4,7,2,5
3,Luke Grainger,https://www.eliteprospects.com/player/201540/l...,2,2,5,7,6,3
4,Massimo Rizzo,https://www.eliteprospects.com/player/286939/m...,2,0,7,7,0,4
...,...,...,...,...,...,...,...,...
1848,Marko Reifenberger,https://www.eliteprospects.com/player/312322/m...,2,2,1,3,2,3
1849,Riese Gaber,https://www.eliteprospects.com/player/283845/r...,2,2,1,3,2,3
1850,Andre Gasseau,https://www.eliteprospects.com/player/512184/a...,2,2,1,3,2,0
1851,Carter Wilkie,https://www.eliteprospects.com/player/411159/c...,2,2,1,3,2,3


In [None]:
# Endpoints for scrapers that still have to be written.

# Draft records (this query selects draftYear=2016)
draft_records_url = "https://records.nhl.com/site/api/draft?cayenneExp=draftYear=2016"

# Central scouting: one prospect's detail page (by dpid) and a browse page for a year
central_scouting_url = "https://www.nhl.com/ice/draftprospectdetail.htm?dpid=111442"
central_scouting_url_year = "https://www.nhl.com/ice/draftprospectbrowse.htm?year=2021"

