In [1]:
from bs4 import BeautifulSoup
import requests

In [61]:
def get_page(url):
    return requests.get(url)

test_url = "https://www.hockey-reference.com/players/a/acciano01.html"

In [62]:
def make_soup(page):
    return BeautifulSoup(page.text, 'html')

page = make_soup(get_page(test_url))

In [83]:
import re
def get_years_active(soup):
    #print(soup)
    years = soup.find('table', id = "stats_basic_plus_nhl")
    #print(years)
    rows = years.find_all(attrs={"data-stat":"season"})
    seasons = []
    for season in rows:
        text = season.text
        # we're just going to use regex to extract the season
        pattern = r'[0-9]+-[0-9]+$'
        #print(text)
        result = re.findall(pattern, text)
        #print(result)
        if len(result) > 0:
            # IF WE SUCCESSFULLY FOUND A YEAR THEN RECORD THE LATER PART OF THE YEAR
            # NOTE THIS DOESNT WORK FOR THE SEASONS THAT CROSS ANY CENTENNIAL YEAR!!!
            to_append = result[0][:2] + result[0][-2:]
            #print(to_append)
            if len(seasons)>0:
                if to_append != seasons[-1]:
                    seasons.append(to_append)
            else:
                seasons.append(to_append)

    # if we've played more than one seasons, get the first and last        
    if len(seasons) > 1:
        return seasons[0]+'-'+seasons[-1]
    else:
        return seasons[0]
    

    #first_year = first_row.find('th', data_stat_ = "season").text[-4:]
    #return first_year


print(get_years_active(page))

2016-2024


We retrieve the name, position, and shooting side of each player to add to our database of active players. We also combine this with their active years.

In [90]:
import re
def get_position_shoots(soup):
    
    info = soup.find(id="info")
    #print(info)
    values = info.find_all('p')
    #print(values)
    pattern_position = r"Position:\s([a-z|A-Z]*)"
    pattern_shoots = r"Shoots:\s([a-z|A-Z]*)"
    pattern_name = r"Name:\s([a-z|A-Z]*\s[a-z|A-Z]*)"

    info_dict = {
        "name" : "Not Found",
        "position" : "Not Found",
        "shoots" : "Not Found"
    }    

    for item in values:
        text = item.text
        #print(text)
        matches_name = re.findall(pattern_name, text)

        matches_position = re.findall(pattern_position, text)
        #print(matches_position)
        
        matches_shoots = re.findall(pattern_shoots, text)
        #print(matches_shoots)

        if len(matches_name) > 0:
            info_dict["name"] = matches_name[0]
        if len(matches_position) > 0:
            info_dict["position"] = matches_position[0]
        if len(matches_shoots) > 0:
            info_dict["shoots"] = matches_shoots[0]

    return info_dict

print(get_position_shoots(page))

{'name': 'Jake Allen', 'position': 'G', 'shoots': 'Not Found'}


# Putting it all together
We now have some functions to pull the name, active years, position and handedness. We're going to iterate through our existing dataframe and add these values.

In [78]:
import pandas as pd
import numpy as np
df = pd.read_csv('active_players.csv', sep='\t')

def create_columns(dataframe):
    dataframe['position'] = "Not Found"
    dataframe['shoots'] = "Not Found"
    dataframe['years_active'] = "Not Found"

create_columns(df)

print(df.dtypes)

    




name            object
link            object
position        object
shoots          object
years_active    object
dtype: object


In [97]:
import time as t

for row in df.itertuples(name='Pandas'):
    # Have to wait 6 seconds between calls to different webpages to not get banned, so we will go 7.5 seconds + random noise between calls
    start = t.time()
    url = row.link
    page = make_soup(get_page(url))
    active_years = get_years_active(page)
    info_dict = get_position_shoots(page)
    print(row.name, active_years, info_dict['position'], info_dict['shoots'])
    df.loc[df['link'] == row.link, ['years_active', 'shoots', 'position']] = np.array([active_years, info_dict['shoots'], info_dict['position']])
    #df.loc[df['name'] == row.name, 'shoots'] = info_dict['shoots']
    #df.loc[df['name'] == row.name, 'position'] = info_dict['position']
    end = t.time()
    t.sleep(max(0, (10)-(end-start))) # sleeps for the maximum of 0 or 5 - (time elapsed) if 0 > 5 - time elapsed this means time elapsed > 5 so we're good to go for another round

df.to_csv("active_players_additional.csv", sep="\t", encoding='utf-8', index=False)

Noel Acciari 2016-2024 C Right
Calen Addison 2021-2024 D Right
Egor  Afanasyev 2023-2024 LW Left
Sebastian Aho 2017-2024 F Left
Sebastian Aho 2018-2024 D Left
Nikita Alexandrov 2023-2024 C Left
Alexander Alexeyev 2022-2024 D Left
Jake Allen 2013-2024 G Not Found
Michael Amadio 2018-2024 C Right
Nils Ãman 2023-2024 C Left
Frederik Andersen 2014-2024 G Not Found
Joey Anderson 2019-2024 RW Right
Josh Anderson 2015-2024 RW Right
Michael Anderson 2020-2024 D Left
Jaret Anderson-Dolan 2019-2024 C Left
Rasmus Andersson 2017-2024 D Right
Emil Andrae 2024 D Left
Tyler Angle 2023-2024 C Left
Justus Annunen 2022-2024 G Not Found
Kenneth Appleby 2018-2024 G Not Found
Mason Appleton 2019-2024 C Right
Joel Armia 2015-2024 RW Right
Viktor Arvidsson 2015-2024 LW Right
Yaroslav Askarov 2023-2024 G Not Found
Zach Aston-Reese 2018-2024 C Left
Andreas Athanasiou 2016-2024 C Left
Cam Atkinson 2012-2024 RW Right
Ronald Attard 2022-2024 D Right
Nicolas AubÃ©-Kubel 2019-2024 RW Right
Mikael Backlund 2009-202

: 

In [92]:
print(df)



                    name                                               link  \
0           Noel Acciari  https://www.hockey-reference.com/players/a/acc...   
1          Calen Addison  https://www.hockey-reference.com/players/a/add...   
2        Egor  Afanasyev  https://www.hockey-reference.com/players/a/afa...   
3          Sebastian Aho  https://www.hockey-reference.com/players/a/aho...   
4          Sebastian Aho  https://www.hockey-reference.com/players/a/aho...   
...                  ...                                                ...   
1002  Parker Wotherspoon  https://www.hockey-reference.com/players/w/wot...   
1003        Shane Wright  https://www.hockey-reference.com/players/w/wri...   
1004     Kailer Yamamoto  https://www.hockey-reference.com/players/y/yam...   
1005       Jesse YlÃ¶nen  https://www.hockey-reference.com/players/y/ylo...   
1006        Cameron York  https://www.hockey-reference.com/players/y/yor...   

       position     shoots years_active  
0        