In [29]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# standard beautiful soup calls
def get_page(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the ``requests`` response.

    url     -- address of the page to fetch.
    timeout -- seconds to wait for the server, passed straight to
               ``requests.get``.  Without it a stalled connection would block
               the whole scrape indefinitely.

    The response is returned as-is; its status code is not checked here.
    ``requests`` raises its own timeout exception when the limit is exceeded.
    """
    return requests.get(url, timeout=timeout)
def make_soup(page):
    """Parse a downloaded page into a ``BeautifulSoup`` tree.

    The raw bytes (``page.content``) are handed to BeautifulSoup rather than
    ``page.text``.  ``page.text`` is decoded with the charset guessed from the
    HTTP headers, which can turn UTF-8 text such as accented player names into
    mojibake; given bytes, BeautifulSoup works out the encoding from the
    document itself.  Objects that only offer ``.text`` are still accepted.
    """
    markup = getattr(page, "content", None)
    if markup is None:
        # duck-typed page without raw bytes: keep the original behaviour
        markup = page.text
    return BeautifulSoup(markup, 'html')

# Sample player page URL for trying get_page / make_soup by hand; nothing in
# the visible notebook code reads this variable.
test_url = "https://www.hockey-reference.com/players/a/acciano01.html"

# Generating links
We want to generate gamelog links from each player's existing link combined with the years they played, so that we can initiate the large scrape. It is on the order of ~6000 scrapes, so it will take about 5 hours to execute.

In [30]:
# Load active_players_additional.csv (tab-separated). The cells below read its
# `name`, `link`, `years_active` and `position` columns to generate every possible gamelog link.
df = pd.read_csv('active_players_additional.csv', sep='\t')


In [37]:
# Pull the fields of the player at row label 0 so the link generator below can
# be tried on a single example.
years_active = df.at[0, "years_active"]
link = df.at[0, "link"]
name = df.at[0, "name"]

def generate_gamelog_links(link, active_years):
    """Build one gamelog URL for every year in a player's active range.

    link         -- player page URL, e.g. ".../players/a/acciano01.html".
                    A trailing ".html" is removed (only if present) before
                    "/gamelog/<year>" is appended.
    active_years -- a single year ("2017" or the integer 2017) or an
                    inclusive range written "start-end" ("2016-2019").
                    Surrounding whitespace is ignored.

    Returns a list of URLs, one per year from start to end inclusive.  Some of
    them may not correspond to a season actually played (there could be gaps
    in between).  A reversed range yields an empty list.

    Raises ValueError when active_years is neither a year nor a range.
    """
    # str() lets integer years (e.g. from a numeric CSV column) through
    years_text = str(active_years).strip()

    # first we have to disambiguate the active years: range or single season
    span = re.fullmatch(r"([0-9]+)-([0-9]+)", years_text)
    if span:
        start, end = int(span.group(1)), int(span.group(2))
    else:
        start = end = int(years_text)

    # only drop the extension when it is really there, instead of blindly
    # cutting the last five characters off the link
    suffix = ".html"
    base = link[:-len(suffix)] if link.endswith(suffix) else link

    # then we generate all the potential links
    return [base + "/gamelog/" + str(year) for year in range(start, end + 1)]

# Example run: expand the first player's link into one gamelog URL per year of their active range.
game_log_links = generate_gamelog_links(link, years_active)


In [32]:
# Here we scrape one specific gamelog page. We keep as much of the information as we can so it can potentially be used later.

# The information listed for each game is:
#   Game #, Date, Age, Team, Opposition, Win/Loss, Goals, Assists, Points, +/-, Penalty Minutes, Even Strength Goals, Powerplay Goals, Shorthanded Goals, Game Winning Goals, Even Strength Assists, Powerplay Assists, Shorthanded Assists, Shots, Shot Percentage, # of shifts, Time on Ice, Hits, Blocks, Face off wins, Face off Losses, Face off %
# NOTE(review): scrape_log below has no column for Game # or Blocks, so those two are not stored.

# Sample gamelog URL for trying scrape_log by hand.
test_link = "https://www.hockey-reference.com/players/a/acciano01/gamelog/2017"

# (key in game_log_dict, "data-stat" attribute of the matching table cell).
# The first entry (the date) defines how many game rows a page has.
GAMELOG_COLUMNS = (
    # identifiers and team outcome
    ("date", "date_game"),
    ("age", "age"),
    ("team", "team_id"),
    ("opposition", "opp_id"),
    ("game_result", "game_result"),
    # all situations results
    ("all_goals", "goals"),
    ("all_assists", "assists"),
    ("all_points", "points"),
    ("plus_minus", "plus_minus"),
    ("pims", "pen_min"),
    # goals in detail
    ("evg", "goals_ev"),
    ("ppg", "goals_pp"),
    ("shg", "goals_sh"),
    ("gwg", "goals_gw"),
    # assists in detail
    ("eva", "assists_ev"),
    ("ppa", "assists_pp"),
    ("sha", "assists_sh"),
    # shot data
    ("shots", "shots"),
    ("shot_pct", "shot_pct"),
    # playing time data
    ("num_shifts", "shifts"),
    ("toi", "time_on_ice"),
    ("hits", "hits_all"),
    # Faceoffs
    ("FO_win", "faceoff_wins_all"),
    ("FO_Loss", "faceoff_losses_all"),
    ("FO_pct", "faceoff_percentage_all"),
)


def scrape_log(link, game_log_dict):
    """Scrape one gamelog page (one year of data) into ``game_log_dict``.

    link          -- URL of the gamelog page to download.
    game_log_dict -- dict holding one list per key named in GAMELOG_COLUMNS;
                     the text of every game's cell is appended to those lists.

    Nothing is returned.  When the page does not contain exactly one
    ``<table id="gamelog">`` the dictionary is left unchanged.

    All values are collected before the dictionary is touched, so its lists
    always grow by the same amount:
      * a stat with no matching cells at all is stored as "" for every game;
      * a stat with some cells, but fewer than there are dates, raises
        IndexError and leaves the dictionary unmodified;
      * a key missing from the dictionary raises KeyError, also before any
        list is modified.
    """
    # generate the soup
    soup = make_soup(get_page(link))
    tables = soup.find_all('table', id="gamelog")
    if len(tables) != 1:
        return
    log_table = tables[0]

    # Collect the text of every column first.  The aria-label=None filter only
    # keeps elements without that attribute (presumably to skip header cells
    # -- TODO confirm against the page markup).
    columns = []
    for _, stat in GAMELOG_COLUMNS:
        cells = log_table.find_all(attrs={"data-stat": stat, "aria-label": None})
        columns.append([cell.text for cell in cells])

    n_games = len(columns[0])
    for position, (key, stat) in enumerate(GAMELOG_COLUMNS):
        values = columns[position]
        if not values:
            # stat absent from this page: keep the row count with blanks
            columns[position] = [""] * n_games
        elif len(values) < n_games:
            raise IndexError(
                "gamelog column %r (data-stat=%r) has %d cells for %d games at %s"
                % (key, stat, len(values), n_games, link)
            )

    # Look every destination list up before modifying any of them, so a
    # missing key cannot leave the lists with different lengths.
    destinations = [game_log_dict[key] for key, _ in GAMELOG_COLUMNS]
    for destination, values in zip(destinations, columns):
        # cells beyond the number of dates are ignored
        destination.extend(values[:n_games])

# Empty accumulator in the shape scrape_log expects: one fresh list per column.
init_dictionary = {
    column: []
    for column in (
        # identifiers and team outcome
        "date", "age", "team", "opposition", "game_result",
        # all situations results
        "all_goals", "all_assists", "all_points", "plus_minus", "pims",
        # goals in detail
        "evg", "ppg", "shg", "gwg",
        # assists in detail
        "eva", "ppa", "sha",
        # shot data
        "shots", "shot_pct",
        # playing time data
        "num_shifts", "toi", "hits",
        # faceoffs
        "FO_win", "FO_Loss", "FO_pct",
    )
}
# Manual smoke test (disabled):
#scrape_log(test_link, init_dictionary)
#print(init_dictionary)

In [33]:
# Now we put together both to scrape one players entire game logs
import time as t


# This function scrapes and stores one player's information
def scrape_logs_and_store(df_row, number, seconds_per_request=5, output_dir="data"):
    """Scrape every gamelog of one player and write them to a single CSV.

    df_row              -- row object exposing ``name``, ``link`` and
                           ``years_active`` attributes (e.g. a tuple from
                           ``DataFrame.itertuples``).
    number              -- suffix appended to the player name in the file
                           name, so players sharing a name get separate files.
    seconds_per_request -- minimum number of seconds spent on each gamelog
                           link (scrape time plus sleep); also used for the
                           printed time estimate.
    output_dir          -- directory the CSV is written to; created if needed.

    Writes ``<output_dir>/<name><number>_gamelogs.csv`` and returns nothing.
    """
    import os  # local import: the notebook only imports os in a later cell

    name = df_row.name
    main_link = df_row.link
    years_active = df_row.years_active
    # get the player links
    player_links = generate_gamelog_links(main_link, years_active)
    print("Time estimate to process", df_row.name, "is", seconds_per_request*len(player_links), "seconds, as they have", len(player_links), "logs.")

    # Create the destination before scraping: otherwise a missing directory
    # would only surface in to_csv, after all the requests were already made.
    os.makedirs(output_dir, exist_ok=True)

    # fresh accumulator for this player, one list per gamelog column
    game_logs = {
        column: []
        for column in (
            "date", "age", "team", "opposition", "game_result",
            "all_goals", "all_assists", "all_points", "plus_minus", "pims",
            "evg", "ppg", "shg", "gwg",
            "eva", "ppa", "sha",
            "shots", "shot_pct",
            "num_shifts", "toi", "hits",
            "FO_win", "FO_Loss", "FO_pct",
        )
    }

    # scrape every link
    for link in player_links:
        start = t.time()
        scrape_log(link, game_logs)
        elapsed = t.time() - start
        # wait out the remainder of this link's time budget
        t.sleep(max(0, seconds_per_request - elapsed))

    player_df = pd.DataFrame.from_dict(game_logs)
    player_df.to_csv(os.path.join(output_dir, name + str(number) + "_gamelogs.csv"), encoding="utf8")



    

In [38]:
import os

# How many skaters with each name have been seen so far; the running count is
# the numeric suffix that keeps same-named players in separate files.
name_dictionary = {}
# When True, a player whose gamelog file already exists is not scraped again.
SKIP_OVERWRITE = True
for row in df.itertuples(name='Pandas'):
    # goalies are left out of the scrape
    if row.position == "G":
        continue

    name_dictionary[row.name] = name_dictionary.get(row.name, 0) + 1
    occurrence = name_dictionary[row.name]

    # the file check is only made when overwriting is disabled
    already_saved = SKIP_OVERWRITE and os.path.isfile("data/"+row.name+str(occurrence)+"_gamelogs.csv")
    if already_saved:
        print("Skipped", row.name, "as his file already exists, and program is set to not overwrite.")
    else:
        scrape_logs_and_store(row, occurrence)

Skipped Noel Acciari as his file already exists, and program is set to not overwrite.
Skipped Calen Addison as his file already exists, and program is set to not overwrite.
Skipped Egor  Afanasyev as his file already exists, and program is set to not overwrite.
Skipped Sebastian Aho as his file already exists, and program is set to not overwrite.
Skipped Sebastian Aho as his file already exists, and program is set to not overwrite.
Skipped Nikita Alexandrov as his file already exists, and program is set to not overwrite.
Skipped Alexander Alexeyev as his file already exists, and program is set to not overwrite.
Skipped Michael Amadio as his file already exists, and program is set to not overwrite.
Skipped Nils Ãman as his file already exists, and program is set to not overwrite.
Skipped Joey Anderson as his file already exists, and program is set to not overwrite.
Skipped Josh Anderson as his file already exists, and program is set to not overwrite.
Skipped Michael Anderson as his fil