In [1]:
import json
import os
import urllib.request
import sys
from tqdm import tqdm

from urllib.error import HTTPError

In [2]:
def download_game_data(dir_path, game_id) -> None:
    """
    Download data of a specific game into a particular dir path
    :param dir_path: Path to the dir
    :param game_id: Game id of the game that we want to download
    :return: none
    """
    file_path = os.path.join(dir_path, game_id+".json")
    
    # Return if file path already exists
    if(os.path.exists(file_path)):
            return
        
    try:
        with urllib.request.urlopen("https://statsapi.web.nhl.com/api/v1/game/" + game_id + "/feed/live/") as url:
            data = json.load(url)
            if "messageNumber" in data and "message" in data \
                and data["messageNumber"] == 2 and data["message"] == "Game data couldn't be found":
                pass
            else:
                with open(file_path, 'w') as outfile:
                    json.dump(data, outfile)
    except HTTPError as he:
        print(game_id)
        print(he.reason)
    except Exception:
        print('game_id: '+str(game_id))
        e_type, e_value, e_traceback = sys.exc_info()
        print(e_value)

In [3]:
def download_raw_data(target_year, dir_path):
    """
    Download data of all games in a specific year
    :param target_year: The year that we want to get data
    :param dir_path: Path to the directory we want to store data (not including year) 
    :return: none
    """
    # Common variables and processing
    available_years = ['2016', '2017', '2018', '2019', '2020']
    
    if(target_year not in available_years):
        print("Dataset does not contain the entered year")
        return
        
    # Declare dir path
    regular_dir_path = os.path.join(dir_path, target_year, 'regular_games')
    playoff_dir_path = os.path.join(dir_path, target_year, 'playoff_games')
    
    # Create dir if it does not exist
    if not os.path.exists(regular_dir_path):
        os.makedirs(regular_dir_path)
    if not os.path.exists(playoff_dir_path):
        os.makedirs(playoff_dir_path)
    
    # Download data of regular games
    print("Starting download data for regular games of season "+target_year)
    
    # Season 2016 has 1230 games, while the rest have 1271
    ID_range = 1231 if (target_year=='2016') else 1271
    
    for ID in tqdm(range(1, ID_range)):
        # Convert ID from integer to string
        ID_str =  "0"*(4 - len(str(ID))) + str(ID)
        regular_game_id = target_year+"02"+ID_str
        
        # Download data of each game
        download_game_data(regular_dir_path, regular_game_id)
    
    # Download data of playoff games
    print("Starting download data for playoff games of season "+target_year)
    
    # There are 4 rounds in total
    for round_number in tqdm(range(1, 5)):
        # round 1 has 8 matchups, round 2 has 4 matchups and so on
        number_of_matchups = int(2**(3-round_number))
        for matchup_number in range(1, number_of_matchups+1):
            # Each match up has 7 games in total
            for game_number in range(1, 8):
                playoff_game_id = target_year+"030"+str(round_number)+str(matchup_number)+str(game_number)
                download_game_data(playoff_dir_path, playoff_game_id)

In [4]:
download_raw_data('2018', "raw_data")

Starting download data for regular games of season 2018


100%|██████████████████████████████████████████████████████████████████████████████| 1270/1270 [06:23<00:00,  3.31it/s]


Starting download data for playoff games of season 2018


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:12<00:00,  3.16s/it]
