This code is similar to the work here:
https://github.com/chord-analytics/nhl-goalie-pull-optimization/blob/master/notebooks/src/1_html_download.ipynb

In [1]:
# Display-only cell: injects a <style> rule that sets font-size to 130% for
# elements matching `div.text_cell_render`. It does not affect the scrape below.
from IPython.display import HTML
HTML('<style>div.text_cell_render{font-size:130%;}</style>')

In [1]:
import datetime
import glob
import os
import re
import time
from collections import OrderedDict
from typing import Optional

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from colorama import Fore, Style
from tqdm import tqdm_notebook

In [3]:
def random_wait(mu) -> float:
    '''
    Draw a random duration whose expected value is ``mu``.

    A Beta(3, 3) sample lies in [0, 1] and is symmetric around 0.5, so
    scaling it by ``2 * mu`` yields a value between 0 and ``2 * mu`` that
    averages ``mu``.
    '''
    unit_draw = np.random.beta(3, 3)
    return unit_draw * mu * 2

def init_sess(sess=None):
    '''
    Return a brand-new ``requests.Session``.

    If an existing session is passed in, it is closed first so that it is
    not left open once the caller switches to the replacement.
    '''
    previous = sess
    if previous is not None:
        previous.close()
    return requests.Session()

def get_page(sess, url, tries=0, timeout=30) -> Optional[str]:
    '''
    Fetch ``url`` through ``sess`` and return the response body as text.

    A 200 response returns its text. A 404 is treated as "page does not
    exist" and returns None immediately. Any other status code, or any
    exception raised by the request, triggers a 10 second sleep, a fresh
    session from ``init_sess`` and another attempt.

    Parameters
    ----------
    sess : session object exposing ``get(url, timeout=...)``
        Used for the first attempt. Note that ``init_sess`` closes it when a
        retry is needed.
    url : str
        Address to request.
    tries : int
        Number of attempts already used. Attempts are made while
        ``tries <= 3``, i.e. up to four in total when starting from 0.
    timeout : float
        Seconds passed to ``sess.get``. Without it a stalled connection
        would block the whole scrape forever.

    Returns
    -------
    str or None
        The page text, or None on a 404 or once the attempts are used up.
    '''
    max_tries = 3
    caller_sess = sess
    try:
        while tries <= max_tries:
            try:
                print(f'Requesting HTML for URL = {url}')
                page = sess.get(url, timeout=timeout)
                print(f'Got {page.status_code} status code')

                if page.status_code == 200:
                    return page.text

                if page.status_code == 404:
                    print('Bad status code, returning no page')
                    return None

                print('Bad status code, waiting 10 seconds and trying again')

            except Exception as e:
                # Deliberately broad: any request failure (including a
                # timeout) is treated as transient and retried.
                print(f'Exception: {str(e)}')
                print('Sleeping, then trying again...')

            time.sleep(10)
            sess = init_sess(sess)
            tries += 1

        print(f'Scrape failed at URL = {url}')
        return None
    finally:
        # Sessions created here for retries are invisible to the caller, so
        # they must be released here or they would never be closed.
        if sess is not caller_sess:
            sess.close()
        
        
def download_game_range(
    url_template,
    seasons,
    games,
    no_break=False,
    root_data_path='../../data/raw/html',
    request_delay=3
    ) -> None:
    '''
    Download one HTML page per (season, game number) and save it to disk.

    Each page is written to ``<root_data_path>/<season>/<game_num>.html``.
    A randomised pause precedes every request, and a 10 minute pause
    separates consecutive seasons.

    Parameters
    ----------
    url_template : str
        Format string with two positional fields, filled as
        ``url_template.format(season, game_num)``.
    seasons : sequence of str
        Season identifiers; each one is also used as a sub-directory name.
    games : iterable of int
        Game numbers requested, in order, for every season.
    no_break : bool
        Controls what happens when ``get_page`` returns None. False
        (default) stops the current season at that game; True skips the
        game and carries on with the next one.
    root_data_path : str
        Directory under which the per-season folders are created.
    request_delay : float
        Mean number of seconds to wait before each request; the actual
        wait is drawn by ``random_wait``.
    '''
    if not os.path.exists(root_data_path):
        os.makedirs(root_data_path, exist_ok=True)
        print(f'Making dirs {root_data_path}')

    print(f'Starting data pull at {datetime.datetime.now()}')

    last_idx = len(seasons) - 1
    sess = init_sess()
    try:
        for idx, season in enumerate(seasons):
            data_path = os.path.join(root_data_path, season)
            if not os.path.exists(data_path):
                print(f'Making dirs {data_path}')
                os.makedirs(data_path, exist_ok=True)

            for game_num in games:
                time.sleep(random_wait(request_delay))

                page_text = get_page(
                    sess,
                    url_template.format(season, game_num)
                )
                if page_text is None:
                    if no_break:
                        print('Bad response, trying next page')
                        continue
                    print(f'Season = {season}')
                    print(f'Max game = {game_num - 1}')
                    break

                f_name = os.path.join(data_path, f'{game_num}.html')
                print(f'Writing HTML to file {f_name}')
                # Explicit UTF-8: the platform default (e.g. cp1252 on
                # Windows) raises UnicodeEncodeError on characters it
                # cannot represent and makes the files non-portable.
                with open(f_name, 'w', encoding='utf-8') as f:
                    f.write(page_text)

            print(f'Done season {season}')
            # Compare by position, not value, so a repeated season string
            # cannot suppress the pause early.
            if idx != last_idx:
                print('Waiting 10 minutes...')
                time.sleep(10*60)

        print(f'Ending data pull at {datetime.datetime.now()}')
    finally:
        # Release the session even if a request or a write fails.
        sess.close()

In [10]:
%%time
# NOTE: `url_tempalte` is a misspelling of "url_template". The name is left
# unchanged here because it is a notebook-global variable.
# The first placeholder receives the season string; the second receives the
# game number, zero-padded to four digits by `{:04d}`.
url_tempalte = 'http://www.nhl.com/scores/htmlreports/{:}/PL02{:04d}.HTM'
seasons = ['20212022']
# Game numbers 1..4999. Presumably an intentional overshoot: with `no_break`
# left at its default, download_game_range stops the season at the first game
# for which get_page returns None (e.g. a 404).
games = list(range(1, 5000))

download_game_range(url_tempalte, seasons, games)

Starting data pull at 2021-12-31 01:13:22.194419
Making dirs ../../data/raw/html\20212022
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020001.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\1.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020002.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\2.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020003.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\3.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020004.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\4.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020005.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\5.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020006.HTM
Got 200 s

Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\50.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020051.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\51.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020052.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\52.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020053.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\53.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020054.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\54.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020055.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\55.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020056.HTM
Got 200 status c

Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020101.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\101.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020102.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\102.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020103.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\103.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020104.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\104.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020105.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\105.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020106.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\106.html
Requesting

Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020151.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\151.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020152.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\152.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020153.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\153.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020154.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\154.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020155.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\155.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020156.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\156.html
Requesting

Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020201.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\201.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020202.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\202.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020203.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\203.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020204.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\204.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020205.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\205.html
Requesting HTML for URL = http://www.nhl.com/scores/htmlreports/20212022/PL020206.HTM
Got 200 status code
Writing HTML to file ../../data/raw/html\20212022\206.html
Requesting