#### Author: Ernie Sumoso

In [88]:
import re
import bs4
import requests
import datetime
import pathlib

def download_pdfs(url_domain, url_path, url_sufix, save_path, year, timeout=30):
    """Download every PDF linked from a results page into ``save_path/<year>``.

    The page URL is ``url_domain + url_path``; when ``year`` is not the
    current calendar year, ``url_sufix + str(year)`` is appended to it.
    Every ``<a>`` tag whose text is exactly ``'PDF'`` and that carries an
    ``href`` is downloaded and stored as ``TD<year>_<n>.pdf`` (``n`` starts
    at 1 and follows the order of the links on the page).

    Parameters
    ----------
    url_domain : str
        Scheme and host of the site, e.g. ``"https://www.td.com"``.
    url_path : str
        Path of the results page, appended to ``url_domain``.
    url_sufix : str
        Suffix inserted before the year for pages of past years.
    save_path : str or os.PathLike
        Base directory; a ``<year>`` sub-directory is created inside it.
    year : int
        Year whose reports should be downloaded.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Raises
    ------
    requests.HTTPError
        If the page or any PDF responds with a 4xx/5xx status.
    requests.Timeout
        If a request does not answer within ``timeout`` seconds.
    """
    # Local import keeps this cell self-contained (urllib is stdlib).
    from urllib.parse import urljoin

    # Set the URL, which changes depending on the year
    url = url_domain + url_path
    if year != datetime.datetime.now().year:
        url += url_sufix + str(year)

    # Get the page; raise_for_status() is used instead of `assert` because
    # assertions are stripped when Python runs with -O.
    website_request = requests.get(url, timeout=timeout)
    website_request.raise_for_status()
    # An explicit parser avoids GuessedAtParserWarning and makes the result
    # independent of which optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(website_request.content, 'html.parser')

    # Keep only the anchors labelled "PDF" that actually have a link target
    pdf_tags = [a_tag for a_tag in soup.find_all('a', href=True) if a_tag.text == 'PDF']

    # Create the PDF saving directory if it doesn't exist. pathlib joins the
    # parts cleanly even when save_path ends with a slash.
    year_dir = pathlib.Path(save_path) / str(year)
    year_dir.mkdir(parents=True, exist_ok=True)
    print(f'Starting to download {len(pdf_tags)} PDFs from {year} ({url_domain})...')

    # Loop through the PDFs and save them into the save path
    for index, pdf_tag in enumerate(pdf_tags, start=1):

        # urljoin leaves absolute URLs untouched and resolves root-relative,
        # page-relative and protocol-relative hrefs against the page URL.
        url_pdf = urljoin(url, pdf_tag['href'])

        # Request the PDF file and save it; write_bytes opens and closes the
        # file itself, so no handle is leaked if the write fails.
        pdf_request = requests.get(url_pdf, timeout=timeout)
        pdf_request.raise_for_status()
        (year_dir / f'TD{year}_{index}.pdf').write_bytes(pdf_request.content)

    print(f'Download successful: {len(pdf_tags)} PDFs from {year} ({url_domain})')

In [89]:
# Site root and the quarterly-results page of TD Bank's investor relations section
DOMAIN = "https://www.td.com"
URL = (
    "/ca/en/about-td/for-investors/investor-relations"
    "/financial-information/financial-reports/quarterly-results"
)
# Appended (followed by the year) when requesting a past year's page
URL_SUFIX = "/quarterly-results-"
# Base folder for the downloads; a per-year sub-folder is created inside it
SAVE_PATH = 'TDBank/'

# Fetch all quarterly-report PDFs published for 1999
download_pdfs(
    url_domain=DOMAIN,
    url_path=URL,
    url_sufix=URL_SUFIX,
    save_path=SAVE_PATH,
    year=1999,
)

Starting to download 7 PDFs from 1999 (https://www.td.com)...
Download successful: 7 PDFs from 1999 (https://www.td.com)
