In [12]:
import time
from datetime import datetime

def find_pmqs_dates(start_date, end_date, term, print_url=True):
    
    # Load Selenium, module for browsing
    import undetected_chromedriver as uc
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By
    
    # Prepare url with search term, start and end dates
    root_url = "https://hansard.parliament.uk/search/Debates?startDate=%s&endDate=%s&searchTerm=%s&house=Commons&partial=False"
    search_term = term.replace(" ","%20")
    url = root_url % (start_date, end_date, search_term)
    if print_url:
        print(url)
    
    # Make request to Hansard url with Selenium
    options = webdriver.ChromeOptions() 
    options.add_argument("start-maximized")
    browser = uc.Chrome(options=options)
    #browser = webdriver.Firefox()
    browser.get(url)
    
    # Wait for page to load
    time.sleep(5)
    
    # Save search results to a list
    try:
        search_res = browser.find_element(By.CLASS_NAME, "card-list").text.split("\n")
#         search_res = browser.find_element_by_class_name("card-list").text.split("\n")
    except NoSuchElementException:
        print("No search results for selected time period")
        search_res = None
        temp_pmqs_dates = None
    browser.quit()
    
    if search_res: 
        temp_pmqs_dates = []
        for i in range(0,len(search_res),8):
            temp_pmqs_dates.append(search_res[i+5])

        temp_pmqs_dates = [datetime.strftime(datetime.strptime(x,"%d %B %Y"),"%Y-%m-%d") for x in pmqs_dates]
    
    return temp_pmqs_dates

In [13]:
def get_target_hansard_urls(start_date, end_date, term, print_url=True):
    # Load packages
    import undetected_chromedriver as uc
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By
    
    # Construct url
    root_url = "https://hansard.parliament.uk/search/Debates?startDate=%s&endDate=%s&searchTerm=%s&house=Commons&partial=False"
    search_term = term.replace(" ","%20")
    url = root_url % (start_date, end_date, search_term)
    if print_url:
        print(url)
    
    # Go to url through Chrome browser
    options = webdriver.ChromeOptions() 
    options.add_argument("start-maximized")
    browser = uc.Chrome(options=options)
    browser.get(url)

    # Let url load completely
    time.sleep(5)
    
    # From the webpage, look at the list of results, then look at each of the tiles on the list
    # For each tile, get the associated link to the Hansard extract
    try:
        search_res = [x.get_property("href") for x in browser.find_element(By.CLASS_NAME, "card-list").find_elements(By.CLASS_NAME, "card-calendar")]
#         search_res = [x.get_property("href") for x in browser.find_element_by_class_name("card-list").find_elements_by_class_name("card-calendar")]
    except NoSuchElementException:
        print("No search results for selected time period")
        search_res = None
    browser.quit()

    return search_res

In [14]:
from dateutil.relativedelta import *

startdate = datetime.strptime("1945-01-01","%Y-%m-%d")
enddate = startdate + relativedelta(days=+29)

pmqs_dates = list()
pmqs_urls = list()

while startdate <= datetime.strptime("1946-01-01","%Y-%m-%d"): #datetime.today():
    print(f"Start: {datetime.strftime(startdate,'%d-%m-%Y')}; End: {datetime.strftime(enddate,'%d-%m-%Y')}")
    dates_res = find_pmqs_dates(startdate, enddate, "Prime Minister", print_url = False)
    if dates_res:
        pmqs_dates.extend(dates_res)
    urls_res = get_target_hansard_urls(startdate, enddate, "Prime Minister", print_url = False)
    if urls_res:
        pmqs_urls.extend(urls_res)
    startdate = startdate + relativedelta(days=+30)
    enddate = enddate + relativedelta(days=+30)

Start: 01-01-1945; End: 30-01-1945
Start: 31-01-1945; End: 01-03-1945
Start: 02-03-1945; End: 31-03-1945
No search results for selected time period
No search results for selected time period
Start: 01-04-1945; End: 30-04-1945
No search results for selected time period
No search results for selected time period
Start: 01-05-1945; End: 30-05-1945
No search results for selected time period
No search results for selected time period
Start: 31-05-1945; End: 29-06-1945


KeyboardInterrupt: 

To do:
 * Combine date and url grabbing functions to widen the browser launching bottleneck
 * Return number of results for successful searches
 * Implement estimated time left

In [16]:
print(pmqs_dates)
print(pmqs_urls)

[]
['https://hansard.parliament.uk/Commons/1945-01-17/debates/e55ef3c6-ed8c-4ad5-b867-b86ceda9aad3/RegentAndPrimeMinister(Powers)', 'https://hansard.parliament.uk/Commons/1945-01-31/debates/d79f6b08-dd71-4438-9e24-5d0fa02a5505/GeneralFranco(LetterToPrimeMinister)']
