In [1]:
import time
import pandas as pd
from datetime import datetime

def _extract_card_dates(card_lines, lines_per_card=8, date_offset=5):
    """Return the date of every search-result card as a ``YYYY-MM-DD`` string.

    ``card_lines`` is the text of the results list split on newlines. The
    scraper assumes each card occupies ``lines_per_card`` consecutive lines
    with the date (formatted like ``"30 January 1945"``) at ``date_offset``
    within the card. Trailing lines that do not make up a card long enough to
    contain a date are skipped instead of raising ``IndexError``.

    Raises:
        ValueError: if a line at the date position is not a ``%d %B %Y`` date.
    """
    raw_dates = [
        card_lines[card_start + date_offset]
        for card_start in range(0, len(card_lines), lines_per_card)
        if card_start + date_offset < len(card_lines)
    ]
    return [
        datetime.strptime(raw, "%d %B %Y").strftime("%Y-%m-%d")
        for raw in raw_dates
    ]


def find_pmqs(start_date, end_date, term, print_url=True):
    """Search Hansard (Commons debates) for ``term`` and scrape the result cards.

    Args:
        start_date: Lower bound of the search; interpolated into the URL with
            ``%s`` (i.e. via ``str()``).
        end_date: Upper bound of the search; interpolated the same way.
        term: Raw (not pre-encoded) search text. It is percent-encoded before
            being placed in the query string.
        print_url: When true, print the search URL before requesting it.

    Returns:
        A ``(dates, urls)`` tuple. ``dates`` is a list of ``YYYY-MM-DD``
        strings and ``urls`` a list of the cards' ``href`` values. Both are
        ``None`` when the page contains no ``card-list`` element.

    Raises:
        ValueError: if a scraped date does not match the expected format.
        Selenium/driver exceptions other than a missing ``card-list`` propagate
        to the caller; the browser is closed in every case.
    """
    # Scraping dependencies are imported locally so that importing/defining
    # this function does not require them. win32api comes from pywin32, which
    # makes this function Windows-only.
    from urllib.parse import quote

    import undetected_chromedriver as uc
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By
    from win32api import HIWORD, LOWORD, GetFileVersionInfo

    # The installed major version of Chrome is needed to launch a matching
    # chromedriver.
    def get_chrome_main_version():
        filename = uc.find_chrome_executable()
        info = GetFileVersionInfo(filename, "\\")
        version = (
            HIWORD(info["FileVersionMS"]),
            LOWORD(info["FileVersionMS"]),
            HIWORD(info["FileVersionLS"]),
            LOWORD(info["FileVersionLS"]),
        )
        print(f"Chrome version={'.'.join(map(str, version))}")
        return version[0]
    version_main = get_chrome_main_version()

    # Build the search URL. quote() escapes spaces as %20 (as before) and also
    # escapes characters such as "&" or "#" that would otherwise corrupt the
    # query string.
    root_url = "https://hansard.parliament.uk/search/Debates?startDate=%s&endDate=%s&searchTerm=%s&house=Commons&partial=False"
    search_term = quote(term, safe="")
    url = root_url % (start_date, end_date, search_term)
    if print_url:
        print(url)

    options = webdriver.ChromeOptions()
    browser = uc.Chrome(version_main=version_main, options=options)
    try:
        browser.get(url)

        # Fixed wait for the page to finish loading.
        time.sleep(5)

        # A missing "card-list" element means the search returned nothing.
        try:
            card_list = browser.find_element(By.CLASS_NAME, "card-list")
        except NoSuchElementException:
            print("No search results for selected time period")
            return None, None

        dates_res = card_list.text.split("\n")
        urls_res = [
            card.get_property("href")
            for card in card_list.find_elements(By.CLASS_NAME, "card-calendar")
        ]
        if len(urls_res) == 1:
            print(f"{len(urls_res)} search result for selected time period")
        else:
            print(f"{len(urls_res)} search results for selected time period")
    finally:
        # Always release the Chrome process, even when the request or the
        # scraping above raised.
        browser.quit()

    temp_pmqs_dates = _extract_card_dates(dates_res)

    return temp_pmqs_dates, urls_res

In [2]:
from dateutil.relativedelta import *
from datetime import timedelta

# The crawl stops once a window would start after "now".
date_ceiling = datetime.now()

# First search window: 30 calendar days (start plus 29 days).
startdate = datetime.strptime("1945-01-01","%Y-%m-%d")
enddate = startdate + timedelta(days=29)

# Accumulators for everything scraped across all windows.
pmqs_dates = []
pmqs_urls = []

# Running totals used only for the progress / ETA line.
tot_run_time = timedelta()
n = 0

while startdate <= date_ceiling:

    go_time = datetime.now()

    print(f"Start: {startdate.strftime('%d-%m-%Y')}; End: {enddate.strftime('%d-%m-%Y')}")

    # find_pmqs returns (None, None) when a window has no results.
    date_res,url_res = find_pmqs(startdate, enddate, "Prime Minister", print_url = False)
    if date_res:
        pmqs_dates.extend(date_res)
    if url_res:
        pmqs_urls.extend(url_res)

    # Slide the window forward by 30 days; consecutive windows do not overlap.
    startdate = startdate + timedelta(days=30)
    enddate = enddate + timedelta(days=30)

    run_time = datetime.now()-go_time
    tot_run_time += run_time
    n += 1
    avg_run_time = tot_run_time/n

    # "Months" here are 30-day windows. After the final window startdate has
    # moved past date_ceiling, so clamp at 0 rather than reporting a negative
    # count and a negative time estimate.
    months_left_to_run = max(((date_ceiling-startdate)/30).days, 0)
    estimated_runtime = avg_run_time*months_left_to_run
    print(f"{months_left_to_run} months left to run, estimated runtime remaining: {estimated_runtime}")

Start: 01-01-1945; End: 30-01-1945
Chrome version=101.0.4951.67
1 search result for selected time period
941 months left to run, estimated runtime remaining: 2:48:56.509401
Start: 31-01-1945; End: 01-03-1945
Chrome version=101.0.4951.67
1 search result for selected time period
940 months left to run, estimated runtime remaining: 2:44:49.574560
Start: 02-03-1945; End: 31-03-1945
Chrome version=101.0.4951.67
No search results for selected time period
939 months left to run, estimated runtime remaining: 3:08:39.669414
Start: 01-04-1945; End: 30-04-1945
Chrome version=101.0.4951.67
No search results for selected time period
938 months left to run, estimated runtime remaining: 3:02:14.258496
Start: 01-05-1945; End: 30-05-1945
Chrome version=101.0.4951.67
No search results for selected time period
937 months left to run, estimated runtime remaining: 2:57:30.465783
Start: 31-05-1945; End: 29-06-1945
Chrome version=101.0.4951.67
No search results for selected time period
936 months left to run

In [3]:
# Turn the scraped dates into a named Series, drop repeated dates, and write
# the result to pmqs_dates.csv.
pmqs_dates = pd.Series(pmqs_dates, name='date').drop_duplicates()
pmqs_dates.to_csv('pmqs_dates.csv')