In [1]:
import time
import pandas as pd
from datetime import datetime

def _as_iso_date(value):
    """Return ``value`` formatted as ``YYYY-MM-DD`` for use in the search URL.

    Objects exposing ``strftime`` (``date``, ``datetime``, ...) are formatted
    without a time component, so the URL never contains a raw space such as
    ``1979-05-04 00:00:00``. Anything else is passed through ``str`` unchanged.
    """
    if hasattr(value, "strftime"):
        return value.strftime("%Y-%m-%d")
    return str(value)


def _parse_card_dates(lines, stride=8, offset=5):
    """Extract one date per result card and normalise it to ``YYYY-MM-DD``.

    The text of the result list is read in groups of ``stride`` lines, and
    the line at position ``offset`` inside each group is parsed as a
    ``"%d %B %Y"`` date. Groups too short to contain that line are skipped
    instead of raising ``IndexError``.
    NOTE(review): the 8-line / 6th-line layout is an assumption about how the
    page renders each card - confirm against the live page if parsing fails.
    """
    raw_dates = [
        lines[i + offset]
        for i in range(0, len(lines), stride)
        if i + offset < len(lines)
    ]
    return [
        datetime.strptime(raw, "%d %B %Y").strftime("%Y-%m-%d")
        for raw in raw_dates
    ]


def find_pmqs(start_date, end_date, term, print_url=True):
    """Search Commons debates on Hansard and return the matching dates and links.

    Parameters
    ----------
    start_date, end_date : str or date-like
        Bounds of the search window. Strings are used as given; objects with
        ``strftime`` are rendered as ``YYYY-MM-DD``.
    term : str
        Plain-text search term; it is percent-encoded before being placed in
        the URL.
    print_url : bool, default True
        Print the request URL before loading it.

    Returns
    -------
    tuple
        ``(dates, urls)``. ``dates`` is a list of ``YYYY-MM-DD`` strings and
        ``urls`` a list of ``href`` values read from the result cards. Both
        are ``None`` when the page has no ``card-list`` element.
    """
    from urllib.parse import quote

    # Load Selenium, module for browsing
    import undetected_chromedriver as uc
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    # Prepare url with search term, start and end dates
    root_url = "https://hansard.parliament.uk/search/Debates?startDate=%s&endDate=%s&searchTerm=%s&house=Commons&partial=False"
    # safe="" so that "/" and "&" inside the term cannot break the query string
    search_term = quote(term, safe="")
    url = root_url % (_as_iso_date(start_date), _as_iso_date(end_date), search_term)
    if print_url:
        print(url)

    # Make request to Hansard url with Selenium
    options = webdriver.ChromeOptions()
    browser = uc.Chrome(options=options)
    try:
        browser.get(url)

        # Wait for page to load
        time.sleep(5)

        # Locate the result list once; both the dates and the links come from it
        try:
            card_list = browser.find_element(By.CLASS_NAME, "card-list")
        except NoSuchElementException:
            print("No search results for selected time period")
            return None, None

        dates_res = card_list.text.split("\n")
        urls_res = [
            card.get_property("href")
            for card in card_list.find_elements(By.CLASS_NAME, "card-calendar")
        ]
    finally:
        # Always close the browser, even if loading or scraping raised
        browser.quit()

    if len(urls_res) == 1:
        print(f"{len(urls_res)} search result for selected time period")
    else:
        print(f"{len(urls_res)} search results for selected time period")

    return _parse_card_dates(dates_res), urls_res

In [2]:
from dateutil.relativedelta import *
from datetime import timedelta

# Walk from the first search date up to "now" in consecutive 30-day windows,
# collecting the dates and links returned by find_pmqs for each window.

# Each window ends 29 days after it starts and the next one starts 30 days
# later, so consecutive windows never share a day.
WINDOW_SPAN = timedelta(days=29)
WINDOW_STEP = timedelta(days=30)

date_ceiling = datetime.now()

startdate = datetime.strptime("1979-05-04","%Y-%m-%d")
enddate = startdate + WINDOW_SPAN

pmqs_dates = list()
pmqs_urls = list()

# Running totals used only for the remaining-time estimate
tot_run_time = timedelta()
n = 0

while startdate <= date_ceiling:

    go_time = datetime.now()

    print(f"Start: {datetime.strftime(startdate,'%d-%m-%Y')}; End: {datetime.strftime(enddate,'%d-%m-%Y')}")

    date_res,url_res = find_pmqs(startdate, enddate, "Prime Minister", print_url = False)
    # find_pmqs returns None for a window without results; skip those
    if date_res:
        pmqs_dates.extend(date_res)
    if url_res:
        pmqs_urls.extend(url_res)

    # Slide the window forward
    startdate = startdate + WINDOW_STEP
    enddate = enddate + WINDOW_STEP

    run_time = datetime.now()-go_time
    tot_run_time += run_time
    n += 1
    avg_run_time = tot_run_time/n

    # startdate has already been advanced, so after the final window it lies
    # past date_ceiling; clamp at 0 so the last progress line does not report
    # a negative count and a negative remaining time.
    months_left_to_run = max(0, ((date_ceiling-startdate)/30).days)
    estimated_runtime = avg_run_time*months_left_to_run
    print(f"{months_left_to_run} months left to run, estimated runtime remaining: {estimated_runtime}")

Start: 04-05-1979; End: 02-06-1979
3 search results for selected time period
522 months left to run, estimated runtime remaining: 2:14:05.625672
Start: 03-06-1979; End: 02-07-1979
6 search results for selected time period
521 months left to run, estimated runtime remaining: 2:02:54.416350
Start: 03-07-1979; End: 01-08-1979
8 search results for selected time period
520 months left to run, estimated runtime remaining: 1:59:25.609360
Start: 02-08-1979; End: 31-08-1979
No search results for selected time period
519 months left to run, estimated runtime remaining: 2:18:41.287371
Start: 01-09-1979; End: 30-09-1979
No search results for selected time period
518 months left to run, estimated runtime remaining: 2:12:30.934810
Start: 01-10-1979; End: 30-10-1979
4 search results for selected time period
517 months left to run, estimated runtime remaining: 2:08:18.655789
Start: 31-10-1979; End: 29-11-1979
15 search results for selected time period
516 months left to run, estimated runtime remainin

In [14]:
# Turn the collected dates into a Series named 'date', drop repeated values,
# and write the result (index included) to pmqs_dates.csv.
pmqs_dates = pd.Series(pmqs_dates, name='date').drop_duplicates()
pmqs_dates.to_csv('pmqs_dates.csv')