In [1]:
import time
from datetime import datetime

def find_pmqs(start_date, end_date, term, print_url=True):
    
    # Load Selenium, module for browsing
    import undetected_chromedriver as uc
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By
    
    # Prepare url with search term, start and end dates
    root_url = "https://hansard.parliament.uk/search/Debates?startDate=%s&endDate=%s&searchTerm=%s&house=Commons&partial=False"
    search_term = term.replace(" ","%20")
    url = root_url % (start_date, end_date, search_term)
    if print_url:
        print(url)
    
    # Make request to Hansard url with Selenium
    options = webdriver.ChromeOptions()
    browser = uc.Chrome(options=options)
    #browser = webdriver.Firefox()
    browser.get(url)
    
    # Wait for page to load
    time.sleep(5)
    
    # Save search results to a list
    try:
        dates_res = browser.find_element(By.CLASS_NAME, "card-list").text.split("\n")
        #search_res = browser.find_element_by_class_name("card-list").text.split("\n")
    except NoSuchElementException:
        dates_res = None
        temp_pmqs_dates = None
    
    try:
        urls_res = [x.get_property("href") for x in browser.find_element(By.CLASS_NAME, "card-list").find_elements(By.CLASS_NAME, "card-calendar")]
        #search_res = [x.get_property("href") for x in browser.find_element_by_class_name("card-list").find_elements_by_class_name("card-calendar")]
        if len(urls_res)==1:
            print(f"{len(urls_res)} search result for selected time period")
        else: print(f"{len(urls_res)} search results for selected time period")
    except NoSuchElementException:
        print("No search results for selected time period")
        urls_res = None
        
    browser.quit()
    
    if dates_res: 
        temp_pmqs_dates = []
        for i in range(0,len(dates_res),8):
            temp_pmqs_dates.append(dates_res[i+5])
        temp_pmqs_dates = [datetime.strftime(datetime.strptime(x,"%d %B %Y"),"%Y-%m-%d") for x in temp_pmqs_dates]
    
    return temp_pmqs_dates,urls_res

To do:
 * Combine date and url grabbing functions to widen the browser launching bottleneck
 * Return number of results for successful searches
 * Implement estimated time left

In [2]:
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def parse_hansard_url(target_url):

    # Make request to Hansard url with Selenium
    options = webdriver.ChromeOptions()
    browser = uc.Chrome(options=options)
    #browser = webdriver.Firefox()
    browser.get(target_url)

    # Wait for page to load
    time.sleep(5)

    d = browser.find_element(By.CLASS_NAME, "calendar-sheet").find_element(By.CLASS_NAME, "day").text
    m = browser.find_element(By.CLASS_NAME, "calendar-sheet").find_element(By.CLASS_NAME, "month").text
    y = browser.find_element(By.CLASS_NAME, "calendar-sheet").find_element(By.CLASS_NAME, "year").text

    debate_items = [x.find_element(By.CLASS_NAME, "contribution") for x in browser.find_element(By.CLASS_NAME, "article").find_elements(By.CLASS_NAME, "debate-item-contributiondebateitem")]

    debate = dict()

    debate['title'] = browser.find_element(By.CLASS_NAME, "hero-banner").find_element(By.TAG_NAME, "h1").text
    debate['date'] = d+"/"+m+"/"+y

    debate['contribs'] = dict()

    for contrib in debate_items:
        contrib_id = contrib.get_attribute("data-contribution-id")

        debate['contribs'][contrib_id] = dict()
        debate['contribs'][contrib_id]['speaker'] = contrib.find_element(By.CLASS_NAME, "attributed-to-details").find_element(By.CLASS_NAME, "primary-text").text
        debate['contribs'][contrib_id]['content'] = contrib.find_element(By.CLASS_NAME, "content").text

    browser.quit()
    
    return debate

In [3]:
from dateutil.relativedelta import *
from datetime import timedelta

date_ceiling = datetime.now()

startdate = datetime.strptime("1979-05-04","%Y-%m-%d")
enddate = startdate + relativedelta(days=+29)

pmqs_dates = list()
pmqs_urls = list()

tot_run_time = timedelta()
n = 0

while startdate <= date_ceiling: #datetime.today():
    
    go_time = datetime.now()
    
    print(f"Start: {datetime.strftime(startdate,'%d-%m-%Y')}; End: {datetime.strftime(enddate,'%d-%m-%Y')}")
    
    date_res,url_res = find_pmqs(startdate, enddate, "Prime Minister", print_url = False)
    if date_res:
        pmqs_dates.extend(date_res)
    if url_res:
        pmqs_urls.extend(url_res)
        
    startdate = startdate + relativedelta(days=+30)
    enddate = enddate + relativedelta(days=+30)
    
    run_time = datetime.now()-go_time
    tot_run_time += run_time
    n += 1
    avg_run_time = tot_run_time/n
    
    months_left_to_run = ((date_ceiling-startdate)/30).days
    estimated_runtime = avg_run_time*months_left_to_run
    print(f"{months_left_to_run} months left to run, estimated runtime remaining: {estimated_runtime}")

Start: 04-05-1979; End: 02-06-1979
3 search results for selected time period
519 months left to run, estimated runtime remaining: 1:25:07.924302
Start: 03-06-1979; End: 02-07-1979
6 search results for selected time period
518 months left to run, estimated runtime remaining: 1:17:03.529176
Start: 03-07-1979; End: 01-08-1979
8 search results for selected time period
517 months left to run, estimated runtime remaining: 1:13:39.927094
Start: 02-08-1979; End: 31-08-1979
No search results for selected time period
516 months left to run, estimated runtime remaining: 1:11:58.383876
Start: 01-09-1979; End: 30-09-1979
No search results for selected time period
515 months left to run, estimated runtime remaining: 1:09:55.212660
Start: 01-10-1979; End: 30-10-1979
4 search results for selected time period
514 months left to run, estimated runtime remaining: 1:08:19.306770
Start: 31-10-1979; End: 29-11-1979
15 search results for selected time period
513 months left to run, estimated runtime remainin

In [4]:
from tqdm import tqdm

debates = dict()

for i,url in tqdm(enumerate(pmqs_urls)):
    debates[pmqs_dates[i]] = parse_hansard_url(url)

602it [2:55:39, 17.51s/it]


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".calendar-sheet"}
  (Session info: chrome=98.0.4758.82)
Stacktrace:
Backtrace:
	Ordinal0 [0x01007AC3+2587331]
	Ordinal0 [0x00F9ADD1+2141649]
	Ordinal0 [0x00E93BB8+1063864]
	Ordinal0 [0x00EC01CE+1245646]
	Ordinal0 [0x00EC03CB+1246155]
	Ordinal0 [0x00EEA612+1418770]
	Ordinal0 [0x00ED86D4+1345236]
	Ordinal0 [0x00EE8A0A+1411594]
	Ordinal0 [0x00ED84A6+1344678]
	Ordinal0 [0x00EB53F6+1201142]
	Ordinal0 [0x00EB62E6+1204966]
	GetHandleVerifier [0x011ADF22+1680738]
	GetHandleVerifier [0x01260DBC+2413564]
	GetHandleVerifier [0x0109D151+563089]
	GetHandleVerifier [0x0109BF13+558419]
	Ordinal0 [0x00FA081E+2164766]
	Ordinal0 [0x00FA5508+2184456]
	Ordinal0 [0x00FA5650+2184784]
	Ordinal0 [0x00FAF5BC+2225596]
	BaseThreadInitThunk [0x771C6739+25]
	RtlGetFullPathName_UEx [0x77BC8AFF+1215]
	RtlGetFullPathName_UEx [0x77BC8ACD+1165]


In [5]:
import pickle

debates_pkl = open('debates.pkl','wb')
pickle.dump(debates, debates_pkl)
debates_pkl.close()