In [1]:
import ast
import pickle

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

tqdm.pandas()

In [2]:
def read_xml_from_path(path):
    """Return the contents of the file at ``path`` as text.

    The file is opened in binary mode and decoded with the default codec
    (UTF-8). Byte sequences that cannot be decoded are not fatal: each is
    substituted with the Unicode replacement character (U+FFFD).
    """
    with open(path, mode='rb') as handle:
        raw_bytes = handle.read()

    # errors='replace' keeps malformed files readable instead of raising
    return raw_bytes.decode(errors='replace')

In [3]:
def _position_by_identity(elements, target):
    """Return the index of ``target`` in ``elements`` compared with ``is``, or None if absent.

    BeautifulSoup tags compare equal when their name, attributes and contents
    match, so ``list.index`` can land on a different-but-identical element.
    """
    for position, element in enumerate(elements):
        if element is target:
            return position
    return None


def find_pmqs_session_in_hansard(daily_hansard_xml, threshold=5):
    """Extract the elements of the PMQs session from one day's Hansard XML.

    The first ``major-heading`` whose text contains "prime minister"
    (case-insensitive) is taken as the start of PMQs. The session is every
    following sibling element up to, but not including, the next
    ``major-heading``. If that next heading is fewer than ``threshold``
    siblings away, the section is assumed to be double-headed and the
    heading after it is used as the end marker instead.

    Parameters
    ----------
    daily_hansard_xml : str
        Markup of a single daily Hansard file.
    threshold : int, default 5
        Minimum number of sibling elements expected between the PMQs heading
        and the heading that closes the session.

    Returns
    -------
    list or None
        The sibling elements making up the session, or None when no heading
        matches, when the matching heading is the last major heading, when a
        double-headed section has no further heading to close it, or when the
        closing heading is not a sibling of the opening one.
    """
    # Imported locally so the function is usable on its own
    from bs4 import BeautifulSoup

    # NOTE(review): no parser is named here, so BeautifulSoup chooses one from
    # whatever is installed; pin it explicitly if results must be reproducible
    # across machines.
    bs = BeautifulSoup(daily_hansard_xml)
    major_headings = bs.find_all('major-heading')  # PMQs is marked as a 'major-heading'

    for position, heading in enumerate(major_headings):
        # Known spellings ("Prime Minister's Questions", "Oral Answers to
        # Questions - Prime Minister (Engagements)", ...) all contain this
        # phrase, so a single substring test covers every one of them.
        # NOTE(review): it also matches any other heading that mentions the
        # Prime Minister - confirm that is intended.
        if 'prime minister' not in heading.text.lower():
            continue

        # Only the first matching heading is ever considered.
        if position + 1 >= len(major_headings):
            return None  # nothing after PMQs to mark where it ends

        # All the elements from the start-of-PMQs header onwards
        next_sibs = heading.find_next_siblings()

        end_idx = _position_by_identity(next_sibs, major_headings[position + 1])
        if end_idx is None:
            return None  # closing heading is not a sibling: session cannot be delimited

        if end_idx < threshold:  # Too close: treat as a double-headed section
            if position + 2 >= len(major_headings):
                return None
            end_idx = _position_by_identity(next_sibs, major_headings[position + 2])
            if end_idx is None:
                return None

        # Everything strictly before the closing heading belongs to PMQs
        return next_sibs[:end_idx]

    return None

In [4]:
def _speaker_person_id(element, memberid2personid_lookup):
    """Resolve a speech element to a person ID, or None when that is not possible."""
    if 'person_id' in element.attrs:
        return element['person_id']
    # Some entries only carry a membership ID; translate it when the lookup knows it
    member_id = element.attrs.get('speakerid')
    if member_id is None:
        return None
    return memberid2personid_lookup.get(member_id)


def parse_pmqs_session(list_of_xml, date, memberid2personid_lookup):
    """Pair each Prime Minister's answer in a PMQs session with the preceding speech.

    An element counts as an answer when its ``speakername`` attribute contains
    "prime minister" or the name of the Prime Minister in office on ``date``
    (both compared case-insensitively). The element immediately before it in
    ``list_of_xml`` is recorded as the question.

    Parameters
    ----------
    list_of_xml : sequence
        Session elements in document order. Each must expose ``.attrs``,
        item access for attributes and ``.text``. An empty sequence yields
        an empty frame with the expected columns.
    date : str
        Sitting date as ``YYYY-MM-DD``; compared as a string against the
        tenure table.
    memberid2personid_lookup : dict
        Maps ``speakerid`` membership IDs to person IDs, used when an element
        has no ``person_id`` attribute.

    Returns
    -------
    pandas.DataFrame
        One row per answer with columns ``questioner_id``, ``questioner_name``,
        ``question_text``, ``answerer_id``, ``answerer_name``, ``answer_text``.
        The questioner fields are None when the answer is the first element
        of the session or the preceding element has no ``speakername``.
    """
    from datetime import datetime
    current_day = datetime.now().strftime("%Y-%m-%d")
    # Tenure table used to get the name of the sitting Prime Minister at each date.
    # NOTE(review): the last entry is left open-ended up to today, so every date
    # after its start is attributed to that name; extend the table with later
    # office-holders before parsing newer debates.
    pm_list = [['Robert Gascoyne-Cecil',('1895-06-25','1902-07-11')],
               ['Arthur Balfour',('1902-07-12','1905-12-04')],
               ['Henry Campbell-Bannerman',('1905-12-05','1908-04-03')],
               ['H.H. Asquith',('1908-04-04','1916-12-05')],
               ['David Lloyd George',('1916-12-06','1922-10-19')],
               ['Andrew Bonar Law',('1922-10-20','1923-05-20')],
               ['Stanley Baldwin',('1923-05-21','1924-01-22')],
               ['Ramsay MacDonald',('1924-01-23','1924-11-04')],
               ['Stanley Baldwin',('1924-11-05','1929-06-04')],
               ['Ramsay MacDonald',('1929-06-05','1935-06-07')],
               ['Stanley Baldwin',('1935-06-08','1937-05-28')],
               ['Neville Chamberlain',('1937-05-29','1940-05-09')],
               ['Winston Churchill',('1940-05-10','1945-07-26')],
               ['Clement Attlee',('1945-07-27','1951-10-26')],
               ['Winston Churchill',('1951-10-27','1955-04-05')],
               ['Anthony Eden',('1955-04-06','1957-01-09')],
               ['Harold Macmillan',('1957-01-10','1963-10-18')],
               ['Alec Douglas-Home',('1963-10-19','1964-10-16')],
               ['Harold Wilson',('1964-10-17','1970-06-19')],
               ['Edward Heath',('1970-06-20','1974-03-04')],
               ['Harold Wilson',('1974-03-05','1976-04-05')],
               ['James Callaghan',('1976-04-06','1979-05-04')],
               ['Margaret Thatcher',('1979-05-05','1990-11-28')],
               ['John Major',('1990-11-29','1997-05-02')],
               ['Tony Blair',('1997-05-03','2007-06-27')],
               ['Gordon Brown',('2007-06-28','2010-05-11')],
               ['David Cameron',('2010-05-12','2016-07-13')],
               ['Theresa May',('2016-07-14','2019-07-24')],
               ['Boris Johnson',('2019-07-25',current_day)]]

    # Column-wise accumulator for PMQs questions and responses
    pmqs = {'questioner_id':[],
            'questioner_name':[],
            'question_text':[],
            'answerer_id':[],
            'answerer_name':[],
            'answer_text':[]}

    # A date outside the table no longer raises; matching then relies on the title alone
    in_office = [pm for pm, (start_date, end_date) in pm_list if start_date <= date <= end_date]
    sitting_prime_minister = in_office[0].lower() if in_office else None

    for idx, answer in enumerate(list_of_xml):
        # Only elements attributed to a speaker can be answers
        if 'speakername' not in answer.attrs:
            continue

        speaker = answer['speakername'].lower()
        spoken_by_pm = 'prime minister' in speaker or (
            sitting_prime_minister is not None and sitting_prime_minister in speaker)
        if not spoken_by_pm:
            continue

        # The question is the element preceding the answer. At idx 0 there is
        # no predecessor (idx-1 would silently wrap round to the last element).
        question = list_of_xml[idx - 1] if idx > 0 else None

        if question is None or 'speakername' not in question.attrs:
            questioner_id, questioner_name, question_text = None, None, None
        else:
            questioner_id = _speaker_person_id(question, memberid2personid_lookup)
            questioner_name = question['speakername']
            question_text = question.text

        pmqs['questioner_id'].append(questioner_id)
        pmqs['questioner_name'].append(questioner_name)
        pmqs['question_text'].append(question_text)
        pmqs['answerer_id'].append(_speaker_person_id(answer, memberid2personid_lookup))
        pmqs['answerer_name'].append(answer['speakername'])
        pmqs['answer_text'].append(answer.text)

    return pd.DataFrame(pmqs)

In [5]:
# Some entries in Hansard are only tagged with MP (membership) IDs and not person IDs.
# people.csv lists the membership IDs held by each person, so it can translate between the two.
# It is also the table of MPs used later for background info on questioners/answerers,
# so it is read once here and reused.
people_df = pd.read_csv('../hansard-in-full/people.csv').drop('Unnamed: 0', axis=1)

# Build the membership-ID -> person-ID lookup in a single pass.
# `memberships` stores the text form of a Python list; ast.literal_eval parses
# literals only, so file contents are never executed (eval was used previously).
memberid2personid_lookup = {
    member_id: person.person_id
    for person in tqdm(people_df.itertuples(index=False), total=len(people_df))
    for member_id in ast.literal_eval(person.memberships)
}

# Get the list of potential PMQs dates scraped from Hansard using pmqs_scraper.ipynb
pmqs_dates = pd.read_csv('pmqs_dates.csv').date.to_list()
pmqs_date_set = set(pmqs_dates)  # constant-time membership tests below

# Get debates filenames from the list of debate URLs stored in the 'hansard-in-full' project
filenames = pd.read_csv('../hansard-in-full/debate_urls.csv').url
filenames = [url.split('/')[-1] for url in filenames.to_list()]
# Select those filenames containing one of the selected dates
filename_dates = [filename[7:17] for filename in filenames] # Get the date part of each filename
pmqs_filenames = [filename for filename, filename_date in zip(filenames, filename_dates)
                  if filename_date in pmqs_date_set]

debates_folder_path = '../hansard-in-full/debates_xml/'

# Collect one frame per sitting and concatenate once at the end: DataFrame.append
# copied the whole table on every iteration and no longer exists in pandas 2.x.
session_frames = []
for filename in tqdm(pmqs_filenames):
    date = filename[7:17]
    path = debates_folder_path+filename
    xml = read_xml_from_path(path)
    sess = find_pmqs_session_in_hansard(xml)
    if sess is not None:
        pmqs = parse_pmqs_session(sess, date, memberid2personid_lookup)
        pmqs['date'] = date
        if not pmqs.empty:  # sittings with no PM answers contribute no rows
            session_frames.append(pmqs)

if session_frames:
    # Each sitting keeps its own 0..n row labels, as before
    df = pd.concat(session_frames)
else:
    # No usable sittings: fall back to an empty frame with the expected columns
    df = parse_pmqs_session('', '2000-00-00', memberid2personid_lookup)

display(df)

100%|███████████████████████████████████████████████████████| 13976/13976 [00:00<00:00, 19429.96it/s]
100%|████████████████████████████████████████████████████████████| 2993/2993 [05:23<00:00,  9.24it/s]


Unnamed: 0,questioner_id,questioner_name,question_text,answerer_id,answerer_name,answer_text,date
0,uk.org.publicwhip/person/21680,Mr. A. Lewis,\nasked the Prime Minister if he will consider...,uk.org.publicwhip/person/16522,The Prime Minister (Mr. Attlee),\nI should propose on suitable occasions to ma...,1946-03-05
1,,Lieut.-Colonel Dower,\nWill the right hon. Gentleman give due weigh...,uk.org.publicwhip/person/16522,The Prime Minister,\nAnd so what?\n,1946-03-05
2,uk.org.publicwhip/person/21841,Mr. Godfrey Nicholson,\nAs it is impossible to separate entirely the...,uk.org.publicwhip/person/16522,The Prime Minister,\nThe general rule is that where there is any ...,1946-03-05
0,uk.org.publicwhip/person/18875,Brigadier Rayner,\nasked the Prime Minister whether representat...,uk.org.publicwhip/person/16522,The Prime Minister (Mr. Attlee),\nNo. Sir.\n,1951-01-24
1,uk.org.publicwhip/person/18875,Brigadier Rayner,"\nAs, on this occasion, the right hon. Gentlem...",uk.org.publicwhip/person/16522,The Prime Minister,\nThe hon. and gallant Member is entirely wron...,1951-01-24
...,...,...,...,...,...,...,...
22,uk.org.publicwhip/person/25366,Jeff Smith,\nLast night we learned that the Home Secretar...,uk.org.publicwhip/person/10999,Boris Johnson,\nI believe the Home Office has already made a...,2022-04-27
23,uk.org.publicwhip/person/25916,Aaron Bell,\nNewcastle-under-Lyme Borough Council has sec...,uk.org.publicwhip/person/10999,Boris Johnson,\nMy hon. Friend is an avid champion for his c...,2022-04-27
24,uk.org.publicwhip/person/10669,Hilary Benn,\nSome 4.5 million people pay for their gas an...,uk.org.publicwhip/person/10999,Boris Johnson,\nWe are working with Ofgem and all the compan...,2022-04-27
25,uk.org.publicwhip/person/25910,Ben Everitt,"\nWithin the past hour or so, it has been repo...",uk.org.publicwhip/person/10999,Boris Johnson,\nIt is no disrespect to those who have not be...,2022-04-27


In [6]:
def _extend_latest_span(raw_spans, threshold, current_day):
    """Return a copy of a {(start, end): value} mapping with its latest span stretched to today.

    ``raw_spans`` may be the mapping itself or its text form. Anything else
    (None, NaN, an empty list, ...) is treated as missing and gives None.
    An empty mapping is returned unchanged.
    """
    if isinstance(raw_spans, str):
        # Literal parsing only: text read from a file is never executed
        spans = ast.literal_eval(raw_spans)
        if not isinstance(spans, dict):
            return None
    elif isinstance(raw_spans, dict):
        spans = dict(raw_spans)  # copy so the caller's object is left untouched
    else:
        return None

    if not spans:
        return spans

    # Dates are ISO 'YYYY-MM-DD' strings, so string comparison orders them correctly
    latest_end_date = max(end_date for (_start_date, end_date) in spans)
    if latest_end_date >= threshold:
        old_key = next(key for key in spans if key[1] == latest_end_date)
        spans[(old_key[0], current_day)] = spans.pop(old_key)
    return spans


def extend_current_speech_variables(row, threshold='2019-01-01'):
    """Stretch a member's most recent constituency and party spans up to today.

    For each of ``row.constituencies`` and ``row.parties`` the span with the
    latest end date is found; if that end date is on or after ``threshold``
    the span is treated as still running and its end date is replaced with
    the current day. Values may be dicts or their text form, so the function
    can safely be applied again to its own output.

    Parameters
    ----------
    row : object
        Anything exposing ``constituencies`` and ``parties`` attributes
        (e.g. a DataFrame row), each a ``{(start, end): name}`` mapping,
        its string form, or a missing value.
    threshold : str, default '2019-01-01'
        Earliest end date (``YYYY-MM-DD``) for a span to be extended.

    Returns
    -------
    tuple
        ``(constituencies, parties)``; each is a dict, or None when the
        input value was missing.
    """
    # Import datetime to get current day
    from datetime import datetime
    current_day = datetime.now().strftime("%Y-%m-%d")

    constituencies = _extend_latest_span(row.constituencies, threshold, current_day)
    parties = _extend_latest_span(row.parties, threshold, current_day)

    return constituencies,parties

# Run the extension over every member; each result is a (constituencies, parties) pair
constituencies_parties = people_df.progress_apply(extend_current_speech_variables, axis=1).to_list()

# Split the pairs back into their two columns, replacing the stored values
people_df['constituencies'] = [constituencies for constituencies, _parties in constituencies_parties]
people_df['parties'] = [parties for _constituencies, parties in constituencies_parties]

display(people_df)

100%|████████████████████████████████████████████████████████| 13976/13976 [00:01<00:00, 8087.92it/s]


Unnamed: 0,person_id,first_name,family_name,memberships,constituencies,parties
0,uk.org.publicwhip/person/10001,Diane,Abbott,"['uk.org.publicwhip/member/2069', 'uk.org.publ...","{('1987-06-11', '1992-03-16'): 'Hackney North ...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
1,uk.org.publicwhip/person/10002,Gerry,Adams,"['uk.org.publicwhip/member/2196', 'uk.org.publ...","{('1983-06-09', '1987-05-18'): 'Belfast West',...","{('1997-05-01', '2001-05-14'): 'Sinn Féin', ('..."
2,uk.org.publicwhip/person/10003,Irene,Adams,"['uk.org.publicwhip/member/2201', 'uk.org.publ...","{('1990-11-29', '1992-03-16'): 'Paisley North'...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
3,uk.org.publicwhip/person/10004,Nick,Ainger,"['uk.org.publicwhip/member/2321', 'uk.org.publ...","{('1992-04-09', '1997-04-08'): 'Pembroke', ('1...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
4,uk.org.publicwhip/person/10005,Bob,Ainsworth,"['uk.org.publicwhip/member/2323', 'uk.org.publ...","{('1992-04-09', '1997-04-08'): 'Coventry North...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
...,...,...,...,...,...,...
13971,uk.org.publicwhip/person/26079,Nick,Mathison,['uk.org.publicwhip/member/90815'],{},{}
13972,uk.org.publicwhip/person/26080,Kate,Nicholl,['uk.org.publicwhip/member/90822'],{},{}
13973,uk.org.publicwhip/person/26081,Patricia,O'Lynn,['uk.org.publicwhip/member/90824'],{},{}
13974,uk.org.publicwhip/person/26082,Alan,Robinson,['uk.org.publicwhip/member/90829'],{},{}


In [7]:
# Using the dictionaries for parties and constituencies, we can find a 'speech_party' and 'speech_constituency'
# These are the party/constituency of the member at the time of giving that particular speech

def _value_on_date(raw_spans, speech_date):
    """Return the value whose (start, end) span contains ``speech_date``, else None.

    ``raw_spans`` may be a ``{(start, end): value}`` mapping or its text form;
    any other value (None, NaN, ...) is treated as missing. When several spans
    contain the date, the first one in mapping order wins.
    """
    if isinstance(raw_spans, str):
        # Literal parsing only: text read from a file is never executed
        spans = ast.literal_eval(raw_spans)
    elif isinstance(raw_spans, dict):
        spans = raw_spans
    else:
        return None

    # Dates are ISO 'YYYY-MM-DD' strings, so string comparison orders them correctly
    for (start_date, end_date), value in spans.items():
        if start_date <= speech_date <= end_date:
            return value
    return None


def get_speech_variables(row):
    """Find the constituency and party a member held on the date of a speech.

    Parameters
    ----------
    row : object
        Anything exposing ``date`` (``YYYY-MM-DD`` string) plus
        ``constituencies`` and ``parties`` attributes, each a
        ``{(start, end): name}`` mapping, its string form, or a missing value.

    Returns
    -------
    tuple
        ``(speech_constituency, speech_party)``; an item is None when the
        corresponding mapping is missing, empty, or has no span covering
        the speech date.
    """
    speech_date = row.date

    speech_constituency = _value_on_date(row.constituencies, speech_date)
    speech_party = _value_on_date(row.parties, speech_date)

    return speech_constituency,speech_party

# Enrich the table twice: first for whoever asked, then for whoever answered.
# Each pass joins people_df on that speaker's person ID, works out the constituency
# and party held on the speech date, copies the names across, and finally removes
# the joined-in columns so the next pass can merge people_df again without clashes.
for role in ('questioner', 'answerer'):
    df = df.merge(people_df, how='left', left_on=role+'_id', right_on='person_id')

    speech_constituencies_parties = df.progress_apply(get_speech_variables, axis=1).to_list()
    df[role+'_constituency'] = [pair[0] for pair in speech_constituencies_parties]
    df[role+'_party'] = [pair[1] for pair in speech_constituencies_parties]
    df[role+'_firstname'] = df['first_name']
    df[role+'_familyname'] = df['family_name']

    df = df.drop(['person_id','first_name','family_name','memberships','constituencies','parties'], axis=1)

# Final column order: date, then the questioner fields, then the answerer fields
df = df[['date',
         'questioner_id','questioner_name','questioner_firstname','questioner_familyname','questioner_constituency','questioner_party','question_text',
         'answerer_id','answerer_name','answerer_firstname','answerer_familyname','answerer_constituency','answerer_party','answer_text']]

display(df)

100%|███████████████████████████████████████████████████████| 37923/37923 [00:02<00:00, 15693.17it/s]
100%|███████████████████████████████████████████████████████| 37923/37923 [00:02<00:00, 17305.41it/s]


Unnamed: 0,date,questioner_id,questioner_name,questioner_firstname,questioner_familyname,questioner_constituency,questioner_party,question_text,answerer_id,answerer_name,answerer_firstname,answerer_familyname,answerer_constituency,answerer_party,answer_text
0,1946-03-05,uk.org.publicwhip/person/21680,Mr. A. Lewis,Arthur,Lewis,West Ham Upton,,\nasked the Prime Minister if he will consider...,uk.org.publicwhip/person/16522,The Prime Minister (Mr. Attlee),Clement,Attlee,Stepney Limehouse,Labour,\nI should propose on suitable occasions to ma...
1,1946-03-05,,Lieut.-Colonel Dower,,,,,\nWill the right hon. Gentleman give due weigh...,uk.org.publicwhip/person/16522,The Prime Minister,Clement,Attlee,Stepney Limehouse,Labour,\nAnd so what?\n
2,1946-03-05,uk.org.publicwhip/person/21841,Mr. Godfrey Nicholson,Godfrey,Nicholson,Farnham,,\nAs it is impossible to separate entirely the...,uk.org.publicwhip/person/16522,The Prime Minister,Clement,Attlee,Stepney Limehouse,Labour,\nThe general rule is that where there is any ...
3,1951-01-24,uk.org.publicwhip/person/18875,Brigadier Rayner,Ralph,Rayner,Totnes,,\nasked the Prime Minister whether representat...,uk.org.publicwhip/person/16522,The Prime Minister (Mr. Attlee),Clement,Attlee,Walthamstow West,Labour,\nNo. Sir.\n
4,1951-01-24,uk.org.publicwhip/person/18875,Brigadier Rayner,Ralph,Rayner,Totnes,,"\nAs, on this occasion, the right hon. Gentlem...",uk.org.publicwhip/person/16522,The Prime Minister,Clement,Attlee,Walthamstow West,Labour,\nThe hon. and gallant Member is entirely wron...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37918,2022-04-27,uk.org.publicwhip/person/25366,Jeff Smith,Jeff,Smith,"Manchester, Withington",Labour,\nLast night we learned that the Home Secretar...,uk.org.publicwhip/person/10999,Boris Johnson,Boris,Johnson,Uxbridge and South Ruislip,Conservative,\nI believe the Home Office has already made a...
37919,2022-04-27,uk.org.publicwhip/person/25916,Aaron Bell,Aaron,Bell,,,\nNewcastle-under-Lyme Borough Council has sec...,uk.org.publicwhip/person/10999,Boris Johnson,Boris,Johnson,Uxbridge and South Ruislip,Conservative,\nMy hon. Friend is an avid champion for his c...
37920,2022-04-27,uk.org.publicwhip/person/10669,Hilary Benn,Hilary,Benn,Leeds Central,Labour,\nSome 4.5 million people pay for their gas an...,uk.org.publicwhip/person/10999,Boris Johnson,Boris,Johnson,Uxbridge and South Ruislip,Conservative,\nWe are working with Ofgem and all the compan...
37921,2022-04-27,uk.org.publicwhip/person/25910,Ben Everitt,Ben,Everitt,,,"\nWithin the past hour or so, it has been repo...",uk.org.publicwhip/person/10999,Boris Johnson,Boris,Johnson,Uxbridge and South Ruislip,Conservative,\nIt is no disrespect to those who have not be...


In [8]:
# Write the combined PMQs table to disk. No `index` argument is passed, so pandas'
# default applies and the row index is written as an unnamed first column.
df.to_csv('hansard_pmqs.csv')