In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Hansard parser from XML files saved to disk

In [2]:
def read_xml_from_path(path):
    # Read from path - decode bytes to strings
    with open(path, 'rb') as file:
        xml = file.read().decode(errors='replace') # Replace unrecognised utf-8 tokens with '?'
        
    return xml

In [3]:
def parse_hansard_xml(xml, memberid2personid_lookup, filter_no_speaker=True):
    # Use BeautifulSoup to tidy the XML
    import pandas as pd
    from bs4 import BeautifulSoup
    bs = BeautifulSoup(xml)
    
    # Define a function to parse a chunk of XML for an individual speech
    def parse_speech_xml(speech_xml):
        name = speech_xml.get('speakername')
        speech_id = speech_xml.get('id')
        person_id = speech_xml.get('person_id')
        if pd.isna(person_id):
            member_id = speech_xml.get('speakerid')
            if member_id in memberid2personid_lookup.keys():
                person_id = memberid2personid_lookup[member_id]
        text = speech_xml.find('p')
        if text is not None:
            text = text.get_text()
        return name,speech_id,person_id,text
    
    # Get an XML object for each speech in the debate
    speeches = bs.find_all('speech') 
    
    # Create dictionary to store debate features
    debate_dict = {'name':[],
                   'speech_id':[],
                   'person_id':[],
                   'text':[]}
    
    for speech_xml in speeches:
        name,speech_id,person_id,text = parse_speech_xml(speech_xml)
        if filter_no_speaker: # If we're filtering 'no speaker' lines, then drop speeches with no name
            if name is not None:
                for variable in debate_dict.keys():
                    debate_dict[variable].append(eval(variable))
        else:
            for variable in debate_dict.keys():
                debate_dict[variable].append(eval(variable))
    
    return pd.DataFrame(debate_dict)

In [4]:
people_df = pd.read_csv('people.csv').drop('Unnamed: 0',axis=1)

lookup = lambda row: {member_id:row.person_id for member_id in eval(row.memberships)}
lookup_list = people_df.progress_apply(lookup, axis=1).to_list()

memberid2personid_lookup = {member_id:person_id for lookup_dict in lookup_list for member_id,person_id in lookup_dict.items()}

target_urls = pd.read_csv('debate_urls.csv').url.to_list()

df = parse_hansard_xml('', memberid2personid_lookup)

for url in tqdm(target_urls):
    filename = 'debates_xml/'+url.split('/')[-1]
    
    speech_xml = read_xml_from_path(filename)
    debate_df = parse_hansard_xml(speech_xml, memberid2personid_lookup)
    
    df = df.append(debate_df)
    
display(df.head())

100%|█████████████████████████████████████████████████████████████████████████| 14004/14004 [00:00<00:00, 22370.32it/s]
100%|██████████████████████████████████████████████████████████████████████████| 18280/18280 [1:50:43<00:00,  2.75it/s]


Unnamed: 0,name,speech_id,person_id,text
0,Colonel F. B. MILDMAY (addressing himself to t...,uk.org.publicwhip/debate/1919-02-04a.3.1,uk.org.publicwhip/person/15282,"Sir Courtenay Ilbert, in accordance with the G..."
1,Sir HENRY DALZIEL,uk.org.publicwhip/debate/1919-02-04a.5.0,unknown,"Sir Courtenay Ilbert, I am fully conscious tha..."
2,Mr. LOWTHER (who was received with general che...,uk.org.publicwhip/debate/1919-02-04a.8.0,unknown,"Sir Courtenay Ilbert,—In accordance with the a..."
3,Mr. SPEAKER-ELECT (standing on the upper step ...,uk.org.publicwhip/debate/1919-02-04a.11.0,unknown,"Before taking the Chair, once more as Speaker ..."
4,Mr. BONAR LAW (Leader of the House),uk.org.publicwhip/debate/1919-02-04a.11.3,uk.org.publicwhip/person/20693,"Mr. Speaker-Elect,—In the unavoidable absence ..."


In [5]:
date_from_speech_id = lambda x: pd.to_datetime(x.split('/')[-1][:10])

df['speech_date'] = df.speech_id.progress_apply(lambda x: date_from_speech_id(x))

100%|█████████████████████████████████████████████████████████████████████| 5757506/5757506 [05:37<00:00, 17037.66it/s]


In [6]:
display(df)

Unnamed: 0,name,speech_id,person_id,text,speech_date
0,Colonel F. B. MILDMAY (addressing himself to t...,uk.org.publicwhip/debate/1919-02-04a.3.1,uk.org.publicwhip/person/15282,"Sir Courtenay Ilbert, in accordance with the G...",1919-02-04
1,Sir HENRY DALZIEL,uk.org.publicwhip/debate/1919-02-04a.5.0,unknown,"Sir Courtenay Ilbert, I am fully conscious tha...",1919-02-04
2,Mr. LOWTHER (who was received with general che...,uk.org.publicwhip/debate/1919-02-04a.8.0,unknown,"Sir Courtenay Ilbert,—In accordance with the a...",1919-02-04
3,Mr. SPEAKER-ELECT (standing on the upper step ...,uk.org.publicwhip/debate/1919-02-04a.11.0,unknown,"Before taking the Chair, once more as Speaker ...",1919-02-04
4,Mr. BONAR LAW (Leader of the House),uk.org.publicwhip/debate/1919-02-04a.11.3,uk.org.publicwhip/person/20693,"Mr. Speaker-Elect,—In the unavoidable absence ...",1919-02-04
...,...,...,...,...,...
170,George Freeman,uk.org.publicwhip/debate/2022-05-10a.125.3,uk.org.publicwhip/person/24817,The hon. Gentleman makes an important point. S...,2022-05-10
171,Jim Shannon,uk.org.publicwhip/debate/2022-05-10a.128.0,uk.org.publicwhip/person/13864,The Minister referred to Queen’s University Be...,2022-05-10
172,George Freeman,uk.org.publicwhip/debate/2022-05-10a.128.1,uk.org.publicwhip/person/24817,The hon. Gentleman makes an important point an...,2022-05-10
173,Rachael Maskell,uk.org.publicwhip/debate/2022-05-10a.128.2,uk.org.publicwhip/person/25433,This evening’s debate has been excellent. In Y...,2022-05-10


In [7]:
df.to_csv('hansard_in_full.csv')