In [8]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [9]:
# Read the two input CSVs from the working directory.
hansard_hong_kong_df = pd.read_csv('hansard_hong_kong.csv')
scrape_results_df = pd.read_csv('scrape_results.csv')

# Left join: every row of hansard_hong_kong_df is kept and matched on
# gid == html_file_name. Right-hand columns whose names clash with the left
# frame receive the '_collasped' suffix (spelling kept as-is because later
# cells refer to the suffixed column names).
merged_df = hansard_hong_kong_df.merge(
    scrape_results_df,
    how='left',
    left_on='gid',
    right_on='html_file_name',
    suffixes=('', '_collasped'),
)

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16574 entries, 0 to 16573
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gid                   16574 non-null  object 
 1   hdate                 16574 non-null  object 
 2   htime                 10782 non-null  object 
 3   section_id            16570 non-null  float64
 4   subsection_id         16570 non-null  float64
 5   htype                 16570 non-null  float64
 6   major                 16574 non-null  object 
 7   minor                 16570 non-null  float64
 8   person_id             16571 non-null  object 
 9   hpos                  16574 non-null  int64  
 10  epobject_id           16570 non-null  float64
 11  body                  15649 non-null  object 
 12  collapsed             16570 non-null  float64
 13  relevance             16574 non-null  int64  
 14  extract               16574 non-null  object 
 15  listurl            

In [10]:
# Remove rows that are exact duplicates across every column.
merged_df = merged_df.drop_duplicates()

# Lookup from a `major` code to a debate-type label. The keys are strings, so
# only `major` values stored as strings will match.
major_dict = {
    '1': 'Commons debates',
    '101': 'Lords debates',
    '2': 'Westminster Hall debates',
    '3': 'Written Answers',
    '4': 'Written Ministerial Statements',
    '5': 'Northern Ireland Assembly debates',
    '6': 'Public Bill Committees',
    '7': 'Scottish Parliament debates',
    '8': 'Scottish Parliament written answers',
    '9': 'Questions to the Mayor of London',
    '10': 'Welsh Parliament record',
    '11': 'Senedd Cymru Cofnod'
}
# Map and fill in one expression, then assign the result. Calling
# `fillna(..., inplace=True)` on `merged_df[col]` is a chained in-place
# assignment: newer pandas warns about it and, with Copy-on-Write enabled, it
# no longer updates the parent frame. Codes missing from the lookup become
# 'Other'.
merged_df['debate_type'] = merged_df['major'].map(major_dict).fillna('Other')

# Lookup from a `minor` code to a label; unmapped or missing codes become
# 'Other'.
minor_dict = {
    0: 'Not applicable',
    1: 'Question',
    2: 'Answer'
}
merged_df['written_type'] = merged_df['minor'].map(minor_dict).fillna('Other')

def extract_text_from_html(html_content):
    """Return the text of an HTML string with the markup removed.

    Missing values and anything that is not a ``str`` yield ``''``.
    """
    # Guard clause: nothing to parse for missing or non-string input.
    if pd.isna(html_content) or not isinstance(html_content, str):
        return ''
    parsed = BeautifulSoup(html_content, 'html.parser')
    return parsed.get_text()
# Strip markup from `body`; extract_text_from_html returns '' for non-strings.
merged_df['body'] = merged_df['body'].apply(extract_text_from_html)

# Rows with collapsed == 0 use their own body; every other row (including a
# missing `collapsed`) takes the body from the '_collasped' merge column.
merged_df['speech_body'] = np.where(
    merged_df['collapsed'] == 0,
    merged_df['body'],
    merged_df['body_collasped']
)

# Keep only rows that ended up with non-empty text. `.copy()` makes the
# filtered result an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning or write to a temporary.
has_text = merged_df['speech_body'].notna() & (merged_df['speech_body'] != '')
merged_df = merged_df[has_text].copy()

merged_df['full_url'] = 'https://www.theyworkforyou.com/' + merged_df['listurl']

merged_df['relevant_speeches'] = merged_df['collapsed'] + 1

# Same selection rule as speech_body, applied to the person id columns.
merged_df['speaker_id'] = np.where(
    merged_df['collapsed'] == 0,
    merged_df['person_id'],
    merged_df['person_id_collasped']
)
# Literal replacement; regex=False pins the behaviour across pandas versions,
# where the default for `regex` has changed.
merged_df['speaker_id'] = merged_df['speaker_id'].str.replace(
    'Not found', 'Not found/Not an MP', regex=False
)
# Ids that are only 1-4 digits, or that contain 'wikipedia', are replaced by
# the placeholder. na=False keeps the mask strictly boolean, so a missing id is
# left missing rather than depending on how NaN behaves inside `|`/np.where.
speaker_id = merged_df['speaker_id']
is_unusable_id = (
    speaker_id.str.match(r'^\d{1,4}$', na=False)
    | speaker_id.str.contains('wikipedia', na=False)
)
merged_df['speaker_id'] = np.where(
    is_unusable_id,
    'Not found/Not an MP',
    speaker_id
)

# Drop the raw and intermediate columns that are no longer needed now that the
# derived columns above exist.
merged_df = merged_df.drop(
    columns=[
        'htime',
        'section_id',
        'subsection_id',
        'htype',
        'major',
        'minor',
        'person_id',
        'hpos',
        'epobject_id',
        'body',
        'collapsed',
        'relevance',
        'extract',
        'listurl',
        'speaker_member_id',
        'speaker_name',
        'speaker_house',
        'speaker_constituency',
        'speaker_party',
        'speaker_person_id',
        'speaker_url',
        'speaker_office',
        'parent_listurl',
        'person_id_collasped',
        'body_collasped',
    ]
)

merged_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 16436 entries, 0 to 16573
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gid                16436 non-null  object 
 1   hdate              16436 non-null  object 
 2   parent_body        16436 non-null  object 
 3   file_name          16436 non-null  object 
 4   html_file_name     12169 non-null  object 
 5   debate_type        16436 non-null  object 
 6   written_type       16436 non-null  object 
 7   speech_body        16436 non-null  object 
 8   full_url           16436 non-null  object 
 9   relevant_speeches  16436 non-null  float64
 10  speaker_id         16436 non-null  object 
dtypes: float64(1), object(10)
memory usage: 1.5+ MB


In [11]:
# Write the cleaned frame to disk without the index column.
merged_df.to_csv(path_or_buf='all_speeches.csv', index=False)