# Example debate speech and content
<div class="debate-speech__speaker-and-content">
    <h2 class="debate-speech__speaker">
        <a href="/mp/?p=10180">
            <img src="/people-images/mps/10180.jpg" alt="Photo of Iain Duncan Smith">
            <strong class="debate-speech__speaker__name">Iain Duncan Smith</strong>
            <small class="debate-speech__speaker__position">Conservative, Chingford and Woodford
                Green</small>
        </a>
        <a href="/debates/?id=2023-12-18c.1125.0#g1125.1" class="debate-speech__meta__link time">
            3:30,
            18 December 2023 </a>
    </h2>
    <div class="debate-speech__content">
        <p pid="c1125.1/1">(Urgent Question): To ask the <a href="/glossary/?gl=23"
                title="Secretary of State was originally the title given to the two officials who..."
                class="glossary">Secretary of State</a> for Foreign, Commonwealth and Development
            Affairs what steps he is taking to support <a rel="nofollow"
                href="https://en.wikipedia.org/wiki/Jimmy_Lai">Jimmy Lai</a> during his trial and if
            he will call for his immediate and unconditional release.</p>
    </div>
</div>

In [101]:
# Import packages
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import random
import time

In [102]:
# Define functions
def download_and_parse_as_soup(url, file_name):
    """
    Return a debate page as a soup object, using a local HTML cache.

    If 'downloaded_htmls/<file_name>.html' already exists, that file is parsed
    and no request is made. Otherwise the page is requested from `url` after a
    random 3-8 second pause, the response text is saved to that path (the
    cache directory is created if missing), and the response is parsed.

    Parameters
    ----------
    url : str
        Address to request when the page is not cached.
    file_name : str
        Base name (without extension) of the cached html file.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or None when the request raises a
        requests.RequestException or the status code is not 200.
    """
    cache_dir = 'downloaded_htmls'
    filepath = os.path.join(cache_dir, f"{file_name}.html")

    # Cache hit: parse the saved copy; no pause and no network access.
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as file:
            return BeautifulSoup(file, 'html.parser')

    # Random pause before every real request (presumably to be gentle on the
    # remote server).
    time.sleep(random.uniform(3, 8))
    print('Requesting ', url)
    try:
        # A timeout stops a stalled connection from hanging the whole scrape;
        # requests timeouts are RequestException subclasses, so they are
        # reported by the handler below.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Error during requests to {url} : {str(e)}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {url}: Status code {response.status_code}")
        return None

    print('Request successful')
    # Create the cache directory on first use so the write below cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(cache_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(response.text)
    return BeautifulSoup(response.text, 'html.parser')

def extract_df_from_debate(soup, html_file_name):
    """
    Extract the speeches that mention "Hong Kong" from a debate page.

    Take a soup object downloaded from www.theyworkforyou.com/debates and
    look at every 'debate-speech__speaker-and-content' div. For each speech
    whose text contains "Hong Kong", record the person ID and the speech text.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed debate page. It is not modified.
    html_file_name : str
        Identifier of the page, copied into every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns 'person_id', 'body' and 'html_file_name', one row per
        matching speech (no rows if nothing matches). 'person_id' is the part
        of the first link's href after the last '=', or 'Not found' when the
        speech has no link or the link has no href.
    """
    person_ids = []
    bodies = []
    for speech in soup.find_all('div', class_='debate-speech__speaker-and-content'):
        body_div = speech.find('div', class_='debate-speech__content')
        if body_div is None:
            # Wrapper without a content div: there is no text to search.
            continue

        # Join the text nodes with a space so words on either side of links,
        # other inline tags and paragraph breaks are not glued together, then
        # collapse every run of whitespace to a single space.
        body_text = ' '.join(body_div.get_text(separator=' ').split())
        if 'Hong Kong' not in body_text:
            continue

        # NOTE(review): this takes the first <a> in the speech, assumed to be
        # the speaker link as in the sample markup - confirm for speeches
        # whose speaker is not linked.
        try:
            person_id = speech.find('a')['href'].split('=')[-1]
        except (TypeError, KeyError):
            # TypeError: no <a> at all (find returned None);
            # KeyError: the <a> has no href attribute.
            person_id = 'Not found'

        person_ids.append(person_id)
        bodies.append(body_text)

    df = pd.DataFrame({'person_id': person_ids, 'body': bodies})
    df['html_file_name'] = html_file_name
    return df

def append_new_data_to_scrape_results(hansard_hong_kong_df, start_date, end_date):
    """
    Scrape the debates in a date range and add them to 'scrape_results.csv'.

    Rows of `hansard_hong_kong_df` are kept when 'hdate' lies between
    `start_date` and `end_date` (both inclusive) and 'collapsed' is greater
    than 0. For each kept row whose 'gid' is not already present in the
    'html_file_name' column of the results file, the page at
    'https://www.theyworkforyou.com' + row['listurl'] is fetched and parsed,
    and the extracted rows are appended to the results.

    The results file is rewritten when the loop ends, including when it ends
    because of an exception or an interrupt, so work done so far is kept.

    Parameters
    ----------
    hansard_hong_kong_df : pandas.DataFrame
        Must provide the columns 'hdate', 'collapsed', 'gid' and 'listurl'.
    start_date, end_date
        Inclusive bounds compared against the 'hdate' column.

    Returns
    -------
    None
    """
    results_path = 'scrape_results.csv'
    try:
        scrape_results_df = pd.read_csv(results_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing or empty file means "start from scratch". Any other
        # read error is raised so that existing results are never silently
        # replaced by an empty table.
        scrape_results_df = pd.DataFrame(columns=['html_file_name', 'person_id', 'body'])

    selected = (
        (hansard_hong_kong_df['hdate'] >= start_date) &
        (hansard_hong_kong_df['hdate'] <= end_date) &
        (hansard_hong_kong_df['collapsed'] > 0)
    )
    hansard_hong_kong_df_filtered = hansard_hong_kong_df[selected]
    process_total = hansard_hong_kong_df_filtered.shape[0]

    # Set lookup instead of scanning the whole results column for every row.
    already_scraped = set(scrape_results_df['html_file_name'])
    # New frames are collected here and concatenated once at the end.
    new_frames = []
    try:
        for process_count, (_, row) in enumerate(hansard_hong_kong_df_filtered.iterrows(), start=1):
            html_file_name = row['gid']
            print('Processing', process_count, 'of', process_total, ':', html_file_name)
            if html_file_name in already_scraped:
                continue
            url = 'https://www.theyworkforyou.com' + row['listurl']
            soup = download_and_parse_as_soup(url, html_file_name)
            if soup is None:
                print('Process failed:', url)
                continue
            new_frames.append(extract_df_from_debate(soup, html_file_name))
            already_scraped.add(html_file_name)
            print('Appended ', html_file_name)
    finally:
        # Runs on normal completion and on errors/interrupts alike, so a long
        # scrape never loses the pages it has already parsed.
        if new_frames:
            scrape_results_df = pd.concat([scrape_results_df, *new_frames], ignore_index=True)
        scrape_results_df.to_csv(results_path, index=False)

In [103]:
# Load the debate index; 'hdate' is parsed as a date so it can be compared
# with the Timestamp bounds below.
hansard_hong_kong_df = pd.read_csv('hansard_hong_kong.csv', parse_dates=['hdate'])

# Inclusive date window for this scraping run.
start_date, end_date = pd.Timestamp('1900-01-01'), pd.Timestamp('2000-12-31')

# Scrape every debate in the window and update scrape_results.csv.
append_new_data_to_scrape_results(
    hansard_hong_kong_df,
    start_date=start_date,
    end_date=end_date,
)

# Reload the saved results and summarise them.
result_df = pd.read_csv('scrape_results.csv')
result_df.info()

Processing 1 of 1709 : 1938-04-27a.114.4
Requesting  https://www.theyworkforyou.com/debates/?id=1938-04-27a.114.4&amp;s=Hong+Kong
Request successful
Appended  1938-04-27a.114.4
Processing 2 of 1709 : 1938-04-06a.317.0
Requesting  https://www.theyworkforyou.com/debates/?id=1938-04-06a.316.0&amp;s=Hong+Kong#g317.0
Request successful
Appended  1938-04-06a.317.0
Processing 3 of 1709 : 1938-03-16a.381.12
Requesting  https://www.theyworkforyou.com/debates/?id=1938-03-16a.381.11&amp;s=Hong+Kong#g381.12
Request successful
Appended  1938-03-16a.381.12
Processing 4 of 1709 : 1938-03-15a.181.1
Requesting  https://www.theyworkforyou.com/debates/?id=1938-03-15a.181.0&amp;s=Hong+Kong#g181.1
Request successful
Appended  1938-03-15a.181.1
Processing 5 of 1709 : 1938-03-09a.1873.4
Requesting  https://www.theyworkforyou.com/debates/?id=1938-03-09a.1873.4&amp;s=Hong+Kong
Request successful
Appended  1938-03-09a.1873.4
Processing 6 of 1709 : 1938-03-09a.1879.6
Requesting  https://www.theyworkforyou.com/de