In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import random
import time

In [2]:
def download_and_parse_as_soup(url, file_name, timeout=30):
    """
    Return the page at `url` as a BeautifulSoup object, using a local HTML cache.

    The cache file is `downloaded_htmls/<file_name>.html`. If it already
    exists it is parsed and no request is made. Otherwise the function waits
    a random 3-8 seconds, requests `url`, and on a 200 response writes the
    response text to the cache file before parsing it.

    Parameters
    ----------
    url : str
        Address to request when no cached copy exists.
    file_name : str
        Stem of the cache file (".html" is appended).
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout, so a stalled
        connection cannot block the scrape indefinitely. Defaults to 30.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or None when the response status is not 200 or
        a `requests.RequestException` is raised.
    """
    cache_dir = 'downloaded_htmls'
    filepath = os.path.join(cache_dir, f"{file_name}.html")
    try:
        if os.path.exists(filepath):
            with open(filepath, 'r', encoding='utf-8') as file:
                return BeautifulSoup(file, 'html.parser')

        # Randomised pause before every real request to avoid hammering the server.
        time.sleep(random.uniform(3, 8))
        print('Requesting ', url)
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve {url}: Status code {response.status_code}")
            return None

        print('Request successful')
        # The cache directory may not exist on a fresh checkout; without this
        # the open() below raises FileNotFoundError on the first download.
        os.makedirs(cache_dir, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(response.text)
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as e:
        print(f"Error during requests to {url} : {str(e)}")
        return None

def extract_df_from_debate(soup, html_file_name):
    """
    Collect the speeches mentioning "Hong Kong" from one parsed debate page.

    Every `div.debate-speech__speaker-and-content` in `soup` is treated as
    one speech. The person ID is whatever follows the last "=" in the href of
    the first link inside that div ('Not found' when there is no such link or
    it has no href). The speech text comes from the nested
    `div.debate-speech__content`; speeches without that div are skipped.

    Note: link tags inside the content div are replaced in place by their
    text, so `soup` is modified by this call.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed debate page (as downloaded from www.theyworkforyou.com/debates).
    html_file_name : str
        Identifier stored in the `html_file_name` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns `person_id`, `body`, `html_file_name`; one row per speech
        whose text contains "Hong Kong" (possibly zero rows).
    """
    person_ids = []
    bodies = []
    speech_divs = soup.find_all('div', class_='debate-speech__speaker-and-content')
    for div in speech_divs:
        try:
            person_id = div.find('a')['href'].split('=')[-1]
        except (TypeError, KeyError, AttributeError):
            # No link in the speech (find() returned None) or link without href.
            person_id = 'Not found'

        body_div = div.find('div', class_='debate-speech__content')
        if body_div is None:
            # Nothing to extract; previously this crashed on None.find_all.
            continue

        # get_text(strip=True) glues neighbouring text nodes together, so wrap
        # link text in a sentinel that survives stripping and becomes a space.
        for a_tag in body_div.find_all('a'):
            a_tag.replace_with('_SPACE_' + a_tag.get_text() + '_SPACE_')
        raw_text = body_div.get_text(strip=True).replace('_SPACE_', ' ')
        # Collapse whitespace AFTER restoring the sentinel so links at the
        # start/end or side by side do not leave stray or doubled spaces.
        body_text = ' '.join(raw_text.split())

        if 'Hong Kong' in body_text:
            person_ids.append(person_id)
            bodies.append(body_text)

    df = pd.DataFrame({'person_id': person_ids, 'body': bodies})
    df['html_file_name'] = html_file_name
    return df

def append_new_data_to_scrape_results(hansard_hong_kong_df, start_date, end_date):
    """
    Scrape the debates in a date window and add them to scrape_results.csv.

    Existing results are read from `intermediate_outputs/scrape_results.csv`
    (an empty frame is used when the file is missing or empty). Rows of
    `hansard_hong_kong_df` with `start_date <= hdate <= end_date` and
    `collapsed > 0` are processed in order; a row is skipped when its `gid`
    already appears in the results' `html_file_name` column. For the rest,
    the page at `https://www.theyworkforyou.com` + `listurl` is fetched with
    `download_and_parse_as_soup` and parsed with `extract_df_from_debate`.
    The combined results are written back to the same CSV at the end.

    Parameters
    ----------
    hansard_hong_kong_df : pandas.DataFrame
        Must provide the columns `hdate`, `collapsed`, `gid` and `listurl`.
    start_date, end_date :
        Inclusive bounds compared against `hdate`.

    Returns
    -------
    None
    """
    results_path = 'intermediate_outputs/scrape_results.csv'
    try:
        scrape_results_df = pd.read_csv(results_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only "no results yet" starts from scratch; any other read error
        # propagates instead of silently leading to an overwrite of the file.
        scrape_results_df = pd.DataFrame(columns=['html_file_name', 'person_id', 'body'])

    hansard_hong_kong_df_filtered = hansard_hong_kong_df[
        (hansard_hong_kong_df['hdate'] >= start_date) &
        (hansard_hong_kong_df['hdate'] <= end_date) &
        (hansard_hong_kong_df['collapsed'] > 0)]
    process_total = hansard_hong_kong_df_filtered.shape[0]

    # Set lookup instead of rescanning the whole column on every iteration.
    already_scraped = set(scrape_results_df['html_file_name'])
    # Collected and concatenated once; concat inside the loop is quadratic.
    new_frames = []

    for process_count, (_, row) in enumerate(hansard_hong_kong_df_filtered.iterrows(), start=1):
        html_file_name = row['gid']
        print('Processing', process_count, 'of', process_total, ':', html_file_name)
        if html_file_name in already_scraped:
            continue
        url = 'https://www.theyworkforyou.com' + row['listurl']
        soup = download_and_parse_as_soup(url, html_file_name)
        if soup:
            new_data_df = extract_df_from_debate(soup, html_file_name)
            if not new_data_df.empty:
                new_frames.append(new_data_df)
            # Mark as handled for this run so a repeated gid is not re-parsed.
            already_scraped.add(html_file_name)
            print('Appended ', html_file_name)
        else:
            print('Process failed:', url)

    frames = [frame for frame in [scrape_results_df] + new_frames if not frame.empty]
    if frames:
        combined_df = pd.concat(frames, ignore_index=True)
        # Keep the existing column order first, then any columns new to it.
        column_order = list(scrape_results_df.columns) + [
            col for col in combined_df.columns if col not in scrape_results_df.columns]
        combined_df = combined_df.reindex(columns=column_order)
    else:
        combined_df = scrape_results_df

    # Ensure the output folder exists so the final write cannot fail after
    # the whole scrape has completed.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    combined_df.to_csv(results_path, index=False)

In [3]:
# Load the debate index; `hdate` is parsed as datetime so it can be compared
# with the Timestamp bounds defined below.
hansard_hong_kong_df = pd.read_csv('intermediate_outputs/hansard_hong_kong.csv', parse_dates=['hdate'])
# Date window handed to the scraper (it keeps rows with start <= hdate <= end).
start_date = pd.Timestamp('1800-01-01')
end_date = pd.Timestamp('2024-12-31')

# Process every debate in the window that is not yet in scrape_results.csv;
# the function rewrites that CSV when it finishes.
append_new_data_to_scrape_results(hansard_hong_kong_df, start_date, end_date)
# Reload the saved results from disk and print a summary of the frame.
result_df = pd.read_csv('intermediate_outputs/scrape_results.csv')
result_df.info()

Processing 1 of 3310 : 1938-04-27a.114.4
Processing 2 of 3310 : 1938-04-06a.317.0
Processing 3 of 3310 : 1938-03-16a.381.12
Processing 4 of 3310 : 1938-03-15a.181.1
Processing 5 of 3310 : 1938-03-09a.1873.4
Processing 6 of 3310 : 1938-03-09a.1879.6
Processing 7 of 3310 : 1938-03-09a.1914.4
Processing 8 of 3310 : 1938-02-08a.813.2
Processing 9 of 3310 : 1937-12-21a.1769.1
Processing 10 of 3310 : 1937-12-21a.1778.14
Processing 11 of 3310 : 1937-12-21a.1793.3
Processing 12 of 3310 : 1937-12-20a.1578.2
Processing 13 of 3310 : 1937-12-13a.811.2
Processing 14 of 3310 : 1937-12-08a.361.4
Processing 15 of 3310 : 1937-12-08a.364.7
Processing 16 of 3310 : 1937-12-06a.8.0
Processing 17 of 3310 : 1937-12-06a.30.6
Processing 18 of 3310 : 1937-11-17a.379.4
Processing 19 of 3310 : 1937-11-10a.1760.3
Processing 20 of 3310 : 1937-11-03a.922.3
Processing 21 of 3310 : 1937-11-01a.633.1
Processing 22 of 3310 : 1937-10-28a.292.1
Processing 23 of 3310 : 1937-06-28a.1631.8
Processing 24 of 3310 : 1937-06-02a