In [1]:
# Import packages
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import chardet
import traceback

In [2]:
# Define functions
def detect_encoding(file_path):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read in binary mode and handed to ``chardet.detect``;
    the value stored under the ``'encoding'`` key of its result is returned.

    NOTE(review): chardet may report ``None`` when it cannot decide (see the
    chardet documentation) - confirm that callers cope with that value.
    """
    with open(file_path, mode='rb') as binary_file:
        detection = chardet.detect(binary_file.read())
    return detection['encoding']

def read_html_content(file_path, encoding):
    """Read an htm/html file as text and return its full content.

    Args:
        file_path: Path of the file to read.
        encoding: Encoding name passed straight to ``open``.

    Returns:
        The decoded content of the file as a single string.
    """
    with open(file_path, mode='r', encoding=encoding) as html_file:
        text = html_file.read()
    return text

def extract_and_convert_date(title):
    """Find a date inside a title string and return it as a pandas Timestamp.

    Two notations are recognised, and the first occurrence found by the
    regular expression is used:

    * ``'Month DD, YYYY'`` (full English month name) - tried first.
    * ``'DD.MM.YYYY'`` / ``'DD.MM.YY'`` - tried only when the first notation
      is absent, and always read day-first.

    A two-digit year below 90 is placed in the 2000s, any other two-digit
    year in the 1900s.

    Returns:
        The parsed date (result of ``pd.to_datetime``), or ``None`` when the
        title contains neither notation.
    """
    long_form = re.search(
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b',
        title,
    )
    if long_form is not None:
        return pd.to_datetime(long_form.group(0))

    numeric_form = re.search(r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b', title)
    if numeric_form is None:
        return None

    # Split off the year so that a two-digit year can be given its century.
    day_and_month, _, year = numeric_form.group(0).rpartition('.')
    if len(year) == 2:
        century = '20' if int(year) < 90 else '19'
        year = century + year
    return pd.to_datetime(day_and_month + '.' + year, dayfirst=True)

def extract_date_from_title(filename, soup):
    """Return the date a file reports on, taken from its <title> text.

    For the file names listed below a hard-coded date string is used in place
    of the <title> text (presumably because their titles do not carry a usable
    date - confirm against the source files). The chosen text is then parsed
    by ``extract_and_convert_date``.

    Args:
        filename: Name of the html file being processed.
        soup: Parsed document; only consulted when ``filename`` has no
            hard-coded date.

    Returns:
        Whatever ``extract_and_convert_date`` returns for the chosen text.
    """
    hardcoded_titles = {
        'P200908310145.htm': 'July 31, 2009',
        '1231116.htm': 'November 30, 2001',
        '0331166.htm': 'February 28, 1999',
        '1130211.htm': 'October 31, 2001',
        '1031118.htm': 'September 30, 2000',
        '0228134.htm': 'January 31, 2001',
        '1031041.htm': 'September 30, 1998',
        '0629227.htm': 'May 31, 2001',
        'P200907310169.htm': 'June 30, 2009',
        '0430173.htm': 'March 31, 1999',
        '1231131.htm': 'November 30, 1998',
        '0929278.htm': 'August 31, 2000',
        '1130109.htm': 'October 31, 1998',
        '0130063.htm': 'December 31, 1998',
    }
    title_text = hardcoded_titles.get(filename)
    if title_text is None:
        # No override for this file: fall back to the document's own <title>.
        title_text = soup.find('title').get_text(strip=True)
    return extract_and_convert_date(title_text)

def convert_to_negative_if_parentheses(num_str):
    """Parse a formatted number string into a float, treating ``(x)`` as ``-x``.

    Thousands separators (``,``) are removed and surrounding whitespace is
    ignored, so ``' (1,234.5) '`` parses the same way as ``'(1,234.5)'``.

    Args:
        num_str: Text to parse, e.g. ``'1,234.5'`` or ``'(1,234.5)'``.

    Returns:
        The value as a float (negative when the text is wrapped in
        parentheses), or ``None`` when the input is not a string, is one of
        the placeholders ``''``, ``'-'``, ``'()'``, or is not a valid number.
    """
    # Callers use the None result as "not a number"; never raise for odd input.
    if not isinstance(num_str, str):
        return None

    clean_str = num_str.strip().replace(',', '')
    if clean_str in ('', '-', '()'):
        return None

    # Accounting notation: a value wrapped in parentheses is negative.
    is_negative = clean_str.startswith('(') and clean_str.endswith(')')
    if is_negative:
        clean_str = clean_str[1:-1]

    try:
        number = float(clean_str)
    except ValueError:
        return None
    return -number if is_negative else number

def count_words(string):
    """Return the number of whitespace-separated words in a string.

    Runs of whitespace count as a single separator and leading/trailing
    whitespace is ignored. An empty or whitespace-only string contains no
    words, so the result is 0.
    """
    # str.split() with no argument already collapses whitespace runs and
    # yields an empty list for blank input.
    return len(string.split())

def extract_data_from_content(content, data_dict):
    """Extract financial data from files that do not organise financial data with <table>.

    Although the files all display the financial data line by line in a table-like format
    when opened in a browser, different years use different html tags to separate each line.
    The raw content is therefore split on each candidate separator in turn, every resulting
    piece is split into text lines, and each line is cleaned and inspected.

    A cleaned line is recorded only when it has 2 to 4 words and its second word parses as a
    number: the first word becomes the key and the parsed number the value.

    Args:
        content: Raw text content of one html file.
        data_dict: Dictionary that receives the extracted values; it is modified in place,
            and a key found again later overwrites the earlier value.

    Returns:
        The same ``data_dict`` object.
    """
    # One full pass over the content per candidate separator. All passes write into the same
    # dictionary, so a later pass overwrites a key already set by an earlier pass.
    for seperator in ['<br/>', '<pre>', '<br />', '<p>']:
        parts = content.split(seperator)
        # When the line of text containing fiscal reserves information does not specify whether it belongs to the start or the end of the period, the first appearance is assigned to the start, and the second appearance is assigned to the end.
        # The counter is reset at the beginning of every separator pass.
        fiscal_reserve_count = 0
        for string_multiline in parts:
            string_line = string_multiline.splitlines()
            for string in string_line:
                # Each line undergoes a series of transformations until it becomes a string with the first word as the key and the second word as the value.
                string = string.replace('&nbsp;', ' ')
                # Remove footnote markers such as '(Note 1)' or '(note 2)'.
                note_pattern = r'\([Nn]ote \d+\)'
                string = re.sub(note_pattern, '', string)
                # Collapse the two-word labels into one word so the value becomes the second word.
                if 'Surplus after' in string:
                    string = string.replace('Surplus after', 'surplus')
                if 'Deficit after' in string:
                    string = string.replace('Deficit after', 'deficit')
                # Strip leftover separator tags and underscores from the line.
                for i in ['<br/>', '<pre>', '<br />', '<p>', '_', '</pre>']:
                    string = string.replace(i, '')
                # Very often, 'Fiscal Reserves at start/end of period' is split into two lines to maintain the table look. However, different files split the line at different points, and while some place the value after the first line, some others place the value after the second line.
                # Start-of-period label: only while no start value has been seen yet in this pass, and never on lines that mention 'table'.
                if fiscal_reserve_count == 0 and ('Fiscal Reserves' in string or 'of period' in string or 'at start' in string) and 'table' not in string.lower():
                    # Each possible fragment of the label is replaced by the single key word 'fiscal_reserve_start'; the order of these replacements matters.
                    string = string.replace('Fiscal Reserves at start of period', 'fiscal_reserve_start')
                    string = string.replace('Fiscal Reserves at start of', 'fiscal_reserve_start')
                    string = string.replace('Fiscal Reserves at start', 'fiscal_reserve_start')
                    string = string.replace('Fiscal Reserves at', 'fiscal_reserve_start')
                    string = string.replace('Fiscal Reserves', 'fiscal_reserve_start')
                    string = string.replace('at start of period', 'fiscal_reserve_start')
                    string = string.replace('at start of', 'fiscal_reserve_start')
                    string = string.replace('start of period', 'fiscal_reserve_start')
                    string = string.replace('of period', 'fiscal_reserve_start')
                    # Move on to the end-of-period label only once this line has more than one word (i.e. something follows the key).
                    if count_words(string) > 1:
                        fiscal_reserve_count += 1
                # End-of-period label: the next matching line after the start value has been seen.
                elif fiscal_reserve_count == 1 and ('Fiscal Reserves' in string or 'of period' in string or 'end of' in string):
                    string = string.replace('Fiscal Reserves at end of period', 'fiscal_reserve_end')
                    string = string.replace('Fiscal Reserves at end', 'fiscal_reserve_end')
                    string = string.replace('Fiscal Reserves at', 'fiscal_reserve_end')
                    string = string.replace('Fiscal Reserves', 'fiscal_reserve_end')
                    string = string.replace('at end of period', 'fiscal_reserve_end')
                    string = string.replace('end of period', 'fiscal_reserve_end')
                    string = string.replace('of period', 'fiscal_reserve_end')
                # Only lines with 2 to 4 words are treated as key/value rows; words after the second are ignored.
                if count_words(string) in [2,3,4]:
                    words = re.split(r'\s+', string.strip())
                    key = words[0]
                    value_str = words[1]
                    value = convert_to_negative_if_parentheses(value_str)
                    # Keep the pair only when the second word parsed as a number.
                    if key and (type(value) == int or type(value) == float):
                        data_dict[key] = value
    return data_dict

def extract_data_from_table(soup, data_dict):
    """Extract financial data from files that organise financial data with <table>.

    Every <tr> of every <table> with at least two <td> cells is inspected: the
    text of the first cell (with '(Note N)' markers removed) is the key and the
    text of the second cell, parsed as a number, is the value. Rows whose key is
    empty or whose second cell is not a number are skipped.

    Args:
        soup: Parsed document to search for tables.
        data_dict: Dictionary that receives the values; modified in place.

    Returns:
        The same ``data_dict`` object.
    """
    footnote_marker = r'\([Nn]ote \d+\)'
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) <= 1:
                # A label/value pair needs at least two cells.
                continue
            label = re.sub(footnote_marker, '', cells[0].get_text(strip=True))
            amount = convert_to_negative_if_parentheses(cells[1].get_text(strip=True))
            if label and type(amount) in (int, float):
                data_dict[label] = amount
    return data_dict

def append_manual_extractions(filename, data_dict):
    """Add hand-entered values for files the automatic extraction cannot handle.

    Some files are structured uniquely and the extraction code fails on them.
    For those file names the missing values are kept in the lookup table below
    and written into ``data_dict``, overwriting any value already stored under
    the same key. Other file names leave ``data_dict`` unchanged.

    Args:
        filename: Name of the html file being processed.
        data_dict: Dictionary that receives the values; modified in place.

    Returns:
        The same ``data_dict`` object.
    """
    manual_values = {
        '0430184.htm': {
            'revenue': 11251.9,
            'expenditure': -24866.9,
            'surplus_deficit': -13615.0,
            'fiscal_reserve_start': 380572.6,
            'fiscal_reserve_end': 372513.7,
            'proceed_repayment': 5556.10,
        },
        '1231131.htm': {
            'fiscal_reserve_start': 415684.2,
            'fiscal_reserve_end': 430285.6,
        },
        '0131169.htm': {
            'fiscal_reserve_start': 375345.7,
            'fiscal_reserve_end': 369493.4,
        },
        'P200607310137.htm': {
            'fiscal_reserve_start': 312179.9,
            'fiscal_reserve_end': 304689.5,
        },
        'P201604290810.htm': {
            'fiscal_reserve_start': 861722.8,
            'fiscal_reserve_end': 842887.4,
        },
        '1031041.htm': {'proceed_repayment': 6278.00},
        '1130109.htm': {'proceed_repayment': 5331.70},
        '0428178.htm': {'proceed_repayment': -6143.00},
        '0430197.htm': {'proceed_repayment': 586.9},
        '04270217.htm': {'proceed_repayment': -717.6},
        'P200604290079.htm': {'proceed_repayment': 717.6},
        'P201408290625.htm': {
            'proceed_repayment': -9687.8,
            'deficit': -2287.5,
        },
    }
    data_dict.update(manual_values.get(filename, {}))
    return data_dict

def convert_dictionary_to_dataframe(data_dict):
    """Turn one file's data dictionary into a single-row data frame with standardised columns.

    Steps, in order:

    1. Column names are lower-cased and stripped of spaces.
    2. Columns known to be noise are dropped, along with any column whose name
       contains 'banking' or parses as a number.
    3. Remaining columns are renamed to the standard names 'surplus_deficit',
       'fiscal_reserve_start', 'fiscal_reserve_end', 'revenue' and 'expenditure'.
    4. Duplicated columns are dropped.

    NOTE(review): step 4 transposes the frame and calls ``drop_duplicates``,
    which appears to compare column *values* rather than column names, and the
    double transpose looks like the reason the result has object dtype - verify
    before relying on either.

    Args:
        data_dict: Mapping of column name to value for one file.

    Returns:
        A data frame with one row.
    """
    def _normalise(name):
        # Lower-case and remove spaces so later name checks are uniform.
        return name.lower().replace(" ", "")

    def _standardise(name):
        # Map the various spellings found in the files onto the standard names.
        if ('surplus' in name or 'deficit' in name) and 'consolidated' not in name:
            return 'surplus_deficit'
        if 'fiscalreserve' in name and 'startofperiod' in name:
            return 'fiscal_reserve_start'
        if 'fiscalreserve' in name and 'endofperiod' in name:
            return 'fiscal_reserve_end'
        if name == '<p>revenue':
            return 'revenue'
        if name == '<p>expenditure':
            return 'expenditure'
        return name

    frame = pd.DataFrame([data_dict]).rename(columns=_normalise)

    # Drop unnecessary columns
    noise_columns = ['feb', 'sector', '<pre>table', 'total:', 'sept', '--', 'period', '(notes', '<p>total', '<p>table', 'jan', 'oct', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'table', 'total']
    frame = frame.drop(columns=noise_columns, errors='ignore')
    banking_or_numeric = [
        name for name in frame.columns
        if ('banking' in name.lower()) or (type(convert_to_negative_if_parentheses(name)) == float)
    ]
    frame = frame.drop(columns=banking_or_numeric)

    # Rename columns to standardised names
    frame = frame.rename(columns=_standardise)

    # Drop duplicated columns
    return frame.T.drop_duplicates().T


In [3]:
# Loop through each html file, extract financial data, and collect one single-row data frame per file.
directory = 'financial_results_htmls'
# os.listdir returns names in arbitrary order; sort them so every run processes the files in the same order.
html_files = sorted(
    name for name in os.listdir(directory)
    if name.endswith(('.htm', '.html'))
)
frames = []
for filename in tqdm(html_files, desc='Processing'):
    try:
        file_path = os.path.join(directory, filename)
        encoding = detect_encoding(file_path)
        content = read_html_content(file_path, encoding)
        soup = BeautifulSoup(content, 'html.parser')
        date = extract_date_from_title(filename, soup)
        data_dict = {'file_name': filename,'month_ended':date}
        data_dict = extract_data_from_content(content, data_dict)
        data_dict = extract_data_from_table(soup, data_dict)
        data_dict = append_manual_extractions(filename, data_dict)
        new_data = convert_dictionary_to_dataframe(data_dict)
        # Reject ambiguous rows here, inside the per-file error handling, so that
        # one bad file cannot make the single concatenation below fail for all files.
        if new_data.columns.has_duplicates:
            duplicated_names = list(new_data.columns[new_data.columns.duplicated()])
            raise ValueError(f"duplicate column names after standardisation: {duplicated_names}")
        frames.append(new_data)
    except Exception as e:
        # Best effort: report the file that failed and carry on with the rest.
        print(filename)
        print(f"An error occurred: {e}")
        traceback.print_exc()

# Concatenate once at the end instead of growing the data frame inside the loop,
# which copies all previous rows on every iteration.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Print a summary of the data frame
df = df.sort_values('month_ended')
df.info()

Processing: 100%|██████████| 248/248 [00:06<00:00, 36.51it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247 entries, 97 to 93
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   file_name             247 non-null    object        
 1   month_ended           247 non-null    datetime64[ns]
 2   revenue               247 non-null    object        
 3   expenditure           247 non-null    object        
 4   surplus_deficit       247 non-null    object        
 5   fiscal_reserve_start  247 non-null    object        
 6   fiscal_reserve_end    247 non-null    object        
 7   proceed_repayment     8 non-null      object        
dtypes: datetime64[ns](1), object(7)
memory usage: 17.4+ KB





In [4]:
# Export the data frame as a csv file named 'extraction_result.csv' (the row index is not written)
df.to_csv('extraction_result.csv', index=False)