This notebook performs the primary preparation of OCR content generated in Microsoft AI Studio and manually added to the repo in JSON format.

In [11]:
import os, sys, re, json, re, logging

# Make the repository root importable so the shared exclusion lists can be loaded.
# The path is relative to the notebook's working directory; the guard keeps
# sys.path from growing each time this cell is re-run.
root_dir = '../../'
if root_dir not in sys.path:
    sys.path.append(root_dir)

from exclusions import excluded_strings, excluded_patterns

# Send every record of level INFO and above to a log file via the root logger
logger = logging.getLogger()

# Set the level explicitly: logging.basicConfig() does nothing when the root logger
# already has handlers, which would leave the level at WARNING and silently drop
# the INFO records written below and in later cells.
logger.setLevel(logging.INFO)

# Detach any handlers that are already installed, closing each one so that
# re-running this cell does not leak the previously opened log file.
for existing_handler in list(logger.handlers):
    logger.removeHandler(existing_handler)
    existing_handler.close()

# Create a new file handler that overwrites the log file each time.
# UTF-8 is set explicitly so logged OCR text is written the same way on every platform.
file_handler = logging.FileHandler('exclusions.log', mode='w', encoding='utf-8')
file_handler.setLevel(logging.INFO)

# Create a logging format
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(file_handler)

# Log the excluded strings and patterns
logger.info(f"Excluded strings: {excluded_strings}, Excluded patterns: {excluded_patterns}")

In [12]:
# Use the OCR content (manually produced in and downloaded from Microsoft AI Studio) and associated positional data to generate page files from the two column layout, removing digitization metadata.
# Needs to be performed manually for each session, defining the source file name here and running all remaining cells before repeating.

# Define the primary source file you're working with
hansard_source_file_name = '4.September26-November07-1901'  # replace with your file name

# Derive the source and output directories for this source file
source_dir_path = os.path.join('../../source', hansard_source_file_name)
output_pages_dir_path = os.path.join('../../output', 'pages', hansard_source_file_name)
output_sessions_dir_path = os.path.join('../../output', 'sessions', hansard_source_file_name)

# Make sure the source and page output directories exist
os.makedirs(source_dir_path, exist_ok=True)
os.makedirs(output_pages_dir_path, exist_ok=True)

# Get all files in the source directory
files_in_directory = os.listdir(source_dir_path)

# Keep only the JSON files; sorted so the processing order is the same on every run
# (os.listdir returns entries in arbitrary order)
json_files = sorted(file for file in files_in_directory if file.endswith('.json'))
if not json_files:
    logging.warning(f"No JSON files found in {source_dir_path}")

for json_file in json_files:
    input_source_file = os.path.join(source_dir_path, json_file)

    # Load the JSON file. JSON is UTF-8 by specification, so decode it as such
    # instead of relying on the platform's default encoding.
    with open(input_source_file, encoding='utf-8') as f:
        data = json.load(f)

    # Access the "analyzeResult"
    analyze_result = data['analyzeResult']

    # Each page is written to its own file
    for page in analyze_result['pages']:
        # The vertical centre line of the page separates the two columns
        column_separator = page['width'] / 2

        # A page without a 'lines' entry is treated as empty rather than raising KeyError
        page_lines = page.get('lines', [])

        # Assign each line to a column by its left-most x-coordinate
        # (the polygon is a flat list: even indices are x, odd indices are y)
        left_column_lines = [line for line in page_lines if min(line['polygon'][::2]) < column_separator]
        right_column_lines = [line for line in page_lines if min(line['polygon'][::2]) >= column_separator]

        # Sort each column top to bottom by its top-most y-coordinate
        left_column_lines.sort(key=lambda line: min(line['polygon'][1::2]))
        right_column_lines.sort(key=lambda line: min(line['polygon'][1::2]))

        # Reading order: the whole left column, then the whole right column
        sorted_lines = left_column_lines + right_column_lines

        # Vertical bands of the page to exclude. Calibrate to suit.
        # Only these vertical bands are applied; there is no exclusion by x-coordinate.
        y_range = (0.95 * page['height'], page['height'])  # the bottom ~5% of the page
        header_y_range = (0, 0.1 * page['height'])  # the top ~10% of the page

        # Collect the text of every line that lies entirely outside both bands
        kept_contents = []
        for line in sorted_lines:
            y_coordinates = line['polygon'][1::2]
            if any(y_range[0] <= y <= y_range[1] for y in y_coordinates) or any(header_y_range[0] <= y <= header_y_range[1] for y in y_coordinates):
                # Any vertex inside a band excludes the whole line; log it for review
                logging.info(f"Excluded due to y_range or header_y_range: {line}")
                continue
            kept_contents.append(line['content'])

        # Every kept line is followed by a single space, including the last one
        page_content = "".join(text + " " for text in kept_contents)

        # Write the page content to a new file in the output directory.
        # The encoding is left at the platform default so the later cells that
        # re-read these files decode them the same way.
        output_file_path = os.path.join(output_pages_dir_path, f'page_{page["pageNumber"]}.md')
        with open(output_file_path, 'w') as f:
            f.write(page_content)

In [13]:
# Combine the per-page files into a single master content file, ready for cleaning.

# The master file is created inside the pages output directory, next to the page files it is built from.
combined_file = os.path.join(output_pages_dir_path, 'all_pages_master.txt')

def sort_key(filename):
    """Return the sort key for a file name: its embedded page number.

    The digits of the first ``page_<digits>`` occurrence in ``filename`` are
    returned as an int. A name without such an occurrence maps to positive
    infinity, so it sorts after every numbered page.
    """
    found = re.search(r'page_(\d+)', filename)
    return int(found.group(1)) if found else float('inf')

# Write the page files into the master file in numeric page order.
# Each page contributes a '<page:N>' tag line, its text with leading
# whitespace removed from every line, and a closing newline.
with open(combined_file, 'w') as outfile:
    for filename in sorted(os.listdir(output_pages_dir_path), key=sort_key):
        # Skip anything that is not a page file (this includes the master file itself)
        if not (filename.startswith('page_') and filename.endswith('.md')):
            continue

        page_number = filename.replace('page_', '').replace('.md', '')
        outfile.write(f'<page:{page_number}>\n')

        page_path = os.path.join(output_pages_dir_path, filename)
        with open(page_path, 'r') as infile:
            outfile.writelines(page_line.lstrip() for page_line in infile)
        outfile.write('\n')


In [14]:
# Clean the master content file in place, using local processing only.

# Load the master file as a list of lines (line endings are kept)
with open(combined_file, 'r') as master_in:
    lines = master_in.readlines()

# Pass 1: delete every match of each excluded regular expression.
# Patterns are applied one line at a time, so a match cannot span lines.
for excluded_pattern in excluded_patterns:
    lines = [re.sub(excluded_pattern, '', text) for text in lines]

# Pass 2: delete every literal occurrence of each excluded string, logging each string once
for excluded_string in excluded_strings:
    logging.info(f"Removing string: {excluded_string}")
    lines = [text.replace(excluded_string, '') for text in lines]

# Pass 3: remove leading whitespace. lstrip() also removes the newline of a
# whitespace-only line, so such lines are dropped from the output.
lines = list(map(str.lstrip, lines))

# Overwrite the master file with the cleaned text
master_file_path = os.path.join(output_pages_dir_path, 'all_pages_master.txt')
with open(master_file_path, 'w') as master_out:
    master_out.write(''.join(lines))

In [15]:
# Split the cleaned master file into one file per session. Each session file starts
# with the <page:N> tag of the page on which its heading appears.

# Read the whole master file and close it before any session file is written
with open(master_file_path, 'r') as file:
    content = file.read()

# A date such as 'Thursday, 26th September, 1901'; the day of the month may also be
# written with the letters I, V and X
date_pattern = r'(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday), (?:\d{1,2}|[IVX]{1,3})(?:st|nd|rd|th) \w+, \d{4}'

# A session heading: 'LEGISLATIVE COUNCIL.' followed by a date.
# The single capturing group makes re.split keep each heading in its result.
pattern = r'(LEGISLATIVE COUNCIL\.\s*' + date_pattern + r')'

# The page marker written into the master file ahead of each page's text
page_tag_pattern = r'<page:(\d+)>'

# With one capturing group, re.split returns
# [text before first heading, heading 1, body 1, heading 2, body 2, ...]
parts = re.split(pattern, content)

# The headings sit at the odd indices of the split result
matches = parts[1::2]
if not matches:
    logging.warning(f"No session headings found in {master_file_path}; no session files written")

# Number of the most recent <page:N> tag seen before the current heading
current_page = None

for i in range(1, len(parts), 2):
    heading = parts[i]
    body = parts[i + 1]

    # Extract the date string following 'LEGISLATIVE COUNCIL.'
    date_match = re.search(date_pattern, heading)
    date = date_match.group() if date_match else 'No date'

    # Sessions are numbered from 1 in order of appearance
    filename = f'{i//2+1}-{date}.txt'
    file_path = os.path.join(output_sessions_dir_path, filename)

    # Ensure the directory exists
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # The heading lies on the page named by the last tag in the text preceding it.
    # parts[i - 1] is the previous session's body (or the text before the first
    # heading), so scanning it each time covers everything before this heading.
    preceding_tags = re.findall(page_tag_pattern, parts[i - 1])
    if preceding_tags:
        current_page = int(preceding_tags[-1])

    if current_page is not None:
        first_page_number = current_page
    else:
        # No tag before the heading: fall back to the page before the next tag
        following_tag = re.search(page_tag_pattern, body)
        first_page_number = int(following_tag.group(1)) - 1 if following_tag else None

    # Prepend the page tag when one could be determined
    first_page_tag = f'<page:{first_page_number}>\n' if first_page_number is not None else ''

    # Write the tag, the heading and the session text that follows it
    with open(file_path, 'w') as session_file:
        session_file.write(first_page_tag + heading + body)