This notebook processes OCR content contained in .json files, generated in Microsoft AI Studio and manually added to the 'source' directory.

In [1]:
# Import modules and set variables and directories.
import os, sys, re, json, re, logging

# Manually set the root directory path
root_dir = '../'
sys.path.append(root_dir)

from modules.exclusions import excluded_strings, excluded_patterns

# Name of the Hansard volume being processed. It selects the source folder,
# the two output folders and the name of the log file.
hansard_source_file_name = '4.September26-November07-1901'
# hathi_id is the unique Hathi identifier for the Google pdf; hathi_starting_page
# is the page in that pdf that corresponds to the first page of Hansard in the
# original digitized source. Both are used to concatenate the url to the Hathi
# Trust source in the last cell of the notebook.
hathi_id = 'uc1.32106019788261'
hathi_starting_page = 41

# Derive the working directories from the volume name
source_dir_path = os.path.join('../source', hansard_source_file_name)
output_pages_dir_path = os.path.join('../output', 'pages', hansard_source_file_name)
output_sessions_dir_path = os.path.join('../output', 'sessions', hansard_source_file_name)
logs_dir_path = os.path.join('../logs')

# Create every working directory up front (no error if it already exists).
# The sessions directory is included so that later cells can list it even when
# no session file has been written yet.
for dir_path in (source_dir_path, output_pages_dir_path, output_sessions_dir_path, logs_dir_path):
    os.makedirs(dir_path, exist_ok=True)

# Configure the root logger directly. Setting the level explicitly guarantees
# that INFO records are emitted even if the root logger was configured before
# this cell ran.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Drop any handlers left over from an earlier run of this cell, closing file
# handlers so their log files are not left open.
for old_handler in list(logger.handlers):
    logger.removeHandler(old_handler)
    if isinstance(old_handler, logging.FileHandler):
        old_handler.close()

# Create a new file handler that overwrites the log file each time. The file
# name follows the volume name so the two cannot drift apart.
log_file_path = os.path.join(logs_dir_path, f'{hansard_source_file_name}_exclusions.log')
file_handler = logging.FileHandler(log_file_path, mode='w', encoding='utf-8')
file_handler.setLevel(logging.INFO)

# Create a logging format
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(file_handler)

# Log the excluded strings and patterns
logger.info(f"Excluded strings: {excluded_strings}, Excluded patterns: {excluded_patterns}")

In [2]:
# Use the OCR content (manually produced in and downloaded from Microsoft AI Studio) and associated positional data to generate page files from the two column layout, removing digitization metadata.

# Vertical bands of the page that are dropped, as fractions of the page height.
# Calibrate to suit.
HEADER_END_FRACTION = 0.1     # the top ~10% of the page is treated as header
FOOTER_START_FRACTION = 0.95  # the bottom ~5% of the page is treated as footer


def _lines_in_reading_order(page):
    """Return the OCR lines of one page in two-column reading order.

    Each line's 'polygon' is read as a flat coordinate list in which the even
    positions are x values and the odd positions are y values. A line belongs
    to the left column when its smallest x lies left of the page's vertical
    midline, otherwise to the right column. Each column is sorted top to
    bottom by the line's smallest y, and the left column comes first.
    """
    column_separator = page['width'] / 2

    left_column_lines = []
    right_column_lines = []
    for line in page['lines']:
        if min(line['polygon'][::2]) < column_separator:
            left_column_lines.append(line)
        else:
            right_column_lines.append(line)

    def top_edge(line):
        return min(line['polygon'][1::2])

    left_column_lines.sort(key=top_edge)
    right_column_lines.sort(key=top_edge)
    return left_column_lines + right_column_lines


def _is_header_or_footer(line, page_height):
    """Return True when any y-coordinate of the line lies in the header or footer band."""
    header_end = HEADER_END_FRACTION * page_height
    footer_start = FOOTER_START_FRACTION * page_height
    y_coordinates = line['polygon'][1::2]
    return any(
        footer_start <= y <= page_height or 0 <= y <= header_end
        for y in y_coordinates
    )


# Collect the json files of the source directory. They are processed in sorted
# order so that repeated runs behave the same way (os.listdir gives no ordering
# guarantee).
# NOTE(review): page files are named after page["pageNumber"] only, so two json
# files containing the same page number overwrite each other - confirm that each
# source directory holds a single OCR result or disjoint page numbers.
json_files = sorted(file for file in os.listdir(source_dir_path) if file.endswith('.json'))

for json_file in json_files:
    input_source_file = os.path.join(source_dir_path, json_file)

    # Load the JSON file
    with open(input_source_file) as f:
        data = json.load(f)

    # Access the "analyzeResult"
    analyze_result = data['analyzeResult']

    # Iterate over each page
    for page in analyze_result['pages']:
        kept_texts = []

        for line in _lines_in_reading_order(page):
            if _is_header_or_footer(line, page['height']):
                # Running headers/footers are logged and left out of the page text
                logging.info(f"Excluded due to y_range or header_y_range: {line}")
                continue
            kept_texts.append(line['content'])

        # Every kept line is followed by a single space; the page becomes one long line
        page_content = "".join(text + " " for text in kept_texts)

        # Write the page content to a new file in the output directory
        output_file_path = os.path.join(output_pages_dir_path, f'page_{page["pageNumber"]}.md')
        with open(output_file_path, 'w') as f:
            f.write(page_content)

In [3]:
# Combine the pages into a single master content file, for cleaning.

# The master file lives in the same directory as the per-page .md files it is built from.
combined_file = os.path.join(output_pages_dir_path, 'all_pages_master.txt')

def sort_key(filename):
    """Sort key that orders page files by their page number.

    Returns the integer following 'page_' in the file name, or infinity when
    the name contains no such number, so that other files sort last.
    """
    found = re.search(r'page_(\d+)', filename)
    return int(found.group(1)) if found else float('inf')

# Append every page file to the master file in page order. Each page is
# introduced by a '<page:N>' marker line and followed by a newline.
with open(combined_file, 'w') as outfile:
    directory_listing = sorted(os.listdir(output_pages_dir_path), key=sort_key)
    for filename in directory_listing:
        is_page_file = filename.startswith('page_') and filename.endswith('.md')
        if not is_page_file:
            continue

        page_number = filename.replace('page_', '').replace('.md', '')
        outfile.write(f'<page:{page_number}>\n')

        page_path = os.path.join(output_pages_dir_path, filename)
        with open(page_path, 'r') as infile:
            # strip leading whitespace from every line of the page
            outfile.writelines(line.lstrip() for line in infile)
        outfile.write('\n')


In [4]:
# Clean the master content file, using local processing

# Load the master file as a list of lines
with open(combined_file, 'r') as file:
    lines = file.readlines()

# Delete every match of each excluded regular expression, line by line
for pattern in excluded_patterns:
    for index, text in enumerate(lines):
        lines[index] = re.sub(pattern, '', text)

# Delete every occurrence of each excluded literal string, logging the string first
for string in excluded_strings:
    logging.info(f"Removing string: {string}")
    for index, text in enumerate(lines):
        lines[index] = text.replace(string, '')

# Strip leading whitespace (lines made of whitespace only become empty)
lines = list(map(str.lstrip, lines))

# Overwrite the master file with the cleaned text
master_file_path = os.path.join(output_pages_dir_path, 'all_pages_master.txt')
with open(master_file_path, 'w') as file:
    file.write(''.join(lines))

In [5]:
# Split the cleaned master file into sessions, adding page tags.

# Date as printed in a sitting heading, e.g. 'Thursday, 26th September, 1901'
# (the day of the month may also be matched as Roman numerals).
date_pattern = r'(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday), (?:\d{1,2}|[IVX]{1,3})(?:st|nd|rd|th) \w+, \d{4}'
# 'LEGISLATIVE COUNCIL.' followed by a date. The single capturing group makes
# re.split keep each heading as its own list element.
pattern = r'(LEGISLATIVE COUNCIL\.\s*' + date_pattern + r')'
# Page marker as written into the master file
page_marker_pattern = r'<page:(\d+)>'

# Read the whole master file, then close it before any session file is written.
# NOTE(review): the master file is read with the platform default encoding while
# the session files below are written as UTF-8 - confirm both agree on your system.
with open(master_file_path, 'r') as file:
    content = file.read()

# parts[0] is the text before the first heading (discarded); after that the
# list alternates heading, body, heading, body, ...
parts = re.split(pattern, content)

# Ensure the output directory exists
os.makedirs(output_sessions_dir_path, exist_ok=True)

# Number of the most recent page marker seen before the current heading
markers_before_first_heading = re.findall(page_marker_pattern, parts[0])
last_seen_page = int(markers_before_first_heading[-1]) if markers_before_first_heading else None

for i in range(1, len(parts), 2):
    heading = parts[i]
    body = parts[i + 1]

    # Extract the date string following 'LEGISLATIVE COUNCIL.'
    date_match = re.search(date_pattern, heading)
    date = date_match.group() if date_match else 'No date'

    # Format the filename: running session number, then the date
    filename = f'{i//2+1}-{date}.txt'
    file_path = os.path.join(output_sessions_dir_path, filename)

    # The heading itself never carries a page tag, so one is always derived for
    # the page the session starts on: the page before the first marker in the
    # body, or - when the body has no marker at all - the last marker seen
    # before the heading.
    body_pages = [int(number) for number in re.findall(page_marker_pattern, body)]
    if body_pages:
        first_page_number = body_pages[0] - 1
        last_seen_page = body_pages[-1]
    else:
        first_page_number = last_seen_page

    if first_page_number is not None:
        heading = f'<page>{first_page_number}</page>\n' + heading

    # Reformat <page:123> to <page>123</page>
    heading = re.sub(page_marker_pattern, r'<page>\1</page>', heading)
    body = re.sub(page_marker_pattern, r'<page>\1</page>', body)

    # Write the heading and the text that follows it to the session file
    with open(file_path, 'w', encoding='utf-8') as session_file:
        session_file.write(heading + body)

In [6]:
# Needed for this volume to reconcile the page numbers of the (edited) Google PDF used for OCR with those of the Hathi PDF. This can be considered a bug: the best solution would be to re-transcribe (produce new .json source files) from the original Google PDFs, although this would require additional post-OCR cleaning.

def get_last_page_numbers(output_sessions_dir_path):
    """Map each session file to the number in its last <page> tag.

    Args:
        output_sessions_dir_path: directory holding the session '.txt' files.

    Returns:
        dict of file name -> int, in the order the files are visited (sorted by
        the number at the start of the file name; names without one come last).
        Files without any <page>N</page> tag, and files that do not end in
        '.txt', are left out.
    """
    def leading_number(name):
        # Numeric prefix of the file name, or infinity when there is none
        prefix = re.match(r'\d+', name)
        return int(prefix.group()) if prefix else float('inf')

    last_page_numbers = {}

    for filename in sorted(os.listdir(output_sessions_dir_path), key=leading_number):
        if not filename.endswith(".txt"):
            continue

        filepath = os.path.join(output_sessions_dir_path, filename)
        # Session files are written as UTF-8, so read them back the same way
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        page_numbers = re.findall(r'<page>(\d+)</page>', content)
        if page_numbers:
            last_page_numbers[filename] = int(page_numbers[-1])

    return last_page_numbers

def reset_page_tags(directory, last_page_numbers):
    """Renumber the <page> tags of all session files into one running sequence.

    Files are visited in the order of the number at the start of their name.
    Tags are numbered 1, 2, 3, ... across files. When a file's first original
    page number equals the previous file's last original page number (both
    sessions share that page), numbering resumes at the number the shared page
    was given in the previous file instead of advancing.

    All tags of a file are rewritten in a single pass, so a freshly written
    number can never be mistaken for an original tag that is still waiting to
    be replaced.

    Args:
        directory: directory holding the session '.txt' files; they are
            rewritten in place.
        last_page_numbers: not used by this function; kept so existing calls
            keep working.
    """
    def leading_number(name):
        # Numeric prefix of the file name, or infinity when there is none
        prefix = re.match(r'\d+', name)
        return int(prefix.group()) if prefix else float('inf')

    page_tag = re.compile(r'<page>(\d+)</page>')
    page_number = 1                   # next number to hand out
    previous_last_page_number = None  # last ORIGINAL number of the previous tagged file

    for filename in sorted(os.listdir(directory), key=leading_number):
        if not filename.endswith(".txt"):
            continue

        filepath = os.path.join(directory, filename)
        # Session files are written as UTF-8, so read and write them the same way
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        original_numbers = [int(number) for number in page_tag.findall(content)]
        if not original_numbers:
            # Nothing to renumber; leave the file untouched
            continue

        if previous_last_page_number is not None and original_numbers[0] == previous_last_page_number:
            # Shared page: reuse the number given to the previous file's last
            # tag, which is always page_number - 1 at this point.
            page_number -= 1

        # Hand out consecutive numbers to the tags in order of appearance
        new_numbers = iter(range(page_number, page_number + len(original_numbers)))
        content = page_tag.sub(lambda match: f'<page>{next(new_numbers)}</page>', content)
        page_number += len(original_numbers)

        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(content)

        previous_last_page_number = original_numbers[-1]

# First pass to get the last page numbers
# (a dict mapping each session file name to the number in its last <page> tag)
last_page_numbers = get_last_page_numbers(output_sessions_dir_path)

# Second pass to reset the page tags
# (the session files are rewritten in place)
reset_page_tags(output_sessions_dir_path, last_page_numbers)

In [7]:
# Concatenate urls to the Hathi Trust source documents, and add them below the page numbers.

# Initialize the current Hathi page number
# NOTE(review): nothing in the visible cells reads or updates this value afterwards;
# the urls are computed from hathi_starting_page and each page tag. Confirm before removing.
current_hathi_page = hathi_starting_page

def process_file(file_path, hathi_id, hathi_starting_page, verbose=True):
    """Add a HathiTrust <url> line below every <page> tag of one session file.

    The file is rewritten in place. For each line that starts with
    '<page>N</page>' the tag is kept and followed by
    '<url>https://babel.hathitrust.org/cgi/pt?id=<hathi_id>&seq=<S></url>',
    where S = hathi_starting_page + (N - 1). Any text that follows the tag on
    the same line is kept on a line of its own after the url. HathiTrust url
    lines already present are dropped, so running the function again does not
    duplicate them.

    Args:
        file_path: path of the session file to update.
        hathi_id: HathiTrust identifier placed in the 'id' query parameter.
        hathi_starting_page: 'seq' value that corresponds to page number 1.
        verbose: when True (the default) progress is printed, one line per
            page tag plus a start and an end message.
    """
    page_tag = re.compile(r'<page>(\d+)</page>')
    existing_url = re.compile(r'<url>https://babel.hathitrust.org/cgi/pt\?id=.*&seq=\d+</url>')

    if verbose:
        print(f"Processing file: {file_path}")

    # Session files are written as UTF-8, so read and write them the same way
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    new_lines = []
    for line in content.split('\n'):
        page_match = page_tag.match(line)

        if page_match:
            # Keep the original page number
            page_number = int(page_match.group(1))
            new_lines.append(f'<page>{page_number}</page>')

            # The first page maps to hathi_starting_page, the second to the next one, ...
            hathi_page_number = hathi_starting_page + (page_number - 1)
            url = f'https://babel.hathitrust.org/cgi/pt?id={hathi_id}&seq={hathi_page_number}'
            new_lines.append(f'<url>{url}</url>')

            # Do not lose text that shares the line with the tag (a stale url is still dropped)
            trailing_text = line[page_match.end():].strip()
            if trailing_text and not existing_url.match(trailing_text):
                new_lines.append(trailing_text)

            if verbose:
                print(f"Updated line: <page>{page_number}</page> and <url>{url}</url>")
        elif existing_url.match(line):
            # Skip the existing URL line; a fresh one is written after its page tag
            continue
        else:
            new_lines.append(line)

    # Join the new lines to form the updated content
    updated_content = '\n'.join(new_lines)

    # Write the updated content to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(updated_content)

    if verbose:
        print(f"Finished processing file: {file_path}")

# Add the Hathi Trust urls to every session '.txt' file, in sorted name order
session_filenames = [
    name for name in sorted(os.listdir(output_sessions_dir_path))
    if name.endswith('.txt')
]
for filename in session_filenames:
    file_path = os.path.join(output_sessions_dir_path, filename)
    process_file(file_path, hathi_id, hathi_starting_page)

Processing file: ../output/sessions/4.September26-November07-1901/1-Thursday, 26th September, 1901.txt
Updated line: <page>1</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788261&seq=41</url>
Updated line: <page>2</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788261&seq=42</url>
Updated line: <page>3</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788261&seq=43</url>
Updated line: <page>4</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788261&seq=44</url>
Updated line: <page>5</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788261&seq=45</url>
Updated line: <page>6</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788261&seq=46</url>
Updated line: <page>7</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788261&seq=47</url>
Updated line: <page>8</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788261&seq=48</url>
Updated line: <page>9</pa