This notebook could be merged into prepare_ocr_sessions, but it is kept separate because it is a distinct step in the processing pipeline that must be configured for each source PDF (in both code cells).

Documentation: values to use when setting the session folder to process, the Hathi ID for URL references, and the starting page:
- session = '1.July01-July26-1901' 
- hathi_id = 'uc1.32106019788238'
- starting_page = 1
- session = '2.July30-August28-1901' 
- hathi_id = 'uc1.32106019788246'
- starting_page = 25
- session = '3.August29-September25-1901'
- hathi_id = 'uc1.32106019788253'
- starting_page = 17
- session = '4.September26-November07-1901'
- hathi_id = 'uc1.32106019788261'
- starting_page = 41

In [20]:
# Reformat <page> tags to match AU and UK.

import os
import re

# Session folder to process. Choose one of the session names listed in the
# documentation at the top of this notebook. Later cells read this variable.
session = '1.July01-July26-1901'

# Parent folder that contains the individual session folders.
subdirectory_path = 'output/sessions'

# Path of the session folder whose files are rewritten below:
# <subdirectory_path>/<session>.
directory = os.path.join(subdirectory_path, session)

def replace_text_in_files(directory):
    """Rewrite ``<page:N>`` tags as ``<page>N</page>`` in every file under *directory*.

    The directory tree is walked recursively and each file is read as UTF-8.
    A file is written back (in place, as UTF-8) only when at least one tag was
    actually replaced, so files without ``<page:N>`` tags are left untouched
    and keep their modification time.

    Args:
        directory: Root folder to walk.

    Raises:
        UnicodeDecodeError: If a file under *directory* is not valid UTF-8.
    """
    # Compile once; the same pattern is applied to every file in the tree.
    page_tag = re.compile(r'<page:(\d+)>')

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Replace <page:123> with <page>123</page>; subn also reports how
            # many replacements were made.
            new_content, replaced = page_tag.subn(r'<page>\1</page>', content)
            if not replaced:
                # Nothing to change: avoid a pointless rewrite of the file.
                continue

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(new_content)

# Run the tag rewrite over every file under the session directory built above.
# Note: the files are modified in place.
replace_text_in_files(directory)

In [22]:
# Correct page numbers so they refer to the Hansard source rather than the Google PDF, and add the
# corresponding URL for the Hathi online version (see the documentation in the first cell).

# Per-volume configuration; update these values for each source PDF.
# hathi_id is inserted into the "id=" parameter of every generated <url> line.
# hathi_starting_page is the page in the Hathi Trust Google PDF that corresponds to the first page
# of Hansard in the original digitized source; it is the offset used to compute the URL "seq" value.
hathi_id = 'uc1.32106019788238'
hathi_starting_page = 29  
delete_unwanted = 'No'  # 'Yes' trims each file so it starts at the <page>delete_before_page</page> tag; any other value keeps the whole file
delete_before_page = 31  # Page tag at which the kept content begins when delete_unwanted is 'Yes'

# Path of the session folder to process ('session' is defined in the previous cell).
# NOTE: this rebinds subdirectory_path, which the previous cell set to the parent 'output/sessions' folder.
subdirectory_path = os.path.join('output/sessions', session)  # 'session' is already defined

def process_file(file_path, hathi_id, hathi_starting_page, delete_unwanted, delete_before_page, starting_page_number):
    """Renumber the ``<page>`` tags in one file and attach a Hathi Trust URL to each.

    The file is read as UTF-8, rewritten in memory and written back in place
    (also as UTF-8, matching how the tag-reformatting cell writes the files).
    Every line that starts with ``<page>N</page>`` is replaced by a
    sequentially numbered ``<page>`` line followed by a freshly generated
    ``<url>`` line. Existing Hathi ``<url>`` lines are dropped, so the function
    can be re-run on a file it has already processed. Progress is printed.

    Args:
        file_path: Path of the text file to rewrite.
        hathi_id: Volume identifier placed in the ``id=`` URL parameter.
        hathi_starting_page: Offset added to the page number to obtain the
            ``seq=`` URL parameter (``seq = page + hathi_starting_page - 1``).
        delete_unwanted: When exactly ``'Yes'``, everything before the first
            ``<page>{delete_before_page}</page>`` tag is discarded; any other
            value keeps the whole file.
        delete_before_page: Page number of the tag at which kept content
            begins when ``delete_unwanted`` is ``'Yes'``.
        starting_page_number: Number assigned to the first ``<page>`` tag in
            the file; later tags count up from it.

    Returns:
        None. The file at *file_path* is modified in place.
    """
    # Compile once instead of re-parsing the patterns for every line.
    page_tag = re.compile(r'<page>(\d+)</page>')
    existing_url = re.compile(r'<url>https://babel.hathitrust.org/cgi/pt\?id=.*&seq=\d+</url>')

    local_page_number = starting_page_number

    print(f"Processing file: {file_path}")

    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Optionally discard everything before the requested page tag. If the tag
    # is not found, the content is left untouched.
    if delete_unwanted == 'Yes':
        start_index = content.find(f'<page>{delete_before_page}</page>')
        if start_index != -1:
            content = content[start_index:]
            print(f"Unwanted content deleted. Reset local_page_number to {local_page_number}")

    new_lines = []
    for line in content.split('\n'):
        page_match = page_tag.match(line)

        if page_match:
            # Use the running page number, not the number found in the tag.
            url_page_number = local_page_number + hathi_starting_page - 1
            new_lines.append(f'<page>{local_page_number}</page>')
            url = f'<url>https://babel.hathitrust.org/cgi/pt?id={hathi_id}&seq={url_page_number}</url>'
            new_lines.append(url)
            print(f"Updated line: <page>{local_page_number}</page> and {url}")

            # Keep any text that shared the line with the tag rather than
            # silently dropping it; whitespace-only remainders are ignored.
            remainder = line[page_match.end():]
            if remainder.strip():
                new_lines.append(remainder.lstrip())

            local_page_number += 1
        elif existing_url.match(line):
            # Skip the existing URL line; a fresh one is generated per page.
            continue
        else:
            new_lines.append(line)

    updated_content = '\n'.join(new_lines)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(updated_content)

    print(f"Finished processing file: {file_path}")

# Page number to assign to the first <page> tag of each session file.
# Files that are not listed here are left unprocessed.
starting_page_numbers = {
    '1-Monday, 1st July, 1901.txt': 1,
    '2-Tuesday, 2nd July, 1901.txt': 7,
    # Add more files and their starting page numbers as needed
}

# Walk the session folder in sorted order and renumber every configured file.
for filename in sorted(os.listdir(subdirectory_path)):
    # Guard clause: only .txt files with a configured starting page are handled.
    if filename not in starting_page_numbers or not filename.endswith('.txt'):
        continue

    starting_page_number = starting_page_numbers[filename]
    file_path = os.path.join(subdirectory_path, filename)
    process_file(
        file_path,
        hathi_id,
        hathi_starting_page,
        delete_unwanted,
        delete_before_page,
        starting_page_number,
    )

Processing file: output/sessions/1.July01-July26-1901/1-Monday, 1st July, 1901.txt
Finished processing file: output/sessions/1.July01-July26-1901/1-Monday, 1st July, 1901.txt
Processing file: output/sessions/1.July01-July26-1901/2-Tuesday, 2nd July, 1901.txt
Finished processing file: output/sessions/1.July01-July26-1901/2-Tuesday, 2nd July, 1901.txt
