This notebook could be merged into prepare_ocr_sessions, but it is kept separate because it is a distinct step in the processing pipeline that must be configured for each source PDF (in both code cells).

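Reference values for each source PDF: the session folder to process (`session`), the Hathi ID used in URL references (`hathi_id`), and the starting page (`starting_page`, entered as `hathi_starting_page` in the second code cell):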
- session = '1.July01-July26-1901' 
- hathi_id = 'uc1.32106019788238'
- starting_page = 29
- session = '2.July30-August28-1901' 
- hathi_id = 'uc1.32106019788246'
- starting_page = 31
- session = '3.August29-September25-1901'
- hathi_id = 'uc1.32106019788253'
- starting_page = 21
- session = '4.September26-November07-1901'
- hathi_id = 'uc1.32106019788261'
- starting_page = 41

In [5]:
# Reformat <page> tags to match AU and UK.

import os
import re

# Parent folder that contains one sub-folder per session
subdirectory_path = 'output/sessions'

# Session folder to process in this run (see the list at the top of the notebook)
session = '2.July30-August28-1901'

# Full path of the session folder whose files will be rewritten
directory = os.path.join(subdirectory_path, session)

def replace_text_in_files(directory):
    """Rewrite ``<page:N>`` tags as ``<page>N</page>`` in every file under *directory*.

    The directory tree is walked recursively. Each file is read as UTF-8;
    files that cannot be decoded as UTF-8 (for example binary housekeeping
    files) are reported and skipped instead of aborting the whole run.
    A file is only written back when at least one tag was actually
    replaced, so files without old-style tags are left untouched.

    Args:
        directory: Path of the folder whose files should be converted.
    """
    old_page_tag = re.compile(r'<page:(\d+)>')

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Not a UTF-8 text file, so it cannot hold the tags we convert.
                print(f"Skipping non-UTF-8 file: {file_path}")
                continue

            # Replace <page:123> with <page>123</page>
            new_content, replacements = old_page_tag.subn(r'<page>\1</page>', content)

            # Avoid rewriting (and re-timestamping) files that did not change.
            if replacements:
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(new_content)

# Convert the page tags in every file under the session folder selected above
replace_text_in_files(directory)

In [6]:
# Correct page numbers so they refer to the Hansard source rather than the Google pdf, and add the corresponding URL for the Hathi online version (see documentation in first cell).

# Folder holding the files of the session chosen in the previous code cell
subdirectory_path = os.path.join('output/sessions', session)

# Hathi ID and starting page for that session, taken from the documentation in the first cell.
# hathi_starting_page is the page in the Hathi Trust Google PDF that corresponds to the first
# page of Hansard in the original digitized source.
hathi_id = 'uc1.32106019788246'
hathi_starting_page = 31

# Optional trimming: when delete_unwanted is 'Yes', everything that precedes the
# <page>{delete_before_page}</page> tag is removed from the file in which that tag is found.
delete_before_page = 25
delete_unwanted = 'Yes'

# Running page counter shared by all files of the session, so that numbering
# continues from one file to the next instead of restarting at 1.
global_page_number = 1
# Becomes True once the unwanted leading content has been trimmed, so the
# trimming happens at most once per run of this cell.
unwanted_deleted = False

def process_file(file_path: str, hathi_id: str, hathi_starting_page: int,
                 delete_unwanted: str, delete_before_page: int) -> None:
    """Renumber the ``<page>`` tags of one file and attach a Hathi URL to each.

    Every line that starts with ``<page>N</page>`` is replaced by
    ``<page>G</page>`` followed by a ``<url>...</url>`` line, where ``G`` is
    the module-level ``global_page_number`` (incremented after each tag) and
    the URL ``seq`` value is ``G + hathi_starting_page - 1``. Existing Hathi
    ``<url>`` lines are dropped so the cell does not duplicate them. Any text
    that follows a page tag on the same line is kept on its own line after
    the URL rather than being discarded. The file is rewritten in place and
    is read and written as UTF-8, matching the tag-conversion cell.

    Args:
        file_path: Path of the text file to rewrite.
        hathi_id: Hathi volume identifier inserted into each URL.
        hathi_starting_page: Offset added to the running page number to
            obtain the URL ``seq`` value.
        delete_unwanted: When equal to ``'Yes'``, content preceding the
            ``<page>{delete_before_page}</page>`` tag is removed the first
            time that tag is found, and the running page number restarts at 1.
        delete_before_page: Page number of the tag at which kept content begins.

    Notes:
        Files processed before the ``delete_before_page`` tag is found are
        renumbered but not trimmed. Because the tags are renumbered in place,
        running the cell a second time with ``delete_unwanted == 'Yes'``
        searches the already renumbered tags and can trim content again.
    """
    global global_page_number
    global unwanted_deleted

    # Read the content of the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Trim everything before the chosen page tag, once per run
    if delete_unwanted == 'Yes' and not unwanted_deleted:
        start_index = content.find(f'<page>{delete_before_page}</page>')
        if start_index != -1:
            content = content[start_index:]
            unwanted_deleted = True
            global_page_number = 1  # Reset the global page number

    # Report the original page numbers found in this file (diagnostic only)
    page_tags = re.findall(r'<page>(\d+)</page>', content)
    print(f"Found page tags: {page_tags}")

    new_lines = []
    for line in content.split('\n'):
        page_match = re.match(r'<page>(\d+)</page>', line)

        if page_match:
            # Use the global_page_number instead of the original page number
            url_page_number = global_page_number + hathi_starting_page - 1
            print(f"New page number: {global_page_number}, URL page number: {url_page_number}")
            new_lines.append(f'<page>{global_page_number}</page>')
            url = f'<url>https://babel.hathitrust.org/cgi/pt?id={hathi_id}&seq={url_page_number}</url>'
            new_lines.append(url)
            global_page_number += 1

            # Keep any text sharing the line with the tag instead of losing it
            trailing_text = line[page_match.end():]
            if trailing_text.strip():
                new_lines.append(trailing_text.lstrip())
            continue

        # Skip an existing URL line; a fresh one is emitted after each page tag
        if re.match(r'<url>https://babel.hathitrust.org/cgi/pt\?id=.*&seq=\d+</url>', line):
            continue

        new_lines.append(line)

    # Write the updated content to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(new_lines))

# Process the session's .txt files in a deterministic, numeric-aware order.
# os.listdir returns names in arbitrary order, but the page counter carries
# over from one file to the next, so the order of processing decides which
# page numbers each file receives.
# NOTE(review): this assumes the file names sort into page order once their
# digit runs are compared as numbers - confirm against the actual file names.
def _natural_sort_key(name):
    """Return a sort key that compares the digit runs in *name* as integers."""
    # re.split with a capturing group alternates text, digits, text, ...
    # so odd positions always hold digit runs.
    return [int(part) if index % 2 else part
            for index, part in enumerate(re.split(r'(\d+)', name))]

for filename in sorted(os.listdir(subdirectory_path), key=_natural_sort_key):
    if filename.endswith('.txt'):
        file_path = os.path.join(subdirectory_path, filename)
        process_file(file_path, hathi_id, hathi_starting_page, delete_unwanted, delete_before_page)

Found page tags: ['487', '488', '489', '490', '491', '492', '493', '494', '495', '496', '497', '498', '499', '500', '501', '502', '503', '504', '505', '506', '507', '508', '509', '510', '511', '512', '513', '514', '515', '516']
New page number: 1, URL page number: 25
New page number: 2, URL page number: 26
New page number: 3, URL page number: 27
New page number: 4, URL page number: 28
New page number: 5, URL page number: 29
New page number: 6, URL page number: 30
New page number: 7, URL page number: 31
New page number: 8, URL page number: 32
New page number: 9, URL page number: 33
New page number: 10, URL page number: 34
New page number: 11, URL page number: 35
New page number: 12, URL page number: 36
New page number: 13, URL page number: 37
New page number: 14, URL page number: 38
New page number: 15, URL page number: 39
New page number: 16, URL page number: 40
New page number: 17, URL page number: 41
New page number: 18, URL page number: 42
New page number: 19, URL page number: 43
Ne