This notebook could be merged into prepare_ocr_sessions, but it is kept separate because it is a distinct step in the processing pipeline that must be configured for each source PDF (in both code cells).

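Configuration reference: for each source PDF, set the session folder to process (`session`, first code cell) and the Hathi ID and starting page used for URL references (`hathi_id` and `hathi_starting_page`, second code cell; listed below as `starting_page`):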
- session = '1.July01-July26-1901' 
- hathi_id = 'uc1.32106019788238'
- starting_page = 29
- session = '2.July30-August28-1901' 
- hathi_id = 'uc1.32106019788246'
- starting_page = 31
- session = '3.August29-September25-1901'
- hathi_id = 'uc1.32106019788253'
- starting_page = 21
- session = '4.September26-November07-1901'
- hathi_id = 'uc1.32106019788261'
- starting_page = 41

In [1]:
# Reformat <page> tags to match AU and UK.

import os
import re

# Session folder to process; change this for each source PDF (see the list of
# session names in the documentation at the top of the notebook).
session = '1.July01-July26-1901'

# Parent folder that holds one subfolder per session.
subdirectory_path = 'output/sessions'

# Full path of the chosen session folder, i.e. output/sessions/<session>.
directory = os.path.join(subdirectory_path, session)

def replace_text_in_files(directory):
    """Rewrite ``<page:N>`` markers as ``<page>N</page>`` in files under ``directory``.

    Walks ``directory`` recursively, reads every file as UTF-8 and applies the
    substitution. A file is written back only when at least one marker was
    replaced, so files without old-style markers are left untouched (their
    content and modification time do not change).

    Args:
        directory: Root folder to walk.

    Raises:
        UnicodeDecodeError: If a file under ``directory`` is not valid UTF-8.
    """
    # Compile once; the same pattern is applied to every file.
    page_marker = re.compile(r'<page:(\d+)>')

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Replace <page:123> with <page>123</page>
            new_content, replacements = page_marker.subn(r'<page>\1</page>', content)
            if not replacements:
                # Nothing to convert: skip the write instead of rewriting
                # identical content.
                continue
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(new_content)

# Convert the <page:N> markers in every file of the selected session folder
# (the files are modified in place).
replace_text_in_files(directory)

In [3]:
# Correct page numbers so they refer to the Hansard source rather than the Google pdf, and add the corresponding URL for the Hathi online version (see documentation in first cell).
# NOTE(review): the code visible in this cell only inserts <url> lines after
# <page> tags; it does not renumber the <page> tags themselves — confirm
# whether renumbering is still intended.

# Define the hathi_starting_page, using the documentation in the cell above. This is the page in the Hathi Trust Google PDF that corresponds to the first page of Hansard in the original digitized source.
# hathi_id and hathi_starting_page must match the `session` chosen in the first code cell.
hathi_id = 'uc1.32106019788238'
hathi_starting_page = 29
# 'Yes' makes process_file drop everything before the
# <page>{hathi_starting_page}</page> tag in the first file that contains it;
# any other value leaves the content as is.
delete_unwanted = 'No'  # Option to delete unwanted content before a certain page

# Construct the path to the subdirectory (reuses `session` from the first code cell)
subdirectory_path = os.path.join('output/sessions', session)

# Initialize the global page number
# NOTE(review): this value is never read or updated by the code visible in this cell.
global_page_number = 1
# Set to True by process_file once the unwanted leading content has been removed,
# so the deletion happens at most once per run.
unwanted_deleted = False

def process_file(file_path, hathi_id, hathi_starting_page, delete_unwanted):
    """Insert a Hathi Trust ``<url>`` line after every ``<page>N</page>`` line.

    The file is read and rewritten in place as UTF-8, the same encoding used
    when the ``<page>`` tags were reformatted. For each line that starts with
    ``<page>N</page>`` a line
    ``<url>https://babel.hathitrust.org/cgi/pt?id=<hathi_id>&seq=<N + hathi_starting_page - 1></url>``
    is added directly below it. URL lines already present are dropped first,
    so running the function again does not duplicate them.

    Args:
        file_path: Path of the text file to update in place.
        hathi_id: Hathi Trust volume identifier placed in the URL.
        hathi_starting_page: Offset added (minus one) to each page number to
            obtain the ``seq`` value of the URL.
        delete_unwanted: When equal to ``'Yes'``, and nothing has been trimmed
            yet (module-level flag ``unwanted_deleted``), everything before
            the first ``<page>{hathi_starting_page}</page>`` tag is removed
            and the flag is set. The flag must exist at module level before
            calling with ``'Yes'``.
    """
    global unwanted_deleted

    # Explicit UTF-8: the platform default encoding is not reliable for OCR
    # text containing non-ASCII characters.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # If delete_unwanted is set to 'Yes' and unwanted content has not been deleted yet
    if delete_unwanted == 'Yes' and not unwanted_deleted:
        start_index = content.find(f'<page>{hathi_starting_page}</page>')
        if start_index != -1:
            content = content[start_index:]
            unwanted_deleted = True

    # Report every <page> tag found anywhere in the content
    page_tags = re.findall(r'<page>(\d+)</page>', content)
    print(f"Found page tags: {page_tags}")

    # Compile the per-line patterns once instead of on every iteration.
    page_line = re.compile(r'<page>(\d+)</page>')
    existing_url_line = re.compile(r'<url>https://babel.hathitrust.org/cgi/pt\?id=.*&seq=\d+</url>')

    new_lines = []
    for line in content.split('\n'):
        page_match = page_line.match(line)
        if page_match:
            page_number = int(page_match.group(1))
            url_page_number = page_number + hathi_starting_page - 1
            print(f"Original page number: {page_number}, URL page number: {url_page_number}")
            new_lines.append(line)
            url = f'<url>https://babel.hathitrust.org/cgi/pt?id={hathi_id}&seq={url_page_number}</url>'
            new_lines.append(url)
            continue

        # Skip a URL line left by a previous run; a fresh one is generated
        # from the preceding <page> line.
        if existing_url_line.match(line):
            continue

        new_lines.append(line)

    # Join the new lines to form the updated content
    updated_content = '\n'.join(new_lines)

    # Write the updated content back to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(updated_content)

# Process every .txt file in the session folder. os.listdir() returns names in
# arbitrary order, so sort them: process_file() keeps state between calls (the
# unwanted_deleted flag) and prints progress, and both should be reproducible
# from run to run.
for filename in sorted(os.listdir(subdirectory_path)):
    if filename.endswith('.txt'):
        file_path = os.path.join(subdirectory_path, filename)
        process_file(file_path, hathi_id, hathi_starting_page, delete_unwanted)

Found page tags: ['276', '277', '278', '279', '280', '281', '282', '283', '284', '285', '286', '287', '288', '289', '290', '291', '292', '293', '294', '295', '296', '297', '298', '299', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '313', '314', '315', '316', '317', '318', '319', '320', '321', '322', '323', '324', '325']
Original page number: 276, URL page number: 304
Original page number: 277, URL page number: 305
Original page number: 278, URL page number: 306
Original page number: 279, URL page number: 307
Original page number: 280, URL page number: 308
Original page number: 281, URL page number: 309
Original page number: 282, URL page number: 310
Original page number: 283, URL page number: 311
Original page number: 284, URL page number: 312
Original page number: 285, URL page number: 313
Original page number: 286, URL page number: 314
Original page number: 287, URL page number: 315
Original page number: 288, URL page number: 316
Original