In [15]:
# Reformat <page> tags to match AU and UK.

import os
import re

# Define session variable
session = '1.July01-July26-1901'

# Define subdirectory path
subdirectory_path = 'output/sessions'

# Construct the directory path dynamically
directory = os.path.join(subdirectory_path, session)

def replace_text_in_files(directory):
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Replace <page:123> with <page>123</page>
            new_content = re.sub(r'<page:(\d+)>', r'<page>\1</page>', content)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(new_content)

# Call the function with the dynamically constructed directory
replace_text_in_files(directory)

In [16]:
# Define the hathi_starting_page, using the documentation in the cell above. This is the page in the Hathi Trust Google PDF that corresponds to the first page of Hansard in the original digitized source.
hathi_id = 'uc1.32106019788238'
hathi_starting_page = 29  
delete_unwanted = 'No'  # Option to delete unwanted content before a certain page
delete_before_page = 31  # Define the page before which all content will be deleted

# Construct the path to the subdirectory
subdirectory_path = os.path.join('output/sessions', session)  # 'session' is already defined

# Initialize the current Hathi page number
current_hathi_page = hathi_starting_page

def process_file(file_path, hathi_id, hathi_starting_page, delete_unwanted, delete_before_page):
    global current_hathi_page
    unwanted_deleted = False

    print(f"Processing file: {file_path}")

    # Read the content of the file
    with open(file_path, 'r') as file:
        content = file.read()
    
    # If delete_unwanted is set to 'Yes' and unwanted content has not been deleted yet
    if delete_unwanted == 'Yes':
        start_index = content.find(f'<page>{delete_before_page}</page>')
        if start_index != -1:
            content = content[start_index:]
            print(f"Unwanted content deleted.")
    
    # Split the content by lines
    lines = content.split('\n')
    new_lines = []
    
    # Iterate over each line and process <page> tags
    for line in lines:
        page_match = re.match(r'<page>(\d+)</page>', line)
        
        if page_match:
            # Keep the original page number
            page_number = int(page_match.group(1))
            new_lines.append(f'<page>{page_number}</page>')
            
            # Calculate the HathiTrust page number based on the delta
            hathi_page_number = hathi_starting_page + (page_number - 1)
            url = f'https://babel.hathitrust.org/cgi/pt?id={hathi_id}&seq={hathi_page_number}'
            new_lines.append(f'<url>{url}</url>')
            print(f"Updated line: <page>{page_number}</page> and <url>{url}</url>")
        else:
            url_match = re.match(r'<url>https://babel.hathitrust.org/cgi/pt\?id=.*&seq=\d+</url>', line)
            if url_match:
                # Skip the existing URL line
                continue
            else:
                new_lines.append(line)
    
    # Join the new lines to form the updated content
    updated_content = '\n'.join(new_lines)
    
    # Write the updated content to the file
    with open(file_path, 'w') as file:
        file.write(updated_content)

    print(f"Finished processing file: {file_path}")

# Iterate over each file in the subdirectory
for filename in sorted(os.listdir(subdirectory_path)):
    if filename.endswith('.txt'):
        file_path = os.path.join(subdirectory_path, filename)
        process_file(file_path, hathi_id, hathi_starting_page, delete_unwanted, delete_before_page)

Processing file: output/sessions/1.July01-July26-1901/1-Monday, 1st July, 1901.txt
Updated line: <page>1</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788238&seq=29</url>
Updated line: <page>2</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788238&seq=30</url>
Updated line: <page>3</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788238&seq=31</url>
Updated line: <page>4</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788238&seq=32</url>
Updated line: <page>5</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788238&seq=33</url>
Updated line: <page>6</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788238&seq=34</url>
Updated line: <page>7</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788238&seq=35</url>
Finished processing file: output/sessions/1.July01-July26-1901/1-Monday, 1st July, 1901.txt
Processing file: output/sessions/1.July01-July26-1901/10-Tuesd