In [4]:
# Reformat <page> tags to match AU and UK.

import os
import re

# Name of the session folder to process
session = '2.July30-August28-1901'

# Base directory (relative path) that the session folder name is joined onto
subdirectory_path = 'output/sessions'

# Full path of the session folder: <subdirectory_path>/<session>
directory = os.path.join(subdirectory_path, session)

def replace_text_in_files(directory):
    """Rewrite legacy ``<page:N>`` markers as ``<page>N</page>`` under ``directory``.

    The tree is walked recursively with ``os.walk``. Every file found is read
    as UTF-8; it is written back (also as UTF-8) only when the substitution
    actually changed its text, so files without legacy markers are left
    untouched on disk and re-running the cell is a no-op.

    Args:
        directory: Root directory whose files are converted in place.

    Returns:
        None.

    Raises:
        UnicodeDecodeError: If a file under ``directory`` is not valid UTF-8.
    """
    legacy_tag = re.compile(r'<page:(\d+)>')

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Replace <page:123> with <page>123</page>
            new_content = legacy_tag.sub(r'<page>\1</page>', content)

            # Nothing to convert: do not rewrite the file.
            if new_content == content:
                continue

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(new_content)

# Convert the tags in place for every file under the session directory (sub-folders included)
replace_text_in_files(directory)

In [5]:
# Needed for this volume, to manage inconsistencies between the page numbers of the Google pdf and the Hathi PDF. This can be considered a bug. The best solution would be to re-transcribe (produce new .json source files).

def get_last_page_numbers(directory):
    """Return the number in the last ``<page>N</page>`` tag of each ``.txt`` file.

    Files are visited in order of the integer at the start of their name
    (so ``2-...`` comes before ``10-...``); names without a leading integer
    sort last. Files are read as UTF-8, matching how the tag-conversion cell
    writes them.

    Args:
        directory: Directory whose ``.txt`` files are scanned (not recursive).

    Returns:
        dict mapping filename -> int page number of that file's last
        ``<page>`` tag, in visiting order. Files that contain no ``<page>``
        tag are omitted.
    """
    def numeric_prefix(name):
        # Leading integer of the filename, or +inf so unnumbered names go last.
        prefix = re.match(r'\d+', name)
        return int(prefix.group()) if prefix else float('inf')

    last_page_numbers = {}

    for filename in sorted(os.listdir(directory), key=numeric_prefix):
        if not filename.endswith(".txt"):
            continue

        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Capture just the digits of every <page> tag, in document order
        page_numbers = re.findall(r'<page>(\d+)</page>', content)
        if page_numbers:
            last_page_numbers[filename] = int(page_numbers[-1])

    return last_page_numbers

def reset_page_tags(directory, last_page_numbers):
    """Renumber all ``<page>N</page>`` tags in ``directory`` as one run starting at 1.

    ``.txt`` files are visited in order of the integer at the start of their
    name (names without one sort last). Tags are renumbered consecutively
    across files. When a file's first original page number equals the
    previous tagged file's last original page number (the same page is split
    across two files), numbering resumes at the number already given to that
    page instead of advancing.

    Each tag is replaced at its own position in the text. Replacing by tag
    text (first occurrence) would hit an already-renumbered tag whenever the
    old numbers contain duplicates or overlap the new numbers.

    Args:
        directory: Directory whose ``.txt`` files are rewritten in place
            (not recursive). Files are read and written as UTF-8.
        last_page_numbers: Accepted for compatibility with existing callers;
            not used by this function.

    Returns:
        None.
    """
    def numeric_prefix(name):
        # Leading integer of the filename, or +inf so unnumbered names go last.
        prefix = re.match(r'\d+', name)
        return int(prefix.group()) if prefix else float('inf')

    page_tag = re.compile(r'<page>(\d+)</page>')
    page_number = 1
    # Original (pre-renumbering) number of the last tag in the previous tagged file
    previous_last_page_number = None

    for filename in sorted(os.listdir(directory), key=numeric_prefix):
        if not filename.endswith(".txt"):
            continue

        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        original_numbers = [int(number) for number in page_tag.findall(content)]

        if (
            original_numbers
            and previous_last_page_number is not None
            and original_numbers[0] == previous_last_page_number
        ):
            # Shared page: reuse the number assigned to the previous file's
            # last tag, which is always page_number - 1 at this point.
            page_number -= 1

        # Hand out the new numbers strictly in document order, one per match.
        new_numbers = iter(range(page_number, page_number + len(original_numbers)))
        content = page_tag.sub(lambda _match: f'<page>{next(new_numbers)}</page>', content)
        page_number += len(original_numbers)

        # Write the modified content back to the file
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(content)

        if original_numbers:
            previous_last_page_number = original_numbers[-1]

# First pass: record, for each .txt file, the number in its last <page> tag
last_page_numbers = get_last_page_numbers(directory)

# Second pass: renumber the <page> tags in place.
# NOTE(review): reset_page_tags accepts last_page_numbers but its body never reads it.
reset_page_tags(directory, last_page_numbers)

In [6]:
# Define the hathi_starting_page, using the documentation in the cell above. This is the page in the Hathi Trust Google PDF that corresponds to the first page of Hansard in the original digitized source.
# It is used as the `seq` value in the generated URL for <page>1</page>; page N maps to seq = hathi_starting_page + (N - 1).
hathi_id = 'uc1.32106019788246'
hathi_starting_page = 31  
delete_unwanted = 'No'  # Set to 'Yes' to drop everything before <page>{delete_before_page}</page> in a file; any other value disables this
delete_before_page = 31  # Page number whose <page> tag marks the start of the content to keep

# Construct the path to the session folder.
# NOTE: this rebinds subdirectory_path, which an earlier cell set to the base directory 'output/sessions'.
subdirectory_path = os.path.join('output/sessions', session)  # 'session' is already defined

# Initialize the current Hathi page number
# NOTE(review): nothing visible in this cell reads this value after it is assigned.
current_hathi_page = hathi_starting_page

def process_file(file_path, hathi_id, hathi_starting_page, delete_unwanted, delete_before_page):
    """Insert a HathiTrust ``<url>`` line after every ``<page>N</page>`` line of a file.

    The file is rewritten in place (read and written as UTF-8). Any existing
    HathiTrust ``<url>`` line is dropped and regenerated, so the function can
    be re-run safely. Only tags at the very start of a line are processed.
    Text that follows a tag on the same line is kept, whitespace-trimmed, on
    its own line after the URL, rather than being discarded.

    Args:
        file_path: Path of the text file to update.
        hathi_id: HathiTrust volume id placed in the URL's ``id`` parameter.
        hathi_starting_page: ``seq`` value that corresponds to ``<page>1</page>``;
            page N maps to ``hathi_starting_page + (N - 1)``.
        delete_unwanted: When equal to the string ``'Yes'``, everything before
            ``<page>{delete_before_page}</page>`` is removed from this file,
            provided that tag occurs in it. Any other value disables this.
        delete_before_page: Page number used by ``delete_unwanted``.

    Returns:
        None. Progress is reported with ``print``.
    """
    print(f"Processing file: {file_path}")

    # Read the content of the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Optionally drop everything that precedes the chosen page tag
    if delete_unwanted == 'Yes':
        start_index = content.find(f'<page>{delete_before_page}</page>')
        if start_index != -1:
            content = content[start_index:]
            print("Unwanted content deleted.")

    page_pattern = re.compile(r'<page>(\d+)</page>')
    url_pattern = re.compile(r'<url>https://babel.hathitrust.org/cgi/pt\?id=.*&seq=\d+</url>')

    new_lines = []
    for line in content.split('\n'):
        page_match = page_pattern.match(line)

        if page_match:
            # Keep the original page number
            page_number = int(page_match.group(1))
            new_lines.append(f'<page>{page_number}</page>')

            # Calculate the HathiTrust page number based on the delta
            hathi_page_number = hathi_starting_page + (page_number - 1)
            url = f'https://babel.hathitrust.org/cgi/pt?id={hathi_id}&seq={hathi_page_number}'
            new_lines.append(f'<url>{url}</url>')
            print(f"Updated line: <page>{page_number}</page> and <url>{url}</url>")

            # Preserve anything that shared the line with the tag, except a
            # stale HathiTrust URL, which has just been regenerated.
            trailing = line[page_match.end():].strip()
            if trailing and not url_pattern.match(trailing):
                new_lines.append(trailing)
        elif url_pattern.match(line):
            # Skip the existing URL line; it is regenerated next to its page tag
            continue
        else:
            new_lines.append(line)

    # Join the new lines to form the updated content
    updated_content = '\n'.join(new_lines)

    # Write the updated content to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(updated_content)

    print(f"Finished processing file: {file_path}")

# Annotate every .txt transcript in the session folder, in lexicographic filename order
for filename in sorted(os.listdir(subdirectory_path)):
    if not filename.endswith('.txt'):
        continue  # leave non-transcript files alone
    file_path = os.path.join(subdirectory_path, filename)
    process_file(
        file_path,
        hathi_id,
        hathi_starting_page,
        delete_unwanted,
        delete_before_page,
    )

Processing file: output/sessions/2.July30-August28-1901/1-Tuesday, 30th July, 1901.txt
Updated line: <page>1</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788246&seq=31</url>
Updated line: <page>2</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788246&seq=32</url>
Updated line: <page>3</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788246&seq=33</url>
Updated line: <page>4</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788246&seq=34</url>
Updated line: <page>5</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788246&seq=35</url>
Updated line: <page>6</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788246&seq=36</url>
Updated line: <page>7</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788246&seq=37</url>
Updated line: <page>8</page> and <url>https://babel.hathitrust.org/cgi/pt?id=uc1.32106019788246&seq=38</url>
Updated line: <page>9</page> and <url>htt