# In [None]:
# This script corrects the page numbers in the ocr_output .txt files so that they reflect the page numbers of the original Hansard source rather than those of the Google PDF (i.e. the page number printed at the top of the Hansard page images, not the one shown by your PDF viewer).
# Configure the variables/targets below and run the script only as and when required.

import os
import re

# Folder holding the ocr_output .txt files whose page tags are rewritten.
directory = "output/sessions/3.August29-September25-1901"

# Number inside the existing <page:N> tag at which the content to keep begins.
start_page = 17

# Only the exact string 'Yes' makes the script drop everything that precedes
# the start_page tag; any other value leaves the leading content in place.
delete_unwanted = "Yes"

def correct_page_numbers(start_page, delete_unwanted, target_directory=None):
    """Renumber the ``<page:N>`` tags in a directory of ``.txt`` files, in place.

    The ``.txt`` files are processed in ascending order of the first run of
    digits in their name that is immediately followed by a hyphen. A single
    counter, starting at 1, runs across all files: every ``<page:N>`` tag
    occurrence advances it, and each tag is rewritten to the counter value
    recorded for its original number (if the same original number occurs more
    than once in a file, all of its occurrences receive the last value
    recorded for it). Each file is rewritten and a summary line is printed.

    Args:
        start_page: Number of the existing ``<page:N>`` tag at which the
            content to keep begins.
        delete_unwanted: When equal to the string ``'Yes'``, everything that
            precedes the first ``<page:{start_page}>`` tag is removed from
            the first file (in processing order) that contains that tag.
        target_directory: Directory to process. Defaults to the module-level
            ``directory`` setting when omitted.

    Raises:
        ValueError: If a ``.txt`` filename contains no ``<digits>-`` sequence
            to order it by. This is raised before any file is modified.

    NOTE(review): files that sort before the one containing the start tag are
    not trimmed and still consume page numbers -- confirm this is intended.
    """
    if target_directory is None:
        # Keep existing two-argument calls working against the global setting.
        target_directory = directory

    page_tag_pattern = re.compile(r'<page:(\d+)>')
    order_pattern = re.compile(r'(\d+)-')

    def file_order(filename):
        """Return the integer used to order *filename*."""
        match = order_pattern.search(filename)
        if match is None:
            raise ValueError(
                f"Cannot order file {filename!r}: expected a number followed "
                f"by '-' in its name"
            )
        return int(match.group(1))

    # Sorting up front means a badly named file aborts the run before any
    # file has been rewritten.
    files = sorted(
        (f for f in os.listdir(target_directory) if f.endswith('.txt')),
        key=file_order,
    )

    # Running page counter shared by all files.
    global_page_number = 1
    # The leading content is stripped at most once, in the first file that
    # contains the start tag.
    unwanted_deleted = False

    for file in files:
        with open(os.path.join(target_directory, file), 'r+') as f:
            content = f.read()

            if delete_unwanted == 'Yes' and not unwanted_deleted:
                start_index = content.find(f'<page:{start_page}>')
                if start_index != -1:
                    content = content[start_index:]
                    unwanted_deleted = True

            # Map each original page number to its new number.
            page_map = {}
            for tag in page_tag_pattern.findall(content):
                page_map[int(tag)] = global_page_number
                global_page_number += 1

            # Substitute every tag in a single pass. Sequential str.replace
            # calls would let an already-renumbered tag be matched again by a
            # later original number (e.g. 1 -> 3 followed by 3 -> 5).
            content = page_tag_pattern.sub(
                lambda m: f'<page:{page_map[int(m.group(1))]}>',
                content,
            )

            # Overwrite the file and drop any leftover tail of the old text.
            f.seek(0)
            f.write(content)
            f.truncate()
        print(f"Successfully updated file {file}. Changed pageNumbers: {page_map}")

# Run the correction using the settings configured near the top of this script.
correct_page_numbers(start_page=start_page, delete_unwanted=delete_unwanted)