In [None]:
# Reformat <page> tags to match AU and UK.
import os
import re

def replace_text_in_files(directory):
    """Rewrite ``<page:N>`` markers as ``<page>N</page>`` in files under *directory*.

    The directory tree is walked recursively and every file is read as UTF-8.
    A file is written back only when at least one marker was replaced, so
    files without markers are left untouched. Re-running is harmless because
    converted markers no longer match the pattern.

    Files that cannot be decoded as UTF-8 are reported and skipped, so that
    one stray non-text file does not abort the walk partway through.

    Args:
        directory: Root directory whose files are processed in place.
    """
    page_tag = re.compile(r'<page:(\d+)>')
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Not UTF-8 text; rewriting it could only corrupt it.
                print(f"Skipped {file_path}: not valid UTF-8 text")
                continue
            # Replace <page:123> with <page>123</page>
            new_content, replacements = page_tag.subn(r'<page>\1</page>', content)
            if replacements:
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(new_content)

# Apply the page-tag rewrite to every file found under txt/hofreps
replace_text_in_files(directory='txt/hofreps')

In [4]:
import os
import re

# Define the folder and starting_page. These need to be done one at a time.
session = '1.July01-July26-1901'
hathi_id='uc1.32106019788238'
starting_page = 29


def prepend_hathi_urls(session, hathi_id, starting_page, root_dir='txt/hofreps'):
    """Prepend a HathiTrust ``<url>`` line to each ``.txt`` file of one session.

    For every ``.txt`` file directly inside ``root_dir/session``, the first
    run of digits in the file name is taken as the sequence number, and the
    URL's ``seq`` value is ``starting_page + (sequence number - 1)``. Files
    whose name contains no digits are ignored.

    Files that already begin with ``<url>`` are left alone, so re-running the
    cell does not stack a second URL line on top of the first. Files are read
    and written as UTF-8, matching the other cells that touch these files.

    Args:
        session: Name of the subdirectory under *root_dir* to process.
        hathi_id: Value inserted as the ``id`` query parameter of the URL.
        starting_page: ``seq`` value that corresponds to sequence number 1.
        root_dir: Directory that contains the session subdirectories.

    Returns:
        A ``(added, skipped)`` tuple: the number of files that received a URL
        line and the number skipped because they already had one.
    """
    # Construct the path to the subdirectory
    subdirectory_path = os.path.join(root_dir, session)
    added = 0
    skipped = 0

    for filename in os.listdir(subdirectory_path):
        if not filename.endswith('.txt'):
            continue

        # Extract the integer from the filename
        match = re.search(r'\d+', filename)
        if not match:
            continue
        seq_number = int(match.group())

        # Construct the URL using the extracted integer and starting_page
        url = f'<url>https://babel.hathitrust.org/cgi/pt?id={hathi_id}&seq={starting_page + (seq_number - 1)}</url>'

        file_path = os.path.join(subdirectory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # A leading <url> tag means an earlier run already handled this file.
        if content.startswith('<url>'):
            skipped += 1
            continue

        # Prepend the URL to the content and write it back
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(f'{url}\n{content}')
        added += 1

    return added, skipped


added_count, skipped_count = prepend_hathi_urls(session, hathi_id, starting_page)
print(f"{session}: added URL line to {added_count} file(s), skipped {skipped_count} already tagged")

In [None]:
# Generate PDF files from the txt files

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import os
from reportlab.pdfbase.pdfmetrics import stringWidth


# Source tree of .txt files and the mirrored destination tree for the PDFs
input_root_dir, output_root_dir = 'txt/hofreps', 'pdf/hofreps'

def wrap_text(text, available_width, font_name, font_size):
    """Greedily wrap *text* into lines no wider than *available_width*.

    The text is split on whitespace and words are packed onto a line for as
    long as ``stringWidth`` reports that the line still fits. A single word
    that is wider than *available_width* is placed on a line of its own and
    is not broken, so that line may exceed the width.

    Args:
        text: Text to wrap; runs of whitespace are collapsed to single spaces.
        available_width: Maximum line width, in the units ``stringWidth`` returns.
        font_name: Font name passed to ``stringWidth``.
        font_size: Font size passed to ``stringWidth``.

    Returns:
        A list of non-empty line strings; empty when *text* has no words.
    """
    lines = []
    current_line = ""
    for word in text.split():
        test_line = f"{current_line} {word}".strip()
        if stringWidth(test_line, font_name, font_size) <= available_width:
            current_line = test_line
            continue
        # Flush only a non-empty line: when the very first word on a line is
        # already too wide there is nothing to flush, and appending "" would
        # add a blank line to the output.
        if current_line:
            lines.append(current_line)
        current_line = word
    if current_line:
        lines.append(current_line)
    return lines

def convert_text_to_pdf(input_root_dir, output_root_dir):
    """Render every ``.txt`` file under *input_root_dir* as a PDF under *output_root_dir*.

    The input tree is walked recursively and its subdirectory layout is
    mirrored beneath *output_root_dir*. Each PDF uses letter-size pages,
    72-point margins and 10-point Helvetica. The first line of the file is
    drawn with double line spacing and the remaining lines with single
    spacing. Lines are wrapped with ``wrap_text``, and a new page is started
    when the next line would fall below the bottom margin. Input lines that
    are blank after stripping produce no output line.

    Conversion is best-effort per file: an error is printed and the walk
    continues with the next file. Empty input files are skipped with a message.

    Args:
        input_root_dir: Root directory searched for ``.txt`` files.
        output_root_dir: Root directory that receives the ``.pdf`` files.
    """
    font_name = "Helvetica"
    font_size = 10
    page_width, page_height = letter
    margin = 72
    line_height = 14
    available_width = page_width - 2 * margin
    top_of_page = page_height - margin

    for root, _, files in os.walk(input_root_dir):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            try:
                input_file_path = os.path.join(root, filename)
                relative_path = os.path.relpath(root, input_root_dir)
                output_dir = os.path.join(output_root_dir, relative_path)
                os.makedirs(output_dir, exist_ok=True)
                # Swap only the extension; str.replace would also rewrite any
                # earlier '.txt' that appears inside the file name.
                pdf_name = os.path.splitext(filename)[0] + '.pdf'
                output_file_path = os.path.join(output_dir, pdf_name)

                with open(input_file_path, 'r', encoding='utf-8') as file:
                    lines = file.readlines()

                if not lines:
                    # There is no first line to draw, so say so explicitly
                    # instead of failing on an index error.
                    print(f"Skipped {input_file_path}: file is empty")
                    continue

                c = canvas.Canvas(output_file_path, pagesize=letter)
                c.setFont(font_name, font_size)
                current_height = top_of_page

                for index, line in enumerate(lines):
                    # The first line gets double spacing to set it apart.
                    step = line_height * 2 if index == 0 else line_height
                    for wrapped_line in wrap_text(line.strip(), available_width, font_name, font_size):
                        if current_height - step < margin:
                            c.showPage()
                            current_height = top_of_page
                            c.setFont(font_name, font_size)  # Reset font after creating a new page
                        c.drawString(margin, current_height, wrapped_line)
                        current_height -= step

                c.save()
                print(f"Converted {input_file_path} to {output_file_path}")
            except Exception as e:
                print(f"Failed to convert {filename} due to error: {e}")


convert_text_to_pdf(input_root_dir=input_root_dir, output_root_dir=output_root_dir)