In [None]:
import os
import re
from datetime import datetime

# Root of the working tree; every other path below is built from it.
base_directory = '.'

# Append-only log that records file deletions and directory renames.
log_file_path = os.path.join(base_directory, 'uk_modification_log.txt')

# Source XML is read from the first path; split text files go under the second.
input_directory_path, output_directory_base = (
    os.path.join(base_directory, sub_path)
    for sub_path in ('xml/hofcoms', 'txt/hofcoms')
)

def log_change(file_path, change_description):
    """Append one timestamped entry to the modification log.

    Args:
        file_path: Path of the file or directory the change refers to.
        change_description: Short text describing what happened.

    The entry is written to the module-level ``log_file_path`` in the form
    ``<YYYY-MM-DD HH:MM:SS>: <description> in <path>``.
    """
    now = datetime.now()
    entry = f"{now.strftime('%Y-%m-%d %H:%M:%S')}: {change_description} in {file_path}\n"
    # Append mode so earlier entries are preserved across runs.
    with open(log_file_path, 'a', encoding='utf-8') as log_handle:
        log_handle.write(entry)

def delete_file(file_path):
    """Remove ``file_path`` if it exists and log the outcome either way.

    Args:
        file_path: Path of the file to delete.

    A missing file is not an error; it is only recorded in the log.
    """
    if not os.path.exists(file_path):
        log_change(file_path, 'File not found for deletion')
        return

    os.remove(file_path)
    log_change(file_path, 'Deleted file')

# Delete the 1901_index.xml file before processing others, so that the
# directory scan in split_xml_files() below never sees it. The outcome
# (deleted or not found) is recorded in the modification log.
delete_file(os.path.join(input_directory_path, '1901_index.xml'))
        
def _section_filename(index, date_str):
    """Build the output file name for one section from its <date> text."""
    # Underscores become spaces and any run of whitespace (including
    # newlines inside the tag) collapses to a single space.
    safe_date_str = ' '.join(date_str.replace('_', ' ').split())
    # The date text often ends with a full stop ("..., 1901."); drop it so
    # the explicit ".txt" below never yields "..txt", and so a date without
    # the full stop still gets a proper extension.
    safe_date_str = safe_date_str.rstrip('.').rstrip()
    # Path separators inside the date would point into another directory.
    safe_date_str = re.sub(r'[\\/]', '-', safe_date_str)
    return f"{index}-{safe_date_str}.txt"


def split_xml_files(input_directory_path, output_base=None):
    """Split every ``.xml`` file in a directory into one text file per section.

    Each file is cut wherever a ``</housecommons>`` is immediately followed by
    a ``<housecommons>``. Every resulting section that contains a
    ``<date format="...">...</date>`` element is written to
    ``<output_base>/<xml name>_split/<n>-<date text>.txt``, where ``n`` is the
    1-based position of the section in its source file. Sections without a
    date element are not written.

    Args:
        input_directory_path: Directory holding the source ``.xml`` files.
        output_base: Directory under which the ``*_split`` directories are
            created. Defaults to the module-level ``output_directory_base``.
    """
    if output_base is None:
        output_base = output_directory_base

    # Regex to find the date within a section and to split the file content
    date_regex = re.compile(r'<date format="[^"]+">([^<]+)</date>')
    split_regex = re.compile(r'</housecommons>\s*<housecommons>')

    for filename in os.listdir(input_directory_path):
        if not filename.endswith('.xml'):
            continue

        file_path = os.path.join(input_directory_path, filename)
        base_filename = os.path.splitext(filename)[0]
        new_directory_path = os.path.join(output_base, base_filename + "_split")
        os.makedirs(new_directory_path, exist_ok=True)

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        for index, section in enumerate(split_regex.split(content), start=1):
            # The split consumes the opening tag of every section but the
            # first, so put it back.
            if not section.startswith('<housecommons>'):
                section = '<housecommons>\n\n' + section

            match = date_regex.search(section)
            if not match:
                continue

            new_file_path = os.path.join(
                new_directory_path, _section_filename(index, match.group(1))
            )
            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                new_file.write(section)

def _split_dir_sort_key(directory_name):
    """Return the start date encoded in a ``*_split`` directory name.

    Names whose leading token is not a ``YYYYMMDD`` date sort last instead of
    raising, so one stray directory cannot abort the whole rename pass.
    """
    try:
        return datetime.strptime(directory_name.split('_')[0], '%Y%m%d')
    except ValueError:
        return datetime.max


def rename_directories(output_directory_base):
    """Rename ``<start>_<end>_split`` directories to numbered, readable names.

    Directories directly under ``output_directory_base`` whose name contains
    ``_split`` are ordered by their ``YYYYMMDD`` start date and renamed to
    ``<n>.<StartMonthDD>-<EndMonthDD>-<EndYear>``, where ``n`` is the 1-based
    position in that ordering. A directory whose name does not match the
    expected ``YYYYMMDD_YYYYMMDD_split`` form is left in place and the skip is
    logged; it still occupies its position in the numbering.

    Args:
        output_directory_base: Directory containing the ``*_split`` directories.
    """
    directories = [
        d for d in os.listdir(output_directory_base)
        if os.path.isdir(os.path.join(output_directory_base, d)) and '_split' in d
    ]
    # Sorting must not raise on malformed names; those are reported by the
    # per-directory handler below.
    directories.sort(key=_split_dir_sort_key)

    for index, directory in enumerate(directories, start=1):
        try:
            dates_str = directory.split('_split')[0]
            # Raises ValueError unless there are exactly two '_'-separated parts.
            start_date_str, end_date_str = dates_str.split('_')

            start_date = datetime.strptime(start_date_str, '%Y%m%d')
            end_date = datetime.strptime(end_date_str, '%Y%m%d')
        except ValueError:
            log_change(directory, 'Skipped renaming due to unexpected format')
            continue

        formatted_start_date = start_date.strftime('%B%d')
        formatted_end_date = end_date.strftime('%B%d')
        year = end_date.strftime('%Y')

        new_directory_name = f"{index}.{formatted_start_date}-{formatted_end_date}-{year}"

        old_directory_path = os.path.join(output_directory_base, directory)
        new_directory_path = os.path.join(output_directory_base, new_directory_name)
        os.rename(old_directory_path, new_directory_path)

        log_change(old_directory_path, f'Renamed directory to {new_directory_name}')

# Run the two steps in order: split first, because rename_directories() only
# acts on the "*_split" directories that split_xml_files() creates.
split_xml_files(input_directory_path)
rename_directories(output_directory_base)

In [None]:
def remove_specified_content(text):
    """Strip the XML declaration and selected elements from ``text``.

    The patterns are applied one after another, in the order listed, each with
    ``re.DOTALL`` so that an element spanning several lines is removed whole.
    Removed: the ``<?xml ...?>`` declaration; the ``frontmatter``,
    ``tablecontents``, ``titlepage``, ``houselords`` and ``col`` elements
    together with their content; self-closing ``<image src="..."/>`` tags; and
    the opening ``<hansard ...>`` tag.

    Args:
        text: Raw section text.

    Returns:
        The text with every match of every pattern deleted.
    """
    unwanted = (
        r'<\?xml.*?\?>',
        r'<frontmatter>.*?</frontmatter>',
        r'<tablecontents>.*?</tablecontents>',
        r'<titlepage>.*?</titlepage>',
        r'<houselords>.*?</houselords>',
        r'<col>.*?</col>',
        r'<image src="[^"]*"/>',
        r'<hansard.*?>',
    )
    stripped = text
    for raw_pattern in unwanted:
        stripped = re.sub(raw_pattern, '', stripped, flags=re.DOTALL)
    return stripped

def clean_text(text):
    """Reduce marked-up section text to plain text.

    Steps, in order: drop em-dash character references, turn the pound-sign
    reference into ``£``, put a line break where ``</title>`` and ``</date>``
    were, delete every remaining tag, collapse blank lines, and trim leading
    and trailing whitespace.

    Args:
        text: Section text, typically already passed through
            ``remove_specified_content``.

    Returns:
        The cleaned plain text.
    """
    # A character reference ends with ';'. Consume it when present so that
    # "&#x2014;" does not leave a stray ';' and "&#x00A3;" does not become
    # "£;". The ';' is optional to keep handling input that lacks it.
    text = re.sub(r'&#x2014;?', '', text)
    text = re.sub(r'&#x00A3;?', '£', text)
    # Titles and dates become their own lines once the tags are gone.
    text = re.sub(r'</title>', '\n', text)
    text = re.sub(r'</date>', '\n', text)
    text = re.sub(r'<[^>]+>', '', text)
    # Collapse blank (or whitespace-only) lines into a single line break.
    text = re.sub(r'\n\s*\n', '\n', text)
    return text.strip()

# Clean every split text file in place.
for root, dirs, files in os.walk(output_directory_base):
    # Skip files directly under output_directory_base; only files inside its
    # sub-directories are processed. (This previously compared against the
    # undefined name `directory_path`, which raised NameError.)
    if root == output_directory_base:
        continue
    for file in files:
        if not file.endswith('.txt'):
            continue
        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Apply both cleaning functions: element removal first, while the
        # tags it looks for are still present, then generic tag stripping.
        cleaned_content = remove_specified_content(content)
        cleaned_content = clean_text(cleaned_content)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

In [None]:
# Construct a citation URL from each file name and prepend it to the top of that text file.

# Full English month name (lower case) -> three-letter URL path segment.
_MONTH_ABBREVIATIONS = {
    "january": "jan", "february": "feb", "march": "mar", "april": "apr",
    "may": "may", "june": "jun", "july": "jul", "august": "aug",
    "september": "sep", "october": "oct", "november": "nov", "december": "dec",
}


def construct_url_from_filename(filename):
    """Build the ``<url>`` header line for a split text file.

    The file name is expected to look like
    ``<n>-<weekday>, <day><st|nd|rd|th> <Month>, <year>.txt``; the day, month
    and year are turned into
    ``https://api.parliament.uk/historic-hansard/sittings/<year>/<mon>/<day>``.

    Args:
        filename: Base name of the text file (not a full path).

    Returns:
        ``"<url>...</url>\\n"`` when the name matches and the month is a known
        English month name, otherwise ``""``.
    """
    base_url = "https://api.parliament.uk/historic-hansard/sittings/"
    match = re.search(r'(\d+)-.*?(\d{1,2})(?:st|nd|rd|th) ([A-Za-z]+), (\d{4})\.txt$', filename)
    if not match:
        return ""

    day, month, year = match.group(2), match.group(3).lower(), match.group(4)
    month_abbr = _MONTH_ABBREVIATIONS.get(month)
    if month_abbr is None:
        # An unrecognised (e.g. misspelt) month would otherwise produce a
        # malformed ".../<year>//<day>" URL; emit no citation instead.
        return ""

    url = f"{base_url}{year}/{month_abbr}/{day}"
    return f"<url>{url}</url>\n"

# Prepend the citation URL line to every split text file.
for root, dirs, files in os.walk(output_directory_base):
    # Only files inside sub-directories are processed.
    if root == output_directory_base:
        continue
    for file in files:
        if not file.endswith('.txt'):
            continue
        url_string = construct_url_from_filename(file)
        if not url_string:
            # The file name did not yield a URL, so there is nothing to add.
            continue
        file_path = os.path.join(root, file)
        with open(file_path, 'r+', encoding='utf-8') as f:
            content = f.read()
            # Re-running this cell must not stack duplicate URL lines.
            if content.startswith(url_string):
                continue
            f.seek(0)
            f.write(url_string + content)
            # Guard against leftover bytes if the rewritten text ever
            # encodes shorter than what was on disk.
            f.truncate()

In [1]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import os

# Text files are read from the first tree; PDFs are written into the second.
# NOTE(review): 'pdf/hofcom' lacks the trailing 's' used by 'txt/hofcoms' —
# confirm this is intentional.
input_root_directory, output_root_directory = 'txt/hofcoms', 'pdf/hofcom'

# Create the PDF root up front; it is not an error if it already exists.
os.makedirs(output_root_directory, exist_ok=True)

def _write_text_file_as_pdf(input_file_path, output_file_path):
    """Render one text file as a PDF; return False if the file is empty."""
    # Imported here so this helper is self-contained within the cell.
    from reportlab.lib.utils import simpleSplit

    with open(input_file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if not lines:
        # Nothing to render; avoid creating a canvas that is never saved.
        return False

    width, height = letter
    margin = 72  # 1 inch
    line_height = 14  # Adjust as needed
    usable_width = width - 2 * margin
    header_font, header_size = "Helvetica-Bold", 12
    body_font, body_size = "Helvetica", 10

    c = canvas.Canvas(output_file_path, pagesize=letter)
    current_height = height - margin

    # The first line (the URL header) is drawn in bold.
    c.setFont(header_font, header_size)
    header = lines[0].strip()
    for piece in simpleSplit(header, header_font, header_size, usable_width) or ['']:
        c.drawString(margin, current_height, piece)
        current_height -= line_height
    current_height -= line_height  # Extra space after the header

    # Body text
    c.setFont(body_font, body_size)
    for line in lines[1:]:
        # Wrap to the printable width; drawString itself never wraps, so a
        # long line would otherwise run off the right edge of the page.
        for piece in simpleSplit(line.strip(), body_font, body_size, usable_width) or ['']:
            if current_height <= margin + line_height:  # Need a new page
                c.showPage()
                # showPage() discards the graphics state, including the
                # font, so it has to be selected again on every new page.
                c.setFont(body_font, body_size)
                current_height = height - margin
            c.drawString(margin, current_height, piece)
            current_height -= line_height

    c.save()
    return True


def convert_text_to_pdf(input_root_dir, output_root_dir):
    """Convert every ``.txt`` file under a directory tree to a PDF.

    The directory layout below ``input_root_dir`` is mirrored below
    ``output_root_dir``, and each ``name.txt`` becomes ``name.pdf``. The first
    line of each file is drawn as a bold header and the remaining lines as
    body text, wrapped to the page width and continued on new pages as needed.

    A failure on one file is reported with ``print`` and does not stop the
    remaining conversions. Empty files are skipped.

    Args:
        input_root_dir: Root directory containing the text files.
        output_root_dir: Root directory that receives the PDFs.
    """
    for root, _dirs, files in os.walk(input_root_dir):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            try:
                input_file_path = os.path.join(root, filename)
                relative_path = os.path.relpath(root, input_root_dir)
                output_dir = os.path.join(output_root_dir, relative_path)
                os.makedirs(output_dir, exist_ok=True)
                # Swap only the extension; str.replace would also rewrite a
                # ".txt" occurring in the middle of the name.
                pdf_name = os.path.splitext(filename)[0] + '.pdf'
                output_file_path = os.path.join(output_dir, pdf_name)

                if _write_text_file_as_pdf(input_file_path, output_file_path):
                    print(f"Converted {input_file_path} to {output_file_path}")
                else:
                    print(f"Skipped empty file {input_file_path}")
            except Exception as e:
                # Best-effort batch job: report and carry on with the next file.
                print(f"Failed to convert {filename} due to error: {e}")

# Convert every .txt file under input_root_directory into a PDF under output_root_directory.
convert_text_to_pdf(input_root_directory, output_root_directory)