In [7]:
import os
from datetime import datetime
import xml.etree.ElementTree as ET

# Base directories (relative paths, resolved against the current working directory).
# process_files() reads from <source_base_dir>/<subdir> and writes to <target_base_dir>/<subdir>.
source_base_dir = "xml"
target_base_dir = "txt"

# Modified convert_filename function to include count
def convert_filename(filename, count):
    """Return the .txt name for a source file: '<count>-<Weekday, DD Month, YYYY>.txt'.

    The text before the first underscore in ``filename`` is parsed as a
    ``YYYYMMDD`` date; ``datetime.strptime`` raises ``ValueError`` when it
    does not have that form. ``count`` is placed verbatim in front of the date.
    """
    leading_token = filename.partition('_')[0]  # everything before the first '_'
    sitting_date = datetime.strptime(leading_token, '%Y%m%d')
    # Weekday name, zero-padded day, full month name, four-digit year
    readable_date = sitting_date.strftime('%A, %d %B, %Y')
    return f"{count}-{readable_date}.txt"

# Modified process_files function to track file count per year
def process_files(subdir):
    """Re-serialise every .xml file in ``<source_base_dir>/<subdir>`` as a renamed .txt file.

    Files are visited in sorted name order. Each one gets a running number that
    restarts for every distinct four-character year prefix of its name; that
    number and the parsed date form the new name (see ``convert_filename``).
    The parsed XML tree is written back out as text under
    ``<target_base_dir>/<subdir>``, which is created if missing.

    Raises whatever ``os.listdir``, ``ET.parse`` or ``convert_filename`` raise
    (missing directory, malformed XML, name not starting with YYYYMMDD).
    """
    source_dir = os.path.join(source_base_dir, subdir)
    target_dir = os.path.join(target_base_dir, subdir)
    os.makedirs(target_dir, exist_ok=True)  # Ensure target directory exists

    yearly_count = {}  # year prefix -> number of files seen so far for that year

    # Sorted so that names beginning with YYYYMMDD are numbered in date order
    for filename in sorted(os.listdir(source_dir)):
        if not filename.endswith('.xml'):
            continue

        year = filename.split('_')[0][:4]  # first four characters of the date part
        yearly_count[year] = yearly_count.get(year, 0) + 1

        new_filename = convert_filename(filename, yearly_count[year])
        source_file_path = os.path.join(source_dir, filename)
        target_file_path = os.path.join(target_dir, new_filename)

        # Read XML and save its serialised form to TXT
        root = ET.parse(source_file_path).getroot()
        # Write explicitly as UTF-8: the cleaning functions in this file reopen
        # these outputs with encoding='utf-8', and the platform default encoding
        # (used when none is given) is not UTF-8 everywhere.
        with open(target_file_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(ET.tostring(root, encoding='unicode'))

# Convert the 'hofreps' and 'senate' subdirectories, in that order
for subdir_name in ('hofreps', 'senate'):
    process_files(subdir_name)

In [8]:
import re
import os  # Ensure os is imported for directory walking

# Directories to process: every .txt file found under these (recursively, via os.walk)
# is cleaned in place by the loop at the end of this cell.
directories = ['txt/hofreps', 'txt/senate']

def clean_and_deduplicate_pages(file_path):
    """Rename <page.no> tags to <page> and drop repeated page markers, in place.

    When the same page marker occurs more than once, only its last occurrence
    is kept. Markers are matched with a non-DOTALL pattern, so a marker that
    spans several lines is not recognised. The file is read and rewritten as
    UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    text = text.replace('<page.no>', '<page>').replace('</page.no>', '</page>')

    # Collected once, before any removal: (whole tag, inner text) pairs
    markers = re.findall(r'(<page>(.*?)</page>)', text)

    kept_numbers = set()
    # Walk backwards so the last occurrence of each marker is the one recorded
    for whole_tag, inner_text in reversed(markers):
        if inner_text not in kept_numbers:
            kept_numbers.add(inner_text)
            continue
        # A later copy exists: delete the earliest remaining copy of this tag
        text = text.replace(whole_tag, '', 1)

    with open(file_path, 'w', encoding='utf-8') as handle:
        handle.write(text)

def remove_tags(file_path):
    """Drop metadata lines and blank lines from a file, in place.

    Steps, applied to the UTF-8 text of ``file_path``:
      1. delete the ``<hansard ...>`` root opening tag (exact literal below);
      2. discard every line that contains one of the listed metadata elements
         (proof, day.start, type, time.stamp, name.id, electorate, party, role,
         in.gov, first.speech, name role="metadata", parliament.no, session.no,
         period.no, chamber);
      3. remove blank / whitespace-only lines from what is left.

    The result is written without a trailing newline (lines are re-joined
    with '\n').
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Plain literal replacement: the tag contains dots that a regex would treat as wildcards
    content = content.replace('<hansard xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="../../hansard.xsd" version="2.1">', '')

    # Compiled once instead of being re-looked-up for every line
    unwanted_line = re.compile(r'(<proof>.*?</proof>|<day\.start>.*?</day\.start>|<type>.*?</type>|<time\.stamp\s*/>|<name\.id>[^<]*</name\.id>|<electorate>[^<]*</electorate>|<party>.*?</party>|<role\s*/>|<in\.gov>[^<]*</in\.gov>|<first\.speech>[^<]*</first\.speech>|<name\s+role="metadata">.*?</name>|<parliament\.no>.*?</parliament\.no>|<session\.no>.*?</session\.no>|<period\.no>.*?</period\.no>|<chamber>.*?</chamber>)')
    kept_lines = [line for line in content.splitlines() if not unwanted_line.search(line)]
    cleaned = '\n'.join(kept_lines)

    # Remove empty lines. This must run on the filtered text that is actually
    # written; running it on the unfiltered `content` has no effect on the output.
    cleaned = re.sub(r'^\s*$\n', '', cleaned, flags=re.MULTILINE)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned)

def remove_inline_tags(file_path):
    """Unwrap <inline ...> and <name role="display"> elements in a file, in place.

    The tags themselves are deleted while the text they enclose is kept.
    Afterwards blank / whitespace-only lines are removed and the file is
    overwritten (UTF-8).
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        text = source.read()

    # Opening <inline ...> tags (with any attributes) first, then the literal closing tags
    text = re.sub(r'<inline[^>]*>', '', text).replace('</inline>', '')

    # <name role="display">X</name>  ->  X
    display_name = re.compile(r'<name role="display">(.*?)</name>')
    text = display_name.sub(r'\1', text)

    # Drop lines that are empty or contain only whitespace
    blank_line = re.compile(r'^\s*$\n', flags=re.MULTILINE)
    text = blank_line.sub('', text)

    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.write(text)

def remove_all_remaining_tags(file_path):
    """Strip every tag except bare <page> / </page> from a file, in place.

    Anything matching ``<...>`` is deleted unless it is exactly ``<page>`` or
    ``</page>``; text between tags is left untouched. Blank / whitespace-only
    lines are then removed and the file is overwritten (UTF-8).
    """
    page_markers = ("<page>", "</page>")

    def strip_unless_page(tag_match):
        # Return the tag unchanged when it is a page marker, otherwise erase it
        tag = tag_match.group(0)
        return tag if tag in page_markers else ""

    with open(file_path, 'r', encoding='utf-8') as source:
        text = source.read()

    text = re.sub(r'<[^>]+>', strip_unless_page, text)
    text = re.sub(r'^\s*$\n', '', text, flags=re.MULTILINE)  # drop blank lines

    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.write(text)

# Apply the clean-up passes, in this order, to every .txt file under each directory
cleanup_passes = (
    clean_and_deduplicate_pages,  # Clean and deduplicate pages
    remove_tags,                  # Remove unwanted tags
    remove_inline_tags,           # Remove <inline> and <name role="display"> tags
    remove_all_remaining_tags,    # Remove all remaining tags except <page>
)
for directory in directories:
    for root, dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            for cleanup in cleanup_passes:
                cleanup(file_path)

In [None]:
# Construct a URL and prepend it to each .txt file, for citation purposes

def construct_url_from_filename(filename):
    """Build a ``<url>...</url>`` header line from a dated .txt filename.

    The name must end in ``<count>-...<day><st|nd|rd|th> <Month>, <year>.txt``
    (for example ``12-Thursday 9th May, 1901.txt``). The month is matched
    case-insensitively against the twelve English month names.

    Returns:
        ``"<url>{base}/{year}/{mon}/{day}</url>\\n"`` on success, or ``""`` when
        the filename does not match the pattern or the month is not recognised.
    """
    base_url = "https://api.parliament.uk/historic-hansard/sittings/"
    match = re.search(r'(\d+)-.*?(\d{1,2})(?:st|nd|rd|th) ([A-Za-z]+), (\d{4})\.txt$', filename)
    if not match:
        return ""

    day, month, year = match.group(2), match.group(3).lower(), match.group(4)
    month_abbreviations = {
        "january": "jan", "february": "feb", "march": "mar", "april": "apr",
        "may": "may", "june": "jun", "july": "jul", "august": "aug",
        "september": "sep", "october": "oct", "november": "nov", "december": "dec"
    }
    month_abbr = month_abbreviations.get(month)
    if month_abbr is None:
        # An unknown month would otherwise yield a URL with an empty path
        # segment (".../<year>//<day>"); treat it like a non-matching name.
        return ""

    url = f"{base_url}{year}/{month_abbr}/{day}"
    return f"<url>{url}</url>\n"

# Prepend the citation URL header to every .txt file in the subdirectories of
# output_directory_base (files directly inside the base directory are skipped).
# NOTE(review): `output_directory_base` is not defined in any cell visible here;
# confirm it is set earlier in the notebook, otherwise this cell raises NameError
# on a fresh kernel.
for root, dirs, files in os.walk(output_directory_base):
    if root == output_directory_base:
        continue
    for file in files:
        if not file.endswith('.txt'):
            continue
        url_string = construct_url_from_filename(file)
        if not url_string:
            continue  # no URL could be built from this name; leave the file untouched
        file_path = os.path.join(root, file)
        with open(file_path, 'r+', encoding='utf-8') as f:
            content = f.read()
            if content.startswith(url_string):
                continue  # already stamped; re-running the cell must not stack headers
            # The new text is longer than the old, so overwriting from offset 0
            # replaces the whole file without needing a truncate.
            f.seek(0, 0)
            f.write(url_string + content)

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import os

# Define the root input and output directories.
# They are passed to convert_text_to_pdf(), which walks the input root and
# recreates its subdirectory layout under the output root.
# NOTE(review): the input folder is spelled 'hofcoms' and the output folder 'hofcom' — confirm the difference is intended.
input_root_directory = 'txt/hofcoms'
output_root_directory = 'pdf/hofcom'

# Ensure the output root directory exists (exist_ok=True keeps this cell safe to re-run)
os.makedirs(output_root_directory, exist_ok=True)

def convert_text_to_pdf(input_root_dir, output_root_dir):
    """Render every .txt file under ``input_root_dir`` as a PDF under ``output_root_dir``.

    The directory layout below ``input_root_dir`` is mirrored in
    ``output_root_dir``. In each file the first line is drawn as a bold
    12-point header and the remaining lines as 10-point body text on
    letter-size pages with one-inch margins. Lines wider than the printable
    area are wrapped at word boundaries.

    Failures are handled per file: the exception is printed and the walk
    continues with the next file. Empty input files are skipped with a message.

    Args:
        input_root_dir: directory tree to search for .txt files.
        output_root_dir: directory tree that receives the .pdf files.
    """
    # Local import so the function is self-contained; reportlab is already a
    # dependency of this cell (canvas, letter).
    from reportlab.lib.utils import simpleSplit

    header_font, header_size = "Helvetica-Bold", 12
    body_font, body_size = "Helvetica", 10
    width, height = letter
    margin = 72  # 1 inch
    line_height = 14  # Adjust as needed
    usable_width = width - 2 * margin

    def wrap(text, font_name, font_size):
        # simpleSplit returns [] for an empty string; keep one empty piece so a
        # blank source line still advances the cursor by one line.
        return simpleSplit(text, font_name, font_size, usable_width) or ['']

    for root, dirs, files in os.walk(input_root_dir):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            try:
                input_file_path = os.path.join(root, filename)
                relative_path = os.path.relpath(root, input_root_dir)
                output_dir = os.path.join(output_root_dir, relative_path)
                os.makedirs(output_dir, exist_ok=True)
                # splitext swaps only the trailing extension; str.replace would
                # also rewrite a '.txt' occurring elsewhere in the name.
                pdf_name = os.path.splitext(filename)[0] + '.pdf'
                output_file_path = os.path.join(output_dir, pdf_name)

                with open(input_file_path, 'r', encoding='utf-8') as file:
                    lines = file.readlines()

                if not lines:
                    # Nothing to use as a header; avoid an IndexError on lines[0]
                    print(f"Skipped {input_file_path}: file is empty")
                    continue

                # Initialize the PDF
                c = canvas.Canvas(output_file_path, pagesize=letter)
                current_height = height - margin

                # Process the header line (URL)
                c.setFont(header_font, header_size)
                for piece in wrap(lines[0].strip(), header_font, header_size):
                    c.drawString(margin, current_height, piece)
                    current_height -= line_height
                current_height -= line_height  # Extra space after the header

                # Reset font for the body text
                c.setFont(body_font, body_size)

                # Process the remainder of the document
                for line in lines[1:]:
                    for piece in wrap(line.strip(), body_font, body_size):
                        if current_height <= margin + line_height:  # Check if we need a new page
                            c.showPage()
                            # showPage() discards the graphics state, including
                            # the font, so it has to be selected again.
                            c.setFont(body_font, body_size)
                            current_height = height - margin
                        c.drawString(margin, current_height, piece)
                        current_height -= line_height

                c.save()
                print(f"Converted {input_file_path} to {output_file_path}")
            except Exception as e:
                # Deliberate best-effort: report the failure and move on to the next file
                print(f"Failed to convert {filename} due to error: {e}")

# Convert every .txt file under the input root into a PDF under the output root
convert_text_to_pdf(input_root_directory, output_root_directory)