In [13]:
import os
import re
from datetime import datetime

# Directory holding the source .xml files; the '<name>_split' output directories are created inside it
directory_path = 'xml/hofcoms'

# File that log_change() appends to (a relative path, so it resolves against the current working directory)
log_file_path = 'modification_log.txt'

# Function to log changes
def log_change(file_path, change_description):
    """Append one timestamped entry to the modification log.

    The entry has the form
    ``"<YYYY-MM-DD HH:MM:SS>: <change_description> in <file_path>"`` and is
    appended to the file named by the module-level ``log_file_path``.

    Args:
        file_path: Path of the file or directory the change refers to.
        change_description: Short text describing what was done.
    """
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    entry = f"{stamp}: {change_description} in {file_path}\n"
    with open(log_file_path, mode='a', encoding='utf-8') as log_handle:
        log_handle.write(entry)

# Function to delete a specific file and log the deletion
def delete_file(file_path):
    """Remove ``file_path`` if it exists and log what happened.

    Logs ``'Deleted file'`` after a successful removal, or
    ``'File not found for deletion'`` when the path does not exist.

    Args:
        file_path: Path of the file to remove.
    """
    # Guard clause: nothing to remove, just record that fact.
    if not os.path.exists(file_path):
        log_change(file_path, 'File not found for deletion')
        return

    os.remove(file_path)
    log_change(file_path, 'Deleted file')

# Delete the 1901_index.xml file before processing others, so it is not split like the other .xml files.
# NOTE(review): presumably it is an index rather than a dated debates file — confirm before relying on this.
delete_file(os.path.join(directory_path, '1901_index.xml'))
        
def _filename_stem_from_date(date_str):
    """Turn the text of a ``<date>`` element into a safe filename stem."""
    # Underscores become spaces, and any run of whitespace (including newlines
    # from wrapped XML) collapses to a single space.
    stem = ' '.join(date_str.replace('_', ' ').split())
    # The extension is added explicitly by the caller, so drop a trailing
    # period instead of relying on it to act as the dot of ".txt".
    stem = stem.rstrip('.').rstrip()
    # Path separators would otherwise redirect the output into another directory.
    return stem.replace('/', '-').replace('\\', '-')


def split_xml_files(directory_path):
    """Split each ``.xml`` file in ``directory_path`` into per-section text files.

    For every ``<name>.xml`` file found directly in ``directory_path`` a
    directory ``<name>_split`` is created next to it. The file content is cut
    wherever ``</housecommons>`` is followed (optionally after whitespace) by
    ``<housecommons>``. Each section is written to
    ``<name>_split/<index>-<date>.txt``, where ``<index>`` is the 1-based
    position of the section and ``<date>`` is the text of the first
    ``<date format="...">`` element in the section, with any trailing period
    removed.

    Sections that contain no ``<date>`` element are not written.

    Args:
        directory_path: Directory holding the source ``.xml`` files.
    """
    # Regex to find the date within a section
    date_regex = re.compile(r'<date format="[^"]+">([^<]+)</date>')
    # Regex to split the file content at </housecommons><housecommons>
    split_regex = re.compile(r'</housecommons>\s*<housecommons>')

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        # Only regular .xml files are split; a directory that happens to end
        # in ".xml" would make open() fail.
        if not filename.endswith('.xml') or not os.path.isfile(file_path):
            continue

        # Create a new directory for the split files
        base_filename = os.path.splitext(filename)[0]
        new_directory_path = os.path.join(directory_path, base_filename + "_split")
        os.makedirs(new_directory_path, exist_ok=True)

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        for index, section in enumerate(split_regex.split(content), start=1):
            # The split consumes the opening tag, so put it back on every
            # section that does not already begin with it.
            if not section.startswith('<housecommons>'):
                section = '<housecommons>\n\n' + section

            # The date provides the filename; undated sections are skipped.
            match = date_regex.search(section)
            if not match:
                continue

            # Always end in ".txt" so the later cells, which select files
            # with endswith('.txt'), pick the file up.
            new_filename = f"{index}-{_filename_stem_from_date(match.group(1))}.txt"
            new_file_path = os.path.join(new_directory_path, new_filename)

            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                new_file.write(section)

def rename_directories(directory_path):
    """Rename ``<start>_<end>_split`` directories to ``<n>.<Start>-<End>-<year>``.

    Every sub-directory of ``directory_path`` whose name contains ``_split``
    is considered. Names are expected to look like
    ``YYYYMMDD_YYYYMMDD_split``; such a directory is renamed to
    ``"<index>.<MonthDD>-<MonthDD>-<YYYY>"`` (year taken from the end date)
    and the rename is logged through ``log_change``.

    Directories are ordered by their start date. The index is the position in
    that ordered list, so a directory that ends up skipped still uses up its
    number. Directories whose name does not match the expected format are
    logged as skipped and left untouched.

    Args:
        directory_path: Directory that contains the ``*_split`` directories.
    """
    def _sort_key(name):
        # Parsing must not raise here: one unexpected name would abort the
        # whole run before the loop below can log and skip it. Names without
        # a leading YYYYMMDD sort after all dated ones.
        try:
            return (0, datetime.strptime(name.split('_')[0], '%Y%m%d'))
        except ValueError:
            return (1, datetime.max)

    directories = [
        d for d in os.listdir(directory_path)
        if os.path.isdir(os.path.join(directory_path, d)) and '_split' in d
    ]
    directories.sort(key=_sort_key)  # Sort directories by date for consistent ordering

    for index, directory in enumerate(directories, start=1):
        try:
            # "<start>_<end>_split" -> "<start>_<end>" -> two date strings.
            # A name with more or fewer than two parts raises ValueError on unpacking.
            dates_str = directory.split('_split')[0]
            start_date_str, end_date_str = dates_str.split('_')

            start_date = datetime.strptime(start_date_str, '%Y%m%d')
            end_date = datetime.strptime(end_date_str, '%Y%m%d')

            # Format start and end dates, e.g. "February14"
            formatted_start_date = start_date.strftime('%B%d')
            formatted_end_date = end_date.strftime('%B%d')
            year = end_date.strftime('%Y')

            # Construct new directory name
            new_directory_name = f"{index}.{formatted_start_date}-{formatted_end_date}-{year}"

            # Rename directory
            old_directory_path = os.path.join(directory_path, directory)
            new_directory_path = os.path.join(directory_path, new_directory_name)
            os.rename(old_directory_path, new_directory_path)

            # Log the directory renaming
            log_change(old_directory_path, f'Renamed directory to {new_directory_name}')
        except ValueError:
            # Log directories that do not match the expected format
            log_change(directory, 'Skipped renaming due to unexpected format')

# Run the pipeline. Splitting must come first: rename_directories only looks at
# directories whose name contains '_split', which split_xml_files creates.
split_xml_files(directory_path)
rename_directories(directory_path)

In [14]:
import os
import re

def remove_specified_content(text):
    """Strip unwanted XML declarations, blocks and tags from ``text``.

    Removed, in this order: the ``<?xml ...?>`` declaration; the
    ``<frontmatter>``, ``<tablecontents>``, ``<titlepage>``, ``<houselords>``
    and ``<col>`` elements together with their content; self-closing
    ``<image src="..."/>`` tags; and any opening ``<hansard ...>`` tag.

    Args:
        text: Raw section text.

    Returns:
        The text with every match of the patterns above deleted.
    """
    unwanted = (
        r'<\?xml.*?\?>',
        r'<frontmatter>.*?</frontmatter>',
        r'<tablecontents>.*?</tablecontents>',
        r'<titlepage>.*?</titlepage>',
        r'<houselords>.*?</houselords>',
        r'<col>.*?</col>',
        r'<image src="[^"]*"/>',
        r'<hansard.*?>',
    )
    # DOTALL lets '.' cross line breaks, so multi-line elements are removed whole.
    for expression in unwanted:
        text = re.compile(expression, re.DOTALL).sub('', text)
    return text

def clean_text(text):
    """Convert lightly marked-up section text to plain text.

    Steps, in order:
      1. delete the em-dash entity ``&#x2014;``;
      2. replace the pound entity ``&#x00A3;`` with ``£``;
      3. turn ``</title>`` and ``</date>`` into line breaks;
      4. remove every remaining tag;
      5. collapse blank lines and strip leading/trailing whitespace.

    The terminating ``;`` of each entity is optional, so both ``&#x00A3;``
    and a bare ``&#x00A3`` are handled.

    Args:
        text: Text that may still contain tags and the entities above.

    Returns:
        The cleaned plain text.
    """
    # Consume the entity's own ';' as well, otherwise "&#x00A3;5" becomes "£;5".
    text = re.sub(r'&#x2014;?', '', text)  # Remove the em-dash entity
    text = re.sub(r'&#x00A3;?', '£', text)  # Replace the pound entity with '£'
    # Closing title/date tags become line breaks so headings stay on their own line.
    text = re.sub(r'</title>', '\n', text)
    text = re.sub(r'</date>', '\n', text)
    text = re.sub(r'<[^>]+>', '', text)  # Drop all remaining tags
    text = re.sub(r'\n\s*\n', '\n', text)  # Collapse blank / whitespace-only lines
    text = text.strip()
    return text

directory_path = 'xml/hofcoms'

# Clean every .txt file found in the sub-directories of directory_path, in place.
for folder, _subfolders, names in os.walk(directory_path):
    # Files sitting directly in directory_path are left untouched.
    if folder != directory_path:
        for name in names:
            if not name.endswith('.txt'):
                continue
            target = os.path.join(folder, name)
            with open(target, 'r', encoding='utf-8') as source:
                raw_text = source.read()
            # Drop the unwanted blocks first, then strip the remaining markup.
            tidied = clean_text(remove_specified_content(raw_text))
            with open(target, 'w', encoding='utf-8') as sink:
                sink.write(tidied)

In [None]:
# Construct a URL and prepend it to each page, for citation purposes

def construct_url_from_filename(filename):
    """Build the ``<url>`` citation line for a split file from its name.

    The filename is expected to end with
    ``"<index>-...<D>(st|nd|rd|th) <Month>, <YYYY>.txt"``, for example
    ``"1-Thursday, 14th February, 1901.txt"``. The month may be a full English
    month name or an abbreviation of at least three letters (``Feb``,
    ``Sept``); matching is case-insensitive.

    Args:
        filename: Name of the file (no directory part is needed).

    Returns:
        ``"<url>{base}{year}/{mon}/{day}</url>\\n"`` where ``mon`` is the
        three-letter lowercase month, or ``""`` when the filename does not
        match the expected pattern or the month is not recognised.
    """
    base_url = "https://api.parliament.uk/historic-hansard/sittings/"
    month_abbreviations = {
        "january": "jan", "february": "feb", "march": "mar", "april": "apr",
        "may": "may", "june": "jun", "july": "jul", "august": "aug",
        "september": "sep", "october": "oct", "november": "nov", "december": "dec"
    }

    match = re.search(r'(\d+)-.*?(\d{1,2})(?:st|nd|rd|th) ([A-Za-z]+), (\d{4})\.txt$', filename)
    if not match:
        return ""

    day, month, year = match.group(2), match.group(3).lower(), match.group(4)

    month_abbr = month_abbreviations.get(month)
    if month_abbr is None and len(month) >= 3:
        # Accept abbreviations: three or more letters identify a month uniquely.
        month_abbr = next(
            (abbr for full, abbr in month_abbreviations.items() if full.startswith(month)),
            None,
        )
    if month_abbr is None:
        # An unknown month would produce a broken URL with an empty path
        # segment; emit no URL at all instead.
        return ""

    url = f"{base_url}{year}/{month_abbr}/{day}"
    return f"<url>{url}</url>\n"

# Prepend the citation URL line to every .txt file in the sub-directories of
# directory_path. Safe to re-run: files that already start with their URL line
# are left unchanged.
for root, dirs, files in os.walk(directory_path):
    # Skip files directly under directory_path
    if root == directory_path:
        continue
    for file in files:
        if not file.endswith('.txt'):
            continue
        url_string = construct_url_from_filename(file)
        if not url_string:
            # No URL could be derived from the name, so there is nothing to add.
            continue
        file_path = os.path.join(root, file)
        with open(file_path, 'r+', encoding='utf-8') as f:
            content = f.read()
            if content.startswith(url_string):
                # Already prepended by an earlier run of this cell.
                continue
            f.seek(0, 0)
            f.write(url_string + content)
            # Newline translation on read can make the rewritten text shorter
            # than the bytes on disk; cut off anything left past the new end.
            f.truncate()