In [11]:
import os
import re
from datetime import datetime

# Root directory holding the source .xml files; the splitting, renaming and
# cleaning steps in this notebook all read from and write into this tree.
directory_path = 'xml/hofcoms'

# Plain-text log that log_change() appends to (relative to the working directory)
log_file_path = 'modification_log.txt'

def log_change(file_path, change_description):
    """Append one timestamped entry to the modification log.

    The entry is written as ``<YYYY-MM-DD HH:MM:SS>: <description> in <path>``
    to the file named by the module-level ``log_file_path``, which is opened
    in append mode so earlier entries are preserved.

    Args:
        file_path: Path of the file or directory the entry refers to.
        change_description: Short text describing what happened.
    """
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    entry = f"{stamp}: {change_description} in {file_path}\n"
    with open(log_file_path, mode='a', encoding='utf-8') as handle:
        handle.write(entry)

def delete_file(file_path):
    """Remove ``file_path`` if present and record the outcome in the log.

    A missing path is not an error: it is logged as
    'File not found for deletion' and the function returns normally.

    Args:
        file_path: Path of the file to delete.
    """
    # Guard clause: nothing to delete, just note it in the log
    if not os.path.exists(file_path):
        log_change(file_path, 'File not found for deletion')
        return

    os.remove(file_path)
    log_change(file_path, 'Deleted file')

# Delete the 1901_index.xml file before processing others.
# NOTE(review): presumably this is removed so that split_xml_files(), which
# processes every '*.xml' entry in directory_path, does not try to split the
# index file - confirm.
delete_file(os.path.join(directory_path, '1901_index.xml'))
        
def split_xml_files(directory_path):
    """Split every ``.xml`` file in ``directory_path`` into per-section files.

    For each ``<name>.xml`` a sibling directory ``<name>_split`` is created
    (if it does not already exist). The file content is cut at every
    ``</housecommons><housecommons>`` boundary (optional whitespace between
    the two tags), and each resulting section that contains a
    ``<date format="...">...</date>`` element is written to
    ``<name>_split/<index>.<date>.txt``, where ``index`` is the 1-based
    position of the section in the file and ``<date>`` is the date text with
    commas removed and spaces turned into underscores.

    Sections without a date element are skipped silently but still consume
    an index, so numbering may have gaps.

    Args:
        directory_path: Directory whose top-level ``.xml`` files are split.
            Sub-directories are not descended into.
    """
    # Text of the first <date format="...">...</date> element in a section
    date_regex = re.compile(r'<date format="[^"]+">([^<]+)</date>')
    # Boundary between two consecutive <housecommons> sections
    split_regex = re.compile(r'</housecommons>\s*<housecommons>')

    for filename in os.listdir(directory_path):
        if not filename.endswith('.xml'):
            continue

        file_path = os.path.join(directory_path, filename)
        # Each source file gets its own "<basename>_split" output directory
        base_filename = os.path.splitext(filename)[0]
        new_directory_path = os.path.join(directory_path, base_filename + "_split")
        os.makedirs(new_directory_path, exist_ok=True)

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        for index, section in enumerate(split_regex.split(content), start=1):
            # The split consumes the opening tag of every section but the
            # first, so put it back where it is missing.
            if not section.startswith('<housecommons>'):
                section = '<housecommons>\n\n' + section

            match = date_regex.search(section)
            if match is None:
                continue

            # Make the date text usable as a file name: drop commas, replace
            # spaces, and neutralise path separators so the name cannot point
            # into another directory.
            safe_date_str = (
                match.group(1)
                .replace(',', '')
                .replace(' ', '_')
                .replace('/', '-')
                .replace('\\', '-')
            )
            # Previously the name was built as f"{index}.{date}txt", which only
            # produced a ".txt" extension when the date text itself ended in a
            # period. Strip any trailing period and add the extension
            # explicitly: period-terminated dates yield exactly the same name
            # as before, and all other dates now get a real extension.
            new_filename = f"{index}.{safe_date_str.rstrip('.')}.txt"
            new_file_path = os.path.join(new_directory_path, new_filename)

            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                new_file.write(section)

def rename_directories(directory_path):
    """Rename ``<start>_<end>_split`` directories to a numbered, readable form.

    Every sub-directory of ``directory_path`` whose name contains ``_split``
    is considered. Names are expected to look like
    ``<YYYYMMDD>_<YYYYMMDD>_split``; such a directory is renamed to
    ``<index>.<StartMonthDD>-<EndMonthDD>-<EndYear>`` (month names come from
    ``strftime('%B')``), and the rename is logged via ``log_change``.

    Directories are numbered from 1 in order of their start date. A directory
    whose name does not match the expected format is logged as skipped and
    left in place; it still consumes an index.

    Args:
        directory_path: Directory containing the ``*_split`` sub-directories.
    """
    def start_date_key(name):
        # The sort runs before the per-directory try/except below, so a name
        # with a non-date prefix used to raise ValueError here and abort the
        # whole run. Sort such names last instead; the loop then logs and
        # skips them as intended.
        try:
            return datetime.strptime(name.split('_')[0], '%Y%m%d')
        except ValueError:
            return datetime.max

    directories = [
        d for d in os.listdir(directory_path)
        if os.path.isdir(os.path.join(directory_path, d)) and '_split' in d
    ]
    # Sort directories by start date for consistent ordering
    directories.sort(key=start_date_key)

    for index, directory in enumerate(directories, start=1):
        try:
            # Everything before "_split" must be exactly "<start>_<end>";
            # the tuple unpacking raises ValueError otherwise.
            dates_str = directory.split('_split')[0]
            start_date_str, end_date_str = dates_str.split('_')

            start_date = datetime.strptime(start_date_str, '%Y%m%d')
            end_date = datetime.strptime(end_date_str, '%Y%m%d')

            # e.g. "February14" / "March01"; the year is taken from the end date
            formatted_start_date = start_date.strftime('%B%d')
            formatted_end_date = end_date.strftime('%B%d')
            year = end_date.strftime('%Y')
        except ValueError:
            # Log directories that do not match the expected format
            log_change(directory, 'Skipped renaming due to unexpected format')
        else:
            new_directory_name = f"{index}.{formatted_start_date}-{formatted_end_date}-{year}"

            old_directory_path = os.path.join(directory_path, directory)
            new_directory_path = os.path.join(directory_path, new_directory_name)
            os.rename(old_directory_path, new_directory_path)

            # Log the directory renaming
            log_change(old_directory_path, f'Renamed directory to {new_directory_name}')

# Run the pipeline: first split each .xml file into per-section files inside
# "<name>_split" directories, then rename those directories.
split_xml_files(directory_path)
rename_directories(directory_path)

In [None]:
#Do some basic cleaning

def remove_specified_content(text, file_path):
    """Strip unwanted XML constructs from ``text`` and return the result.

    Removed, in this order: the ``<?xml ...?>`` header; the
    ``<frontmatter>``, ``<tablecontents>``, ``<titlepage>``, ``<houselords>``
    and ``<col>`` elements together with their content; self-closing
    ``<image src="..."/>`` tags; and the opening ``<hansard ...>`` tag (the
    closing ``</hansard>`` tag is left in place). Matching is non-greedy and
    spans line breaks.

    If anything was removed, one entry is written to the log for ``file_path``.

    Args:
        text: The document text to clean.
        file_path: Path reported in the log entry; not read or written here.

    Returns:
        The cleaned text (equal to ``text`` when nothing matched).
    """
    unwanted = (
        r'<\?xml.*?\?>',                       # XML header
        r'<frontmatter>.*?</frontmatter>',
        r'<tablecontents>.*?</tablecontents>',
        r'<titlepage>.*?</titlepage>',
        r'<houselords>.*?</houselords>',
        r'<col>.*?</col>',                     # <col> elements and their content
        r'<image src="[^"]*"/>',               # self-closing image tags
        r'<hansard.*?>',                       # opening <hansard> tag with attributes
    )

    cleaned = text
    for expr in unwanted:
        cleaned = re.sub(expr, '', cleaned, flags=re.DOTALL)

    # Only write a log entry when something was actually removed
    if cleaned != text:
        log_change(file_path, 'Removed specified content (XML header/frontmatter/tablecontents/titlepage/houselords/col/image/hansard opening tag)')
    return cleaned

# Clean, in place, every .xml file found anywhere under directory_path
for root, dirs, files in os.walk(directory_path):
    for file in files:
        # Anything that is not an XML file is left untouched
        if not file.endswith('.xml'):
            continue
        file_path = os.path.join(root, file)
        # Load the whole document into memory
        with open(file_path, encoding='utf-8') as f:
            content = f.read()
        # Strip the XML header, front matter and the other unwanted elements
        cleaned_content = remove_specified_content(content, file_path)
        # Overwrite the file with the cleaned text
        with open(file_path, mode='w', encoding='utf-8') as f:
            f.write(cleaned_content)