In [8]:
# Basic cleaning: delete the index file and strip unwanted tags from the XML files

import os
import re
from datetime import datetime

# Root folder that is walked for XML files to clean
directory_path = "xml/hofcoms"

# Text file to which every logged modification is appended
log_file_path = "modification_log.txt"

# Function to log changes
def log_change(file_path, change_description):
    """Append one timestamped entry to the modification log.

    The entry reads ``<YYYY-MM-DD HH:MM:SS>: <change_description> in <file_path>``
    and is appended (UTF-8) to the file named by the module-level
    ``log_file_path``.

    Args:
        file_path: Path of the file the change applies to (only used in the text).
        change_description: Short description of what was done.
    """
    now = datetime.now()
    stamp = now.strftime('%Y-%m-%d %H:%M:%S')
    entry = f"{stamp}: {change_description} in {file_path}\n"
    with open(log_file_path, 'a', encoding='utf-8') as log_handle:
        log_handle.write(entry)

# Function to delete a specific file and log the deletion
def delete_file(file_path):
    """Delete *file_path* if it exists and log what happened.

    A missing file is not an error: it is only recorded in the log as
    'File not found for deletion'. A successful removal is recorded as
    'Deleted file'.

    Args:
        file_path: Path of the file to remove.
    """
    # Guard clause: nothing to remove, just record that fact
    if not os.path.exists(file_path):
        log_change(file_path, 'File not found for deletion')
        return

    os.remove(file_path)
    log_change(file_path, 'Deleted file')

# Remove the 1901_index.xml file up front, before the other files are processed
index_file_path = os.path.join(directory_path, '1901_index.xml')
delete_file(index_file_path)

# Function to remove content between xml tags
def remove_specified_content(text, file_path):
    """Strip the XML header, selected elements and the <hansard> opening tag.

    The regexes are applied one after another, each on the result of the
    previous substitution, with ``.`` also matching newlines. If the text
    was changed at all, a single entry is written to the log via
    ``log_change``.

    Args:
        text: Raw file content.
        file_path: Path the content came from (only used for the log entry).

    Returns:
        The text with every match removed.
    """
    # Order matters: later regexes see the output of the earlier ones.
    regexes = (
        r'<\?xml.*?\?>',                      # XML header
        r'<frontmatter>.*?</frontmatter>',
        r'<tablecontents>.*?</tablecontents>',
        r'<titlepage>.*?</titlepage>',
        r'<houselords>.*?</houselords>',
        r'<col>.*?</col>',                    # <col>…</col> including content
        r'<image src="[^"]*"/>',              # self-closing <image src="…"/>
        r'<hansard.*?>',                      # <hansard …> opening tag with attributes
    )

    cleaned = text
    for regex in regexes:
        cleaned = re.sub(regex, '', cleaned, flags=re.DOTALL)

    # Only log when something was actually removed
    if cleaned != text:
        log_change(file_path, 'Removed specified content (XML header/frontmatter/tablecontents/titlepage/houselords/col/image/hansard opening tag)')
    return cleaned

# Traverse the directory
for folder, _subfolders, names in os.walk(directory_path):
    for name in names:
        # Only XML files are cleaned; anything else is left untouched
        if not name.endswith('.xml'):
            continue

        xml_path = os.path.join(folder, name)

        # Load the whole file as UTF-8 text
        with open(xml_path, 'r', encoding='utf-8') as source:
            raw = source.read()

        # Strip the unwanted header/tags (the helper logs when something changed)
        stripped = remove_specified_content(raw, xml_path)

        # Overwrite the file in place with the cleaned text
        with open(xml_path, 'w', encoding='utf-8') as target:
            target.write(stripped)

In [None]:
# Split an XML file into separate text files, one per <date> value

import re
import os

def split_file_by_date(xml_file_path, output_directory):
    """Split an XML file into one text file per distinct ``<date>`` value.

    The file is cut at every ``<date format="...">text</date>`` tag. The text
    that follows a tag (up to the next tag or the end of the file) is written
    to that date's file; the tags themselves are dropped. Text that appears
    before the first tag has no earlier date, so it is kept at the start of
    the first date's file.

    Dates are identified by the ``format`` attribute. Each distinct date gets
    one file named ``"<n>. <text>.txt"``, where ``n`` is the order of first
    appearance (starting at 1) and ``text`` is the tag text of that first
    appearance. Later tags with the same ``format`` value add to the same
    file even if their text is worded differently.

    If the file contains no ``<date>`` tag, no output file is written.

    Args:
        xml_file_path: Path of the UTF-8 XML file to split.
        output_directory: Directory for the output files; created if missing.
    """
    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    with open(xml_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    date_pattern = re.compile(r'<date format="([^"]+)">([^<]+)</date>')
    # Path separators and control characters must not end up in a file name
    unsafe_chars = re.compile(r'[\\/\x00-\x1f]')

    file_names = {}  # date -> output file name, fixed at first appearance
    sections = {}    # date -> pieces of text belonging to that date, in order

    current_date = None  # date of the most recent tag seen so far
    start = 0            # offset just past the most recent tag
    for match in date_pattern.finditer(content):
        date, date_text = match.groups()
        if date not in file_names:
            safe_text = unsafe_chars.sub('_', date_text)
            file_names[date] = f"{len(file_names) + 1}. {safe_text}.txt"
            sections[date] = []

        # The text before this tag belongs to the PREVIOUS date; only the
        # preamble before the very first tag is attached to the first date.
        owner = date if current_date is None else current_date
        sections[owner].append(content[start:match.start()])

        current_date = date
        start = match.end()

    # No <date> tag at all: nothing to split
    if current_date is None:
        return

    # Whatever follows the last tag belongs to that last date
    sections[current_date].append(content[start:])

    # Write each file exactly once, so a re-run never duplicates content
    for date, file_name in file_names.items():
        with open(os.path.join(output_directory, file_name), 'w', encoding='utf-8') as new_file:
            new_file.write(''.join(sections[date]))

# Example: split one XML file into per-date text files under 'output_directory'
split_file_by_date(
    xml_file_path='19010123_19010227.xml',
    output_directory='output_directory',
)