# In [17]:
# Do some basic cleaning

import os
import re
from datetime import datetime

# Root directory that holds the XML files to clean (walked recursively with os.walk)
directory_path = 'xml/hofcoms'

# Plain-text log file that log_change() appends one line to per recorded change
log_file_path = 'modification_log.txt'

def log_change(file_path, change_description):
    """Append one timestamped entry to the modification log.

    The entry has the form ``"<YYYY-MM-DD HH:MM:SS>: <change_description> in
    <file_path>"`` and is appended to the file named by the module-level
    ``log_file_path``.
    """
    now = datetime.now()
    with open(log_file_path, 'a', encoding='utf-8') as sink:
        # The format spec after the first colon is handed to strftime.
        sink.write(f"{now:%Y-%m-%d %H:%M:%S}: {change_description} in {file_path}\n")

def delete_file(file_path):
    """Delete *file_path* if it exists and record the outcome in the log.

    A missing file is not an error: it is logged as
    'File not found for deletion' and nothing else happens.
    """
    if not os.path.exists(file_path):
        log_change(file_path, 'File not found for deletion')
        return

    os.remove(file_path)
    log_change(file_path, 'Deleted file')

# Remove 1901_index.xml up front so the directory walk that cleans the XML
# files never opens or rewrites it; delete_file() logs whether it was deleted
# or not found.
delete_file(os.path.join(directory_path, '1901_index.xml'))

def remove_specified_content(text, file_path):
    """Return *text* with the unwanted XML markup stripped out.

    Removed, in this order: the ``<?xml ...?>`` declaration; the
    ``<frontmatter>``, ``<tablecontents>``, ``<titlepage>``, ``<houselords>``
    and ``<col>`` elements together with their content; self-closing
    ``<image src="..."/>`` tags; and any tag beginning with ``<hansard``.

    *file_path* is only used for the log entry, which is written through
    ``log_change`` when at least one removal actually changed the text.
    """
    # Each expression is applied to the output of the previous one, so the
    # order matters and is kept as is. All are matched with re.DOTALL so that
    # '.' also spans line breaks.
    expressions = (
        r'<\?xml.*?\?>',                       # XML header
        r'<frontmatter>.*?</frontmatter>',
        r'<tablecontents>.*?</tablecontents>',
        r'<titlepage>.*?</titlepage>',
        r'<houselords>.*?</houselords>',
        r'<col>.*?</col>',                     # <col></col> tags and their content
        r'<image src="[^"]*"/>',               # <image src="..."/> tags
        r'<hansard.*?>',                       # <hansard> opening tag with its attributes
    )

    cleaned = text
    for expression in expressions:
        cleaned = re.sub(expression, '', cleaned, flags=re.DOTALL)

    if cleaned != text:
        log_change(file_path, 'Removed specified content (XML header/frontmatter/tablecontents/titlepage/houselords/col/image/hansard opening tag)')
    return cleaned

# Clean every XML file found anywhere under directory_path, in place
for root, dirs, files in os.walk(directory_path):
    for file in files:
        # Anything that is not an XML file is left untouched
        if not file.endswith('.xml'):
            continue

        file_path = os.path.join(root, file)

        # Load the whole document into memory
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Strip the XML header, the front-matter style sections, <col> and
        # <image> tags and the <hansard> opening tag
        cleaned_content = remove_specified_content(content, file_path)

        # Overwrite the original file with the cleaned text
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

# In [18]:
import re
import os
import glob

def split_files_in_directory(directory_path):
    """Split every ``*.xml`` file directly inside *directory_path*.

    For a file named ``<name>.xml`` the pieces are written by
    ``split_file_by_date`` into a sibling directory called ``<name>_split``.
    Subdirectories are not searched.
    """
    xml_glob = os.path.join(directory_path, '*.xml')
    for source_path in glob.glob(xml_glob):
        stem, _extension = os.path.splitext(os.path.basename(source_path))
        target_directory = os.path.join(directory_path, stem + "_split")
        split_file_by_date(source_path, target_directory)

def split_file_by_date(xml_file_path, output_directory):
    """Split one XML file into text files, one per dated section.

    The document is cut at every ``<date format="...">text</date>`` tag. The
    text between the end of one tag and the start of the next (or the end of
    the file for the last tag) is written to ``"<n>. <text>.txt"`` inside
    *output_directory*, which is created if needed. ``<n>`` numbers the
    distinct ``format`` attribute values in order of first appearance, and
    ``<text>`` is the tag's text. Sections that map to the same file name are
    concatenated in document order.

    Each output file is truncated the first time it is written during a call
    and appended to afterwards, so running the split again replaces the
    previous output instead of duplicating it.

    Processing stops at the first section that contains a
    ``<p>Adjourned at...</p>`` paragraph: everything from the following date
    tag onward is handed to ``log_discarded_content`` and no further files are
    written.

    NOTE(review): the section that contains the "Adjourned at" paragraph is
    itself neither written nor logged. This matches the previous behaviour;
    confirm that dropping it is intended.
    """
    os.makedirs(output_directory, exist_ok=True)

    with open(xml_file_path, 'r', encoding='utf-8') as source:
        content = source.read()

    date_pattern = re.compile(r'<date format="([^"]+)">([^<]+)</date>')
    adjourned_pattern = re.compile(r'<p>Adjourned at.*?</p>')

    date_indices = {}      # format attribute value -> 1-based order of first appearance
    written_names = set()  # output file names already written during this call

    matches = list(date_pattern.finditer(content))
    for i, match in enumerate(matches):
        date, date_text = match.groups()
        end = match.end()
        # A section runs up to the next date tag, or to the end of the file.
        next_start = matches[i + 1].start() if i + 1 < len(matches) else len(content)

        if adjourned_pattern.search(content, end, next_start):
            log_discarded_content(content[next_start:], xml_file_path, output_directory)
            break

        # The default is computed before insertion, so a new date gets the
        # next free number and a known date keeps the one it already has.
        index = date_indices.setdefault(date, len(date_indices) + 1)
        file_name = f"{index}. {date_text}.txt"

        # Decide the mode per file name, not per date: the date is already in
        # date_indices by now, which is why the old check always chose 'a'.
        mode = 'a' if file_name in written_names else 'w'
        written_names.add(file_name)

        with open(os.path.join(output_directory, file_name), mode, encoding='utf-8') as out:
            out.write(content[end:next_start])
    
def log_discarded_content(discarded_content, xml_file_path, output_directory):
    """Save *discarded_content* to ``<name>_discarded.log`` in *output_directory*.

    ``<name>`` is the base name of *xml_file_path* without its extension. An
    existing log with the same name is overwritten.
    """
    stem, _extension = os.path.splitext(os.path.basename(xml_file_path))
    target_path = os.path.join(output_directory, stem + "_discarded.log")
    with open(target_path, 'w', encoding='utf-8') as sink:
        sink.write(discarded_content)

# Ensure you define `directory_path` with the path to your directory containing XML files
# split_files_in_directory(directory_path)