# In [None]:
import os
import re
from datetime import datetime

# File that receives one timestamped line per logged change
log_file_path = "modification_log.txt"

# Root folder that is walked for XML files to clean
directory_path = "xml/hofcoms"

# Function to delete a specific file and log the deletion
def delete_file(file_path):
    """Delete ``file_path`` and record the outcome through ``log_change``.

    The removal is attempted directly instead of checking for existence
    first, so a file that disappears between a check and the removal cannot
    raise an unhandled error.

    Args:
        file_path: Path of the file to delete.

    Raises:
        OSError: For failures other than a missing path (for example when
            ``file_path`` is a directory or permission is denied).
    """
    try:
        os.remove(file_path)
    except (FileNotFoundError, NotADirectoryError):
        # Missing file, or a parent component that is not a directory:
        # nothing to delete, so only record that fact.
        log_change(file_path, 'File not found for deletion')
    else:
        log_change(file_path, 'Deleted file')

# Function to log changes
def log_change(file_path, change_description):
    """Append one timestamped entry to the log file at ``log_file_path``.

    Args:
        file_path: Path of the file the entry refers to.
        change_description: Short text describing what happened.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(log_file_path, 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp}: {change_description} in {file_path}\n")


# Regular expression to match <frontmatter></frontmatter> and its content.
# re.DOTALL lets '.' span newlines; '.*?' is non-greedy so each section ends
# at its own closing tag.
_FRONTMATTER_RE = re.compile(r'<frontmatter>.*?</frontmatter>', re.DOTALL)


# Function to remove content between <frontmatter></frontmatter> tags
def remove_frontmatter_content(text, file_path):
    """Return ``text`` with every ``<frontmatter>...</frontmatter>`` section removed.

    The tags themselves are removed together with their content. When at
    least one section was removed, a log entry is written for ``file_path``.

    Args:
        text: Document content to clean.
        file_path: Path used only for the log entry.

    Returns:
        The cleaned text; equal to ``text`` when no section was found.
    """
    modified_text, removed_count = _FRONTMATTER_RE.subn('', text)
    # Every match is non-empty, so a positive count means the text changed.
    if removed_count:
        log_change(file_path, 'Removed content between <frontmatter></frontmatter> tags')
    return modified_text


# Delete the 1901_index.xml file before processing others.
# This call is placed after the definition of log_change because delete_file
# calls log_change; running it earlier raised NameError.
delete_file(os.path.join(directory_path, '1901_index.xml'))

# Traverse the directory and strip <frontmatter> sections from every XML file
for root, dirs, files in os.walk(directory_path):
    for file in files:
        # Only XML files are processed
        if not file.endswith('.xml'):
            continue
        file_path = os.path.join(root, file)
        # newline='' disables newline translation, so the file's existing
        # line endings are read (and later written) unchanged.
        with open(file_path, 'r', encoding='utf-8', newline='') as f:
            content = f.read()
        # Remove content between <frontmatter></frontmatter> tags
        cleaned_content = remove_frontmatter_content(content, file_path)
        # Leave files without frontmatter untouched instead of rewriting
        # them with identical content.
        if cleaned_content == content:
            continue
        # Write the cleaned content back to the file
        with open(file_path, 'w', encoding='utf-8', newline='') as f:
            f.write(cleaned_content)