# In [1]:
import pandas as pd
import os
import xml.etree.ElementTree as ET
from datetime import datetime

# Path to the Excel workbook that holds one speech per row.
excel_file = r"C:/Documents/Renaming XML Files/NM Speeches-XML-Cleaning (Merged Transcripts V4).xlsx"

# Load the workbook into a pandas DataFrame (the loop below reads the
# 'File' and 'p' columns from it).
df = pd.read_excel(excel_file)

# Create the output folder (relative to the current working directory).
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
output_folder = "new xml"
os.makedirs(output_folder, exist_ok=True)

# Iterate over each row in the DataFrame and write one XML file per speech.
#
# Each row is expected to provide:
#   'File' - a file name that starts with a YYYY-MM-DD date and whose last
#            underscore-separated token (before the first '.') is the term key
#   'p'    - the speech text
#
# Rows with a missing file name or an unparsable date are reported and skipped
# instead of aborting the whole run.
for index, row in df.iterrows():
    # Get the filename and text from the row
    filename = row['File']
    text = row['p']

    # An empty spreadsheet cell is read by pandas as NaN (a float); slicing or
    # splitting it would raise TypeError, so skip such rows explicitly.
    # index + 2 is presumably the spreadsheet row number (header row + 1-based).
    if not isinstance(filename, str):
        print(f"Skipping row {index + 2} due to missing filename: {filename!r}")
        continue

    # The first 10 characters of the filename are taken as the date
    date_str = filename[:10]

    # Last '_'-separated token, up to its first '.', is the term key
    term_key = filename.split('_')[-1].split('.')[0]

    try:
        # Parse the date string into a datetime object
        date = datetime.strptime(date_str, "%Y-%m-%d")
        formatted_date = date.strftime("%d %B %Y")  # e.g. "18 July 1874"
    except ValueError:
        print(f"Skipping row {index + 2} due to invalid date format: {date_str}")
        continue

    # ElementTree can only serialize str text: a NaN (empty cell) or numeric
    # cell would make tree.write() fail after the file has been opened,
    # leaving a truncated file. Normalize to a string up front.
    body_text = "" if pd.isna(text) else str(text)

    # Create the XML structure:
    # <speech><date when=...>...</date><text><term key=.../><body>...</body></text></speech>
    root = ET.Element("speech")
    date_elem = ET.SubElement(root, "date", when=date_str)
    date_elem.text = formatted_date
    text_elem = ET.SubElement(root, "text")
    ET.SubElement(text_elem, "term", key=term_key)
    body_elem = ET.SubElement(text_elem, "body")
    body_elem.text = body_text

    # Swap only the final extension for '.xml'. str.replace('.txt', '.xml')
    # would also rewrite '.txt' occurring mid-name and would leave names
    # without a '.txt' suffix with no '.xml' extension at all.
    xml_name = os.path.splitext(filename)[0] + '.xml'
    xml_file = os.path.join(output_folder, xml_name)

    # Write the XML file to disk
    tree = ET.ElementTree(root)
    tree.write(xml_file, encoding='utf-8', xml_declaration=True)