# Preprocessing Sidrac

The Sidrac is a prose text and needed some extra preprocessing. This notebook is not really relevant for the analysis, but it is included for completeness.

In [None]:
import xml.etree.ElementTree as ET

def extract_text_from_xml(xml_file):
    """Return all text found inside ``<text>`` elements of an XML file.

    The document is walked in document order. Every piece of character
    data located inside a ``<text>`` element (its own text, the text of
    any descendant, and the tails following those descendants) is
    stripped of surrounding whitespace and collected.

    Parameters
    ----------
    xml_file : str or path-like or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    str
        The non-empty text fragments joined with newlines; an empty
        string when the document has no ``<text>`` content.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    texts = []

    # Recursive function to extract all text within <text> elements
    def extract_text(element, inside_text=False):
        # Once a <text> element has been entered, everything below it counts.
        inside_text = inside_text or element.tag == "text"

        if inside_text and element.text:
            texts.append(element.text.strip())

        for child in element:
            # Descend first so fragments stay in document order
            # (child content comes before the child's tail).
            extract_text(child, inside_text)
            # A child's tail belongs to the current element, so it is only
            # kept when the current element lies inside <text>.
            if inside_text and child.tail:
                texts.append(child.tail.strip())

    extract_text(root)

    return "\n".join(t for t in texts if t)  # Join all extracted texts

# Usage example: extract the text from the Sidrac XML file and print all of it
xml_file = "../sidrac/sidrac_cdrom.xml"  # Replace with your XML file path
extracted_text = extract_text_from_xml(xml_file)
print(extracted_text)

# Element tags of interest in this XML: l, p, head, hi

In [37]:
import os
import pandas as pd
from lxml import etree
from tqdm import tqdm
import re

# Function to process XML files
def process_xml_to_tsv(xml_filepath, tsv_filepath):
    """Extract tokens from an XML file and save them to a TSV file with empty 'pos' and 'lemma' columns.

    Every ``<l>``, ``<p>``, ``<head>`` and ``<hi>`` element that is not itself
    nested inside another of those elements becomes one segment. The
    segment's full text (including nested markup) is cleaned, split on
    whitespace, and written one token per row, followed by a ``$`` row that
    marks the end of the segment.

    Cleaning steps, in order:
      1. remove markers such as ``(B:)`` (letters followed by a colon, in
         parentheses);
      2. collapse runs of whitespace;
      3. remove every character that is neither a word character nor
         whitespace;
      4. drop tokens that contain a digit.

    A segment whose cleaned text equals that of the previously kept segment
    is skipped, and a segment with no surviving tokens produces no rows.

    Parameters
    ----------
    xml_filepath : str
        Path of the XML file to read.
    tsv_filepath : str
        Path of the tab-separated file to write (columns ``word``, ``pos``,
        ``lemma``; UTF-8, with header).

    Returns
    -------
    None
        The result is written to ``tsv_filepath``; the first 20 rows are
        printed as a quick check.
    """
    tree = etree.parse(xml_filepath)

    words = []
    previous_verse = None  # To track the last added verse

    # List of elements to process in order of appearance
    elements_to_extract = ('l', 'p', 'head', 'hi')

    # Iterate over the XML tree while preserving element order
    for element in tree.iter():
        if element.tag not in elements_to_extract:
            continue

        # itertext() on an enclosing element already yields the text of this
        # one, so a nested element (e.g. <hi> inside <l>) must not be emitted
        # again as a segment of its own; that would duplicate its tokens.
        if any(ancestor.tag in elements_to_extract
               for ancestor in element.iterancestors()):
            continue

        # Extract all text content from the element, including nested text
        full_text = ''.join(element.itertext()).strip()

        # Remove tokens like (B:) where B can be any combination of letters
        cleaned_text = re.sub(r'\([A-Za-z]+:\)', '', full_text)

        # Normalize spaces to avoid issues with trailing/extra spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

        # Remove punctuation after eliminating specific tokens
        cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)

        if not cleaned_text:
            continue

        # Skip if the new verse is identical to the previous one
        if cleaned_text == previous_verse:
            continue
        previous_verse = cleaned_text

        # Split into individual words, skipping words containing digits
        tokens = [token for token in cleaned_text.split()
                  if not re.search(r'\d', token)]
        if not tokens:
            # Nothing survived cleaning: do not emit a lone '$' marker
            continue

        # Add empty 'pos' and 'lemma' for every token
        words.extend([token, "", ""] for token in tokens)
        # Add an end-of-segment marker for each element
        words.append(["$", "", ""])

    # Convert to DataFrame and save as TSV
    df = pd.DataFrame(words, columns=["word", "pos", "lemma"])
    df.to_csv(tsv_filepath, index=False, header=True, encoding='utf-8', sep="\t")
    print(df.head(20))

# Run the function: read the source XML and write the one-token-per-row TSV
process_xml_to_tsv('../sidrac/sidrac_cdrom.xml', '../sidrac/sidrac_clean_double.tsv')

         word pos lemma
0    PROLOGHE          
1           $          
2       Dicke          
3      hebbic          
4         die          
5        gene          
6   bescouden          
7           $          
8         Die          
9         hem          
10        ane          
11        die          
12      boeke          
13     houden          
14          $          
15       Daer          
16         sy          
17      clene          
18    profijt          
19       inne          


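The TSV is now annotated with Galahad (outside this notebook); the annotated file, `sidrac_galahad.tsv`, is converted to XML below.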

In [54]:
import pandas as pd
from lxml import etree
from tqdm import tqdm

# Define input and output file paths
input_file = "../sidrac/sidrac_galahad.tsv"  # TSV to convert; must provide 'form', 'pos' and 'lemma' columns
output_file = "../sidrac/sidrac.xml"  # Destination of the generated XML document

# Function to convert TSV to XML
def convert_tsv_to_xml(tsv_filepath, xml_filepath):
    """Convert a TSV file into an XML document.

    The TSV must have the columns ``form``, ``pos`` and ``lemma``, one token
    per row. A row whose ``form`` is ``$`` closes the current line. Each
    non-empty line becomes an ``<l>`` element under the root
    ``<text id="sidrac">`` with the attributes ``n`` (1-based line number),
    ``tokens`` and ``lemmas`` (space-joined), and one ``<w>`` child per token
    holding ``<form>``, ``<lemma>`` and ``<pos>``.

    Tokens that follow the last ``$`` marker are written as a final line
    instead of being discarded.

    Parameters
    ----------
    tsv_filepath : str
        Path of the tab-separated input file.
    xml_filepath : str
        Path of the XML file to write (UTF-8, with XML declaration,
        pretty-printed).

    Returns
    -------
    None
        The XML is written to ``xml_filepath`` and a confirmation is printed.
    """
    # keep_default_na=False keeps tokens such as "NA" or "null" as literal
    # text instead of turning them into missing values; fillna("") still
    # covers rows with fewer fields than the header.
    df = pd.read_csv(
        tsv_filepath, sep="\t", dtype=str, keep_default_na=False
    ).fillna("")

    root = etree.Element("text", id="sidrac")  # Root element <text id="sidrac">

    line_tokens, line_lemmas, line_words = [], [], []  # Buffers for line data
    line_number = 1  # Verse/line counter

    def flush_line():
        """Write the buffered tokens as one <l> element and reset the buffers."""
        nonlocal line_number
        if not line_tokens:
            return
        l_element = etree.SubElement(
            root, "l",
            n=str(line_number),
            tokens=" ".join(line_tokens),
            lemmas=" ".join(line_lemmas)
        )
        for w in line_words:
            l_element.append(w)

        # Reset buffers for the next verse
        line_tokens.clear()
        line_lemmas.clear()
        line_words.clear()
        line_number += 1

    # Iterating over the three columns directly avoids the per-row Series
    # construction of DataFrame.iterrows().
    rows = zip(df["form"], df["pos"], df["lemma"])
    for form, pos, lemma in tqdm(rows, total=len(df), desc="Processing words"):
        if form == "$":  # End of a verse → Create <l> element
            flush_line()
            continue

        # Create <w> element
        w_element = etree.Element("w")
        form_el = etree.SubElement(w_element, "form")
        form_el.text = form

        lemma_el = etree.SubElement(w_element, "lemma")
        lemma_el.text = lemma

        pos_el = etree.SubElement(w_element, "pos")
        pos_el.text = pos

        # Append word data
        line_tokens.append(form)
        line_lemmas.append(lemma)
        line_words.append(w_element)

    # The input may not end with a '$' row; keep the trailing tokens.
    flush_line()

    # Save XML file
    tree = etree.ElementTree(root)
    tree.write(xml_filepath, encoding="utf-8", xml_declaration=True, pretty_print=True)

    print(f"Conversion complete! XML file saved as: {xml_filepath}")

# Run conversion with the input and output paths defined earlier in this cell
convert_tsv_to_xml(input_file, output_file)

Processing words: 100%|████████████████| 91211/91211 [00:01<00:00, 78626.09it/s]


Conversion complete! XML file saved as: ../sidrac/sidrac.xml
