# Script for xml to txt conversion

This script converts XML files to TXT files. The XML files are read from the `resources/xml` folder and the TXT files are saved in the `resources/input` folder. Each TXT file is named after the folder (group) that contains the XML files it was generated from.

### Xml partitioning

XML files are organized in logical groups: when the XML files are converted to TXT files, each TXT file contains the aggregated text of all the XML files in the same group.
This allows the generation of larger txt files that can be used for testing the Workflow.

### Code

In [4]:
# Adding imports
import os
import re
import xml.etree.ElementTree as ET

In [5]:
# Input directory (XML files) and output directory (text files); both are
# sub-folders of ../resources
xml_dir, txt_dir = (
    os.path.join('..', 'resources', leaf) for leaf in ('xml', 'input')
)

# Show the resolved locations, one per line
print(
    f'XML directory: {xml_dir}',
    f'Text directory: {txt_dir}',
    sep='\n',
)

XML directory: ../resources/xml
Text directory: ../resources/input


In [6]:
# Clean the txt_dir directory
# Create the directory first when it is missing: os.listdir would otherwise
# raise FileNotFoundError on a fresh checkout.
os.makedirs(txt_dir, exist_ok=True)
for filename in os.listdir(txt_dir):
    file_path = os.path.join(txt_dir, filename)
    # Only regular files are removed; sub-directories are left untouched
    if not os.path.isfile(file_path):
        continue
    try:
        os.remove(file_path)
    except OSError as e:
        # Best effort: report the failure and keep cleaning the other files
        print(f'Failed to delete {file_path}. Reason: {e}')
    else:
        print(f'{file_path} has been deleted')

In [7]:
# Function to extract the text from the XML file
def extract_text_from_xml(xml_file, namespace='{http://www.mediawiki.org/xml/export-0.10/}'):
    """Return the concatenated page texts found in an XML export file.

    Args:
        xml_file: Path of the XML file to read (handed to ``ET.parse``).
        namespace: XML namespace of the export, in ElementTree ``{uri}``
            notation. Defaults to the MediaWiki export 0.10 namespace.

    Returns:
        The text of every page's ``revision/text`` element, with
        ``<div ...>...</div>`` spans removed, joined by a blank line.
        An empty string when no page carries any text.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(xml_file).getroot()

    # Built once instead of on every loop iteration
    text_path = f'.//{namespace}revision/{namespace}text'
    div_pattern = re.compile(r'<div.*?>.*?</div>', flags=re.DOTALL)

    content = []
    for page in root.findall(f'.//{namespace}page'):
        text_element = page.find(text_path)
        # An empty <text/> element has .text set to None, which re.sub cannot
        # handle; such pages carry nothing to extract, so they are skipped.
        if text_element is None or text_element.text is None:
            continue
        # Remove the unwanted <div> blocks
        content.append(div_pattern.sub('', text_element.text))

    print(f'{len(content)} pages have been extracted from {xml_file}')
    return '\n\n'.join(content)

In [8]:
# Walk through xml_dir directory
for dirpath, dirnames, filenames in os.walk(xml_dir):
    # os.walk yields directories and files in arbitrary order; sorting both
    # makes the order in which texts are appended to a group file reproducible.
    dirnames.sort()
    for filename in sorted(filenames):
        # Only files with the .xml extension are converted
        if not filename.endswith('.xml'):
            continue
        xml_file = os.path.join(dirpath, filename)
        # Extract the text from the XML file
        text_content = extract_text_from_xml(xml_file)
        # The output file is named after the parent directory, so every XML
        # file of the same folder ends up in the same text file
        output_file = os.path.join(txt_dir, os.path.basename(dirpath) + '.txt')
        # Append the extracted text to the output file
        with open(output_file, 'a', encoding='utf-8') as f:
            f.write(text_content + '\n')

        print(f'The content of {filename} has been appended to {output_file}')

573 pages have been extracted from ../resources/xml/ita-XXI/Wikisource-20240523182141.xml
The content of Wikisource-20240523182141.xml has been appended to ../resources/input/ita-XXI.txt
