In [6]:
from lxml import etree

def extract_text_with_structure(tei_file):
    """Extract title, abstract and body sections from a TEI XML file as text.

    The output keeps the document structure: a "Document Title:" entry, an
    "Abstract:" entry, then for every ``<div>`` inside ``<body>`` its heading
    followed by one entry per paragraph. Entries are joined with newlines.

    Args:
        tei_file: Path (or file-like object) of the TEI XML document, as
            accepted by ``lxml.etree.parse``.

    Returns:
        A single string containing the extracted text.
    """
    # Load and parse the TEI XML file
    tree = etree.parse(tei_file)
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}  # Define the TEI namespace

    def flatten(element):
        # normalize-space(.) concatenates every descendant text node, so
        # inline children (<ref>, <hi>, ...) do not split the text, and it
        # collapses the indentation whitespace left by the XML layout.
        return element.xpath('normalize-space(.)')

    # Nearest enclosing <div> of a node ([1] on a reverse axis = closest).
    nearest_div = etree.XPath('ancestor::tei:div[1]', namespaces=ns)

    structured_text = []

    # Document title: first <title> of the <titleStmt> that has any text.
    for title in tree.xpath('//tei:titleStmt/tei:title', namespaces=ns):
        title_text = flatten(title)
        if title_text:
            structured_text.append("Document Title: " + title_text + "\n")
            break

    # Abstract: one normalized line per <abstract> element.
    abstract_parts = [
        text
        for text in map(flatten, tree.xpath('//tei:abstract', namespaces=ns))
        if text
    ]
    if abstract_parts:
        structured_text.append("Abstract:\n" + "\n".join(abstract_parts) + "\n")

    # Sections with headers and paragraphs (every <div> under <body>,
    # including nested ones, visited in document order).
    for section in tree.xpath('//tei:text//tei:body//tei:div', namespaces=ns):
        # Only the section's own heading, not the heading of a nested <div>.
        heads = section.xpath('./tei:head', namespaces=ns)
        if heads:
            head_text = flatten(heads[0])
            if head_text:
                structured_text.append("\n" + head_text + "\n")

        for paragraph in section.xpath('.//tei:p', namespaces=ns):
            # A paragraph inside a nested <div> belongs to that <div>; it is
            # emitted when the loop reaches it, so skip it here to avoid
            # writing the same text once per ancestor section.
            if nearest_div(paragraph)[0] is not section:
                continue
            paragraph_text = flatten(paragraph)
            if paragraph_text:
                structured_text.append(paragraph_text + "\n")

    # Convert the list of strings into a single string
    return "\n".join(structured_text)

# Input: the TEI XML file produced by Grobid.
# Output: the plain-text file the extracted content is written to.
tei_file = '/Users/shubh/Desktop/untitled folder/GROBID/Grobid_RR_2024_LevelIII_combined.xml'
output_file_path = '/Users/shubh/Desktop/untitled folder/GROBID/Grobid_RR_2024_LevelIII_combined.txt'

# Pull the text out of the TEI document, keeping its title/section layout
structured_text = extract_text_with_structure(tei_file)

# Save the result as UTF-8 text, replacing any previous output file
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write(structured_text)

print(f"Structured text extraction completed. The text has been saved to: {output_file_path}")


Structured text extraction completed. The text has been saved to: /Users/shubh/Desktop/untitled folder/GROBID/Grobid_RR_2024_LevelIII_combined.txt
