In [3]:
from xml.etree import ElementTree as ET
import re

def remove_references_from_text(file_path):
    # Load XML file
    tree = ET.parse(file_path)
    root = tree.getroot()
    namespaces = {"tei": "http://www.tei-c.org/ns/1.0"}

    # Extract references
    references = []
    for bibl in root.findall(".//tei:listBibl/tei:biblStruct", namespaces):
        ref_text = []
        for child in bibl.iter():
            if child.text:
                ref_text.append(child.text.strip())
        references.append(" ".join(ref_text))

    # Extract body text
    body_text = []
    for body in root.findall(".//tei:body", namespaces):
        for paragraph in body.findall(".//tei:p", namespaces):
            if paragraph.text:
                body_text.append(paragraph.text.strip())

    # Combine body text
    full_text = "\n".join(body_text)

    # Generate a pattern for reference citations (e.g., [1], (Author, 2008))
    citation_pattern = r"\[\d+\]|\([^\)]+\d{4}\)"

    # Remove references from the body text
    clean_text = re.sub(citation_pattern, "", full_text)

    return clean_text

# File path to the TEI XML
file_path = "paper_008.pdf.tei.xml"
clean_text = remove_references_from_text(file_path)

# Save the clean text to a file or return it
clean_text[:1000]  # Display first 1000 characters of cleaned text

'The continuous development of optimized lithium-ion batteries (LIBs) is indispensable on the way to sustainable, cost-efficient and long-range electric vehicles (EVs). Besides the quest for high power performance and good safety characteristics, increasing the energy density appears to be a prime goal in the development of next-generation LIBs.\nIn this work, we show that NCM111 and NCM811 cells provide virtually the same energy density at 4.6 and 4.3 V, respectively. Furthermore, we demonstrate that it is very important to analyze both the performance and structural degradation of different Ni content NCM cathodes for a given energy level rather than voltage range in a reference-type LIB. To allow for a reasonable performance assessment, we estimate the energy densities as a function of U max for a variety of NCMs and correlate the results with structural properties of the materials from operando X-ray diffraction.\nLi 1.02 Ni x Mn y Co z O 2 (x + y + z = 1) cathode materials with x:

In [4]:
clean_text

'The continuous development of optimized lithium-ion batteries (LIBs) is indispensable on the way to sustainable, cost-efficient and long-range electric vehicles (EVs). Besides the quest for high power performance and good safety characteristics, increasing the energy density appears to be a prime goal in the development of next-generation LIBs.\nIn this work, we show that NCM111 and NCM811 cells provide virtually the same energy density at 4.6 and 4.3 V, respectively. Furthermore, we demonstrate that it is very important to analyze both the performance and structural degradation of different Ni content NCM cathodes for a given energy level rather than voltage range in a reference-type LIB. To allow for a reasonable performance assessment, we estimate the energy densities as a function of U max for a variety of NCMs and correlate the results with structural properties of the materials from operando X-ray diffraction.\nLi 1.02 Ni x Mn y Co z O 2 (x + y + z = 1) cathode materials with x:

In [5]:
len(clean_text)

7252