In [1]:
import json
from pathlib import Path
from bs4 import BeautifulSoup
import re
import pandas as pd
import xmltodict

import spacy

nlp = spacy.load("fr_core_news_sm")
nlp.max_length = 5000000


original_folder = Path("textes Codif")
output_folder = Path("forTXM")
output_folder.mkdir(exist_ok=True)


def replace_text(e):
    if isinstance(e, str):
        return str_to_xml(e)
        # raise ValueError
    to_update = {}
    for k, v in e.items():
        if isinstance(v, str):
            if k == "#text":
                to_update = str_to_xml(v)
        elif isinstance(v, dict):
            e[k] = replace_text(v)
        elif isinstance(v, list):
            print(v)
            e[k] = [replace_text(x) for x in v]
        else:
            raise ValueError
    if "#text" in e:
        del e["#text"]
    e.update(to_update)
    return e

def str_to_xml(text:str) -> dict:
    doc = nlp(text)
    # new_text = []
    # for token in doc:
    #     new_token = soup.new_tag("w", pos=token.pos_, dep=token.dep_, lemma=token.lemma_, entity=token.ent_type_)
    #     new_token.string = token.text
    #     new_text.append(xmltodict.parse(new_token))
    # return new_text
    return {
        "w": [
            {
                "@pos": token.pos_,
                "@dep": token.dep_,
                "@lemma": token.lemma_,
                "@entity": token.ent_type_,
                "#text": token.text
            }
            for token in doc
        ]
    }


In [2]:
test = original_folder.glob("*.xml").__next__()
print(test)

with test.open("r", encoding="utf-8") as f:
    text = f.read()
    text = re.sub(r"(\s)\1+", r"\g<1>", text).strip()

    with open("test.xml", "w", encoding="utf-8") as g:
        g.write(text)

    soup = BeautifulSoup(text, "xml")

text_d = xmltodict.parse(text, attr_prefix="@")
texte = text_d["TEI"]["text"]

# text_d


textes Codif\BerPet.xml


{'TEI': {'@xmlns': 'http://www.tei-c.org/ns/1.0',
  'teiHeader': {'fileDesc': {'titleStmt': {'title': '"Le Petit Café", par Tristan Bernard.'},
    'editionStmt': {'edition': 'Université Sorbonne Nouvelle',
     'respStmt': [{'name': 'Emma Molinier', 'resp': 'transcription'},
      {'name': 'Emma Molinier', 'resp': 'édition XML-TEI'}]},
    'publicationStmt': {'publisher': 'Université Sorbonne Nouvelle',
     'date': {'@when': '2021'}},
    'sourceDesc': {'p': 'Pièce éditée d\'après "Le Petit Café. Comédie en trois actes", Paris, Librairie théâtrale, artistique\n et littéraire, 1912.'}}},
  'text': {'front': {'div': {'p': 'LE PETIT CAFÉ COMÉDIE EN TROIS ACTES Représentée pour la première fois, le 12 octobre 1911, au\n Théâtre du Palais-Royal.'},
    'castList': {'head': 'PERSONNAGES',
     'castItem': [{'@xml:id': 'philibert', '#text': 'PHILIBERT'},
      {'@xml:id': 'albert', '#text': 'ALBERT'},
      {'@xml:id': 'veauchenu', '#text': 'VEAUCHENU'},
      {'@xml:id': 'bigredon', '#text

In [3]:
replace_text(texte)

[{'@xml:id': 'philibert', '#text': 'PHILIBERT'}, {'@xml:id': 'albert', '#text': 'ALBERT'}, {'@xml:id': 'veauchenu', '#text': 'VEAUCHENU'}, {'@xml:id': 'bigredon', '#text': 'BIGREDON'}, {'@xml:id': 'général', '#text': 'LE GÉNÉRAL DE KERKOADEC'}, {'@xml:id': 'plongeur', '#text': 'LE PLONGEUR'}, {'@xml:id': 'plouvier', '#text': 'PLOUVIER'}, {'@xml:id': 'xavier', '#text': 'XAVIER'}, {'@xml:id': 'journaliste', '#text': 'LE JOURNALISTE'}, {'@xml:id': 'gastonnet', '#text': 'GASTONNET'}, {'@xml:id': 'pézard', '#text': 'PÉZARD'}, {'@xml:id': 'arthur', '#text': 'ARTHUR'}, {'@xml:id': 'facteur', '#text': 'LE FACTEUR'}, {'@xml:id': 'gérant', '#text': 'LE GÉRANT'}, {'@xml:id': 'huissier', '#text': "L'HUISSIER"}, {'@xml:id': 'bouzin', '#text': 'BOUZIN'}, {'@xml:id': 'garçon', '#text': 'UN GARÇON'}, {'@xml:id': 'sommelier', '#text': 'LE SOMMELIER'}, {'@xml:id': 'jabert', '#text': 'JABERT'}, {'@xml:id': 'bérengère', '#text': "BÉRENGÈRE D'AQUITAINE"}, {'@xml:id': 'edwige', '#text': 'EDWIGE'}, {'@xml:id

{'front': {'div': {'p': 'LE PETIT CAFÉ COMÉDIE EN TROIS ACTES Représentée pour la première fois, le 12 octobre 1911, au\n Théâtre du Palais-Royal.'},
  'castList': {'head': 'PERSONNAGES',
   'castItem': [{'@xml:id': 'philibert',
     'w': [{'@pos': 'NOUN',
       '@dep': 'ROOT',
       '@lemma': 'philibert',
       '@entity': 'ORG',
       '#text': 'PHILIBERT'}]},
    {'@xml:id': 'albert',
     'w': [{'@pos': 'NOUN',
       '@dep': 'ROOT',
       '@lemma': 'albert',
       '@entity': 'MISC',
       '#text': 'ALBERT'}]},
    {'@xml:id': 'veauchenu',
     'w': [{'@pos': 'PROPN',
       '@dep': 'ROOT',
       '@lemma': 'VEAUCHENU',
       '@entity': 'MISC',
       '#text': 'VEAUCHENU'}]},
    {'@xml:id': 'bigredon',
     'w': [{'@pos': 'PROPN',
       '@dep': 'ROOT',
       '@lemma': 'BIGREDON',
       '@entity': '',
       '#text': 'BIGREDON'}]},
    {'@xml:id': 'général',
     'w': [{'@pos': 'DET',
       '@dep': 'det',
       '@lemma': 'le',
       '@entity': '',
       '#text': 'LE'},

In [4]:
texte

{'front': {'div': {'p': 'LE PETIT CAFÉ COMÉDIE EN TROIS ACTES Représentée pour la première fois, le 12 octobre 1911, au\n Théâtre du Palais-Royal.'},
  'castList': {'head': 'PERSONNAGES',
   'castItem': [{'@xml:id': 'philibert',
     'w': [{'@pos': 'NOUN',
       '@dep': 'ROOT',
       '@lemma': 'philibert',
       '@entity': 'ORG',
       '#text': 'PHILIBERT'}]},
    {'@xml:id': 'albert',
     'w': [{'@pos': 'NOUN',
       '@dep': 'ROOT',
       '@lemma': 'albert',
       '@entity': 'MISC',
       '#text': 'ALBERT'}]},
    {'@xml:id': 'veauchenu',
     'w': [{'@pos': 'PROPN',
       '@dep': 'ROOT',
       '@lemma': 'VEAUCHENU',
       '@entity': 'MISC',
       '#text': 'VEAUCHENU'}]},
    {'@xml:id': 'bigredon',
     'w': [{'@pos': 'PROPN',
       '@dep': 'ROOT',
       '@lemma': 'BIGREDON',
       '@entity': '',
       '#text': 'BIGREDON'}]},
    {'@xml:id': 'général',
     'w': [{'@pos': 'DET',
       '@dep': 'det',
       '@lemma': 'le',
       '@entity': '',
       '#text': 'LE'},

In [5]:
text_d["TEI"]["text"] = texte

In [6]:
with open("test.json", "w", encoding="utf-8") as f:
    json.dump(text_d, f, ensure_ascii=False, indent=4)

In [7]:
text_d["TEI"]["teiHeader"]["fileDesc"].keys()

dict_keys(['titleStmt', 'editionStmt', 'publicationStmt', 'sourceDesc'])

In [11]:
res = xmltodict.unparse(text_d, pretty=True)

In [12]:
with open("test/test.xml", "w", encoding="utf-8") as f:
    f.write(res)

text = soup.find("text")


for e in text.findAll():
    if e.findAll() != []:
        continue
    if e.text:
        # print(e.name)
        if e.name in {"castList", "castItem", "sp", "speaker", "body"}:
            continue

        # print(e.text)
        name = e.name
        doc = nlp(e.text)
        new_text = []
        for token in doc:
            new_token = soup.new_tag("w", pos=token.pos_, dep=token.dep_, lemma=token.lemma_, entity=token.ent_type_)
            new_token.string = token.text
            new_text.append(new_token)
        e.replaceWith(*new_text)
        # print(e)
        # 1/0





with open(output_folder / test.name, "w", encoding="utf-8") as f:
    f.write(str(soup))


text


e.name


e.findAll()


to_iter = [texte]
while True:
    for e in to_iter:
        for k, v in e.items():
            if isinstance(v, str):
                if k not in {"castList", "castItem", "sp", "speaker", "body"}:
                    doc = nlp(v)
                    new_text = []
                    for token in doc:
                        new_token = soup.new_tag("w", pos=token.pos_, dep=token.dep_, lemma=token.lemma_, entity=token.ent_type_)
                        new_token.string = token.text
                        new_text.append(new_token)
                    e[k] = new_text

        if isinstance(e, dict):
            to_iter.append(e)
        elif isinstance(e, list):
            to_iter.extend(e)
        else:
            print(e)
            continue
        to_iter.remove(e)
    if not to_iter:
        break