#  Turn span classification output into a TXM compliant format


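This notebook runs a single *Encyclopédie* article through spaCy (POS tagging and lemmatisation), the GEODE span-classification (NER) model and the GEODE article classifiers listed below, then merges the results into one TXM-compliant XML-TEI document.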

The NER model is available on HuggingFace: 
https://huggingface.co/GEODE/camembert-base-edda-span-classification


Domains (Géographie, Histoire, etc.):
https://huggingface.co/GEODE/bert-base-multilingual-cased-edda-domain-classification

Geography Entry Classification (Place, Person, Misc):
https://huggingface.co/GEODE/bert-base-multilingual-cased-geography-entry-classification

Place Entry Classification (Single/Multiple):
https://huggingface.co/GEODE/bert-base-multilingual-cased-single-multiple-place-classification

Place/Single Entry Classification (Ville, Rivière, Pays, etc.):
https://huggingface.co/GEODE/bert-base-multilingual-cased-place-entry-classification


In [1]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import spacy
from utils import get_teicorpus_header, spacy_to_xml, merge_annotations, get_teicorpus_footer
from lxml import etree

# Pick the compute device once for every pipeline: Apple MPS first, then CUDA,
# and the CPU when neither backend reports itself as available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [9]:
def run(content, meta, pipe_spacy, pipe_ner, pipe_domain, pipe_geo, pipe_place, pipe_single_place, teiCorpus=False):
    """Classify and annotate one article, then merge everything into an XML tree.

    Progress is reported on stdout with one line per stage.

    Args:
        content: Text of the article; it is handed unchanged to every pipeline.
        meta: Mapping describing the article. It is UPDATED IN PLACE:
            ``'domains'`` is always overwritten with the predicted domain
            wrapped in colons, ``'geoType'`` is set when that domain is
            ``":Géographie:"``, and ``'placeType'`` is set when the geography
            type is ``"Place"``. ``'book'`` and ``'tome'`` are read when
            ``teiCorpus`` is true. The whole mapping is passed to
            ``spacy_to_xml``.
        pipe_spacy: Callable producing the spaCy document for ``content``.
        pipe_ner: Callable producing the span annotations for ``content``.
        pipe_domain: Classifier whose first result's ``'label'`` is the domain.
        pipe_geo: Classifier whose first result's ``'label'`` is the geography type.
        pipe_place: Accepted for interface compatibility but currently unused
            (the single/multiple place check is disabled).
        pipe_single_place: Classifier whose first result's ``'label'`` is the
            place type.
        teiCorpus: When true, the document is wrapped between
            ``get_teicorpus_header(...)`` and ``get_teicorpus_footer()``.

    Returns:
        The value returned by ``merge_annotations`` for the parsed XML and the
        NER annotations (presumably an lxml element -- confirm in ``utils``).
    """
    # The corpus header is produced first so that helper calls happen in a
    # fixed order: header, classifiers, spaCy, NER, serialisation, footer.
    if not teiCorpus:
        print(" -> no TEI corpus header")
        prefix = ""  # no XML declaration is emitted for a standalone article
    else:
        print(" -> TEI corpus header")
        corpus_info = {'book': meta['book'], 'tome': meta['tome']}
        prefix = get_teicorpus_header(corpus_info)

    print(" -> domain classification")
    domain_label = pipe_domain(content)[0]['label']
    meta['domains'] = ":" + domain_label + ":"

    # The finer-grained classifiers only apply to geography articles.
    is_geography = meta['domains'] == ":Géographie:"
    if is_geography:
        print(" -> geography article classification")
        geo_prediction = pipe_geo(content)
        meta['geoType'] = geo_prediction[0]['label']

    if is_geography and meta['geoType'] == "Place":
        print(" -> place classification")
        # Every place entry gets a place type; pipe_place is not consulted.
        place_prediction = pipe_single_place(content)
        meta['placeType'] = place_prediction[0]['label']

    print(" -> spaCy: POS tagging and lemmatisation")
    spacy_doc = pipe_spacy(content)

    print(" -> BERT NER")
    ner_spans = pipe_ner(content)

    print(" -> to XML-TEI (TXM compliant)")
    tei_markup = prefix + spacy_to_xml(spacy_doc, meta)

    if teiCorpus:
        tei_markup = tei_markup + get_teicorpus_footer()

    print(" -> Merging annotations")
    parsed_tree = etree.fromstring(tei_markup)
    return merge_annotations(parsed_tree, ner_spans)

In [14]:
def _text_classifier(repo_id):
    """Build a truncating text-classification pipeline from the model and tokenizer stored under `repo_id`."""
    model = AutoModelForSequenceClassification.from_pretrained(repo_id)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    return pipeline("text-classification", model=model, tokenizer=tokenizer, truncation=True, device=device)


# spaCy pipeline and the span-classification (NER) model.
pipe_spacy = spacy.load("fr_dep_news_trf")
pipe_ner = pipeline(
    "token-classification",
    model="GEODE/camembert-base-edda-span-classification",
    aggregation_strategy="simple",
    device=device,
)

# Article-level classifiers, from coarsest to finest: domain, geography entry
# type, single/multiple place, and place type.
pipe_domain = _text_classifier("GEODE/bert-base-multilingual-cased-edda-domain-classification")
pipe_geo = _text_classifier("GEODE/bert-base-multilingual-cased-geography-entry-classification")
pipe_place = _text_classifier("GEODE/bert-base-multilingual-cased-single-multiple-place-classification")
pipe_single_place = _text_classifier("GEODE/bert-base-multilingual-cased-place-entry-classification")


# Sample article and its metadata. The three classification fields start as the
# string 'None'; run() overwrites the ones it predicts.
content = "* ALBI, (Géog.) ville de France, capitale de l'Albigeois, dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44."
meta = dict(
    head='ALBI 2',
    author=':Diderot:',
    uid='EDdA_2_1567',
    book='EDdA',
    tome='2',
    domains='None',
    geoType='None',
    placeType='None',
)

xml = run(content, meta, pipe_spacy, pipe_ner, pipe_domain, pipe_geo, pipe_place, pipe_single_place)


 -> no TEI corpus header
 -> domain classification
 -> geography article classification
 -> place classification
 -> spaCy: POS tagging and lemmatisation
 -> BERT NER
 -> to XML-TEI (TXM compliant)
 -> Merging annotations


In [15]:
# Serialise the annotated tree as an indented unicode string and display it;
# the string is kept in `pretty_print` so that it can be written to disk later.
pretty_print = etree.tostring(
    xml,
    encoding='unicode',
    pretty_print=True,
)
print(pretty_print)

<TEI>
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>ALBI 2</title>
      </titleStmt>
      <publicationStmt>
        <p>Annotated with Spacy (fr_dep_news_trf) and https://huggingface.co/GEODE/camembert-base-edda-span-classification by project GEODE</p>
      </publicationStmt>
      <sourceDesc>
        <bibl>
          <author>Diderot</author>
        </bibl>
      </sourceDesc>
    </fileDesc>
  </teiHeader>
  <text author=":Diderot:" uid="EDdA_2_1567" book="EDdA" tome="2" domains=":Géographie:" geoType="Place" placeType="Ville">
    <body>
      <s>
        <w lemma="*" pos="PUNCT" start="0" end="1">*</w>
        <w lemma="ALBI" pos="PROPN" start="2" end="6" type="S-Head">ALBI</w>
        <w lemma="," pos="PUNCT" start="6" end="7">,</w>
        <w lemma="(" pos="PUNCT" start="8" end="9" type="B-Domain_mark">(</w>
        <w lemma="Géog" pos="PROPN" start="9" end="13" type="I-Domain_mark">Géog</w>
        <w lemma="." pos="PUNCT" start="13" end="14" type="E-Domain_ma

In [17]:
# Save the pretty-printed TEI to "<uid>.xml" in the current working directory.
# The encoding is pinned to UTF-8: the text contains accented characters, and
# without an explicit encoding open() falls back to the platform default, which
# is not UTF-8 on every system.
with open(f"{meta['uid']}.xml", "w", encoding="utf-8") as f:
    f.write(pretty_print)

In [None]:
# TODO: decide between a single teiCorpus file and one TEI file per article.

