**spaCy** is a Python library for Natural Language Processing (NLP). It uses statistical models based on neural networks. These models are pre-trained on large text corpora to perform the following tasks:
- **Tokenization**: split the text into tokens, respecting the language's rules and dictionary.
- Part-of-speech tagging (**POS tagging**): assign a grammatical label to each word using statistical models.
- Dependency analysis (**Parsing**): build a grammatical dependency tree (which word depends on which).
- Entity recognition (**NER**): detect sequences of tokens that correspond to entities (e.g. people, places).
Internally it uses models such as Convolutional Neural Networks (CNN); the `en_core_web_trf` pipeline loaded in this notebook is transformer-based.

Documentation: https://spacy.io/usage/projects/

In [None]:
# Shell escape: print the version of the installed CUDA compiler.
# NOTE(review): presumably used to pick the matching spaCy CUDA extra below — confirm.
!nvcc --version


In [None]:
!pip install --upgrade spacy
!pip install --upgrade spacy[cuda111,transformers]
!pip install jsonlines
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

In [None]:
import re

import spacy
from spacy import displacy
from spacy.tokens import Span
from tqdm.autonotebook import tqdm

In [None]:
# Load the English transformer pipeline; `nlp` is the callable used below to
# process the text.
nlp = spacy.load("en_core_web_trf")

In [None]:
# Read the whole source text into memory.  The encoding is stated explicitly so
# the result does not depend on the platform default; the annotated output
# further down is written as UTF-8 as well.
# NOTE(review): assumes the input file is UTF-8 encoded — confirm.
with open("txt/portrait_of_a_Period.txt", "r", encoding="utf-8") as f:
    articles = f.read()

# Number of characters read, as a quick sanity check.
print(len(articles))

This function extracts all PERSON entities from the document and corrects any names that end in possessive form (e.g., 's) by removing the final part. This ensures that names like 'Stefan Zweig' and 'Stefan Zweig’s' are treated as the same entity. Additionally, the function filters results to include only those names that begin with an uppercase letter, reducing noise from incorrect or generic matches.

In [None]:
def filter_person(doc):
    """Return the PERSON entities of *doc* that look like clean proper names.

    An entity is kept only if, once a trailing possessive marker (``'s`` or
    the typographic ``’s``) is set aside, its text starts with an uppercase
    letter and contains nothing but ASCII letters and whitespace.  Possessive
    entities are returned as a new span that stops before the last token, so
    "Stefan Zweig's" and "Stefan Zweig" yield the same name.

    Args:
        doc: A processed spaCy ``Doc``; its ``ents`` must expose ``label_``,
            ``text``, ``start`` and ``end``.

    Returns:
        A list of PERSON spans in the order they appear in ``doc.ents``.
    """
    possessive_suffixes = ("'s", "\u2019s")
    # Anything that is not an ASCII letter or whitespace (digits, punctuation,
    # accented letters, ...) disqualifies the entity.
    invalid_chars = re.compile(r"[^a-zA-Z\s]")
    filtered_spans = []

    for ent in doc.ents:
        if ent.label_ != "PERSON":
            continue

        ent_text = ent.text.strip()

        # Detect the possessive BEFORE validating characters: the apostrophe
        # itself would otherwise fail the invalid-character check and the
        # entity would be discarded instead of trimmed.
        is_possessive = ent_text.endswith(possessive_suffixes)
        name_text = ent_text[:-2].rstrip() if is_possessive else ent_text

        first_alpha = next((c for c in name_text if c.isalpha()), None)
        if not first_alpha or not first_alpha.isupper():
            continue

        if invalid_chars.search(name_text):
            continue

        if not is_possessive:
            filtered_spans.append(ent)
            continue

        # Trimming works at token granularity and assumes the tokenizer emitted
        # the possessive marker as its own final token; a one-token entity
        # cannot be trimmed that way, so it is skipped.
        if ent.end - ent.start < 2:
            continue
        filtered_spans.append(
            Span(doc, ent.start, ent.end - 1, label=ent.label_)
        )

    return filtered_spans

In [None]:
def match_full_and_partial_names(name_dict):
    """Pair each entry with the entries whose name is one of its words.

    Args:
        name_dict: Mapping from an identifier to a person name.

    Returns:
        A list of ``(full_id, partial_id)`` tuples: ``full_id`` owns the
        longer name and ``partial_id`` owns a name equal to one of its
        whitespace-separated words.  Pairs follow the mapping's iteration
        order (outer entry first, then inner entry).
    """
    entries = [
        (entry_id, entry_name, entry_name.split())
        for entry_id, entry_name in name_dict.items()
    ]
    return [
        (full_id, partial_id)
        for full_id, _, full_words in entries
        for partial_id, partial_name, _ in entries
        # e.g. "Zweig" is one of the words of "Stefan Zweig"
        if full_id != partial_id and partial_name in full_words
    ]


In [None]:
# Run the loaded pipeline over the entire text in a single call.
doc = nlp(articles)

In [None]:
# Keep only the PERSON entities accepted by filter_person.
filtered_names = filter_person(doc)

In [None]:
# Overwrite the document's entities with the filtered spans, so the entity
# rendering and the name list below only see those.
doc.ents = filtered_names

The next cell displays the dependency graph: each word is linked to another according to the grammatical structure (e.g. subject, object, main verb), with arrows indicating the direction of each dependency.

In [None]:
# Render the dependency parse of the whole document inline in the notebook;
# 'distance' is passed through to displaCy as a layout option.
displacy.render(doc, style="dep", jupyter=True, options={'distance': 140})

In [None]:
# Render the document inline with its (filtered) entities highlighted.
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Distinct PERSON entity texts found in the document, sorted.
persons = sorted({ent.text for ent in doc.ents if ent.label_ == "PERSON"})

In [None]:
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom import minidom
from xml.dom.minidom import Document

In [None]:
def generate_id(name):
    """Build a short uppercase identifier from a person name.

    A name with two or more whitespace-separated words gives the initials of
    its first two words; a single-word name gives its first three characters
    (fewer if the word is shorter).

    Args:
        name: The person name; surrounding whitespace is ignored.

    Returns:
        The uppercase identifier string.
    """
    words = name.strip().split()
    if len(words) < 2:
        return words[0][:3].upper()
    first, second = words[0], words[1]
    return f"{first[0]}{second[0]}".upper()

In [None]:
# Map every person name to a short identifier.  generate_id can return the
# same value for different names (e.g. shared initials), so a numeric suffix
# is appended until the identifier is unused.
person_ids = {}
used_ids = set()

for person in persons:
    base_id = generate_id(person)
    candidate = base_id
    suffix = 2
    while candidate in used_ids:
        candidate = f"{base_id}{suffix}"
        suffix += 1
    person_ids[person] = candidate
    used_ids.add(candidate)

In [None]:
def annotate_text(text, names):
    """Wrap every whole-word occurrence of the given names in a ``<name>`` tag.

    Works in two passes so that a short name (e.g. a surname) is not matched
    again inside a longer name that was already handled: names are first
    swapped, longest first, for placeholders, and only afterwards are the
    placeholders expanded into ``<name type="person">...</name>`` markup.

    Args:
        text: The text to annotate.
        names: Iterable of name strings to tag.

    Returns:
        The annotated text.  Neither the text nor the names are XML-escaped.
    """
    longest_first = sorted(names, key=len, reverse=True)
    tag_for_placeholder = {}
    result = text

    # Pass 1: hide each name behind a unique placeholder.
    for index, person_name in enumerate(longest_first):
        token = f"__PERSON_{index}__"
        tag_for_placeholder[token] = f'<name type="person">{person_name}</name>'
        # The lookarounds keep a match from starting or ending mid-word.
        whole_name = rf'(?<!\w){re.escape(person_name)}(?!\w)'
        result = re.sub(whole_name, token, result)

    # Pass 2: expand the placeholders into the final markup.
    for token, markup in tag_for_placeholder.items():
        result = result.replace(token, markup)

    return result

In [None]:
# Tag every collected person name (the keys of person_ids) in the source text.
annotated_text = annotate_text(articles, person_ids.keys())

In [None]:
# Show the annotated text in the notebook output.
print(annotated_text)

In [None]:
# Save the annotated text as UTF-8.  Only the tagged text itself is written;
# no XML declaration or root element is added here.
with open("annotated_txt.xml", "w", encoding="utf-8") as out_file:
    out_file.write(annotated_text)

In [None]:
# Identifiers handed out so far, and the identifier assigned to each name.
used_ids = set()
_ids_by_name = {}


def unique_id(name):
    """Return an identifier for *name* that no other name has received.

    The identifier is built from the first three letters of the name
    (whitespace and other non-letters are skipped, so it never contains a
    space), uppercased.  If that value is already taken by another name, a
    numeric suffix (2, 3, ...) is appended.  Asking again for the same name
    returns the identifier it was given the first time, so re-running a cell
    that calls this function does not change the ids.

    Args:
        name: The person name.

    Returns:
        The identifier string; it is also recorded in ``used_ids``.

    Raises:
        ValueError: If *name* contains no letters at all.
    """
    if name in _ids_by_name:
        return _ids_by_name[name]

    base = "".join(c for c in name if c.isalpha())[:3].upper()
    if not base:
        raise ValueError(f"cannot derive an identifier from {name!r}")

    candidate = base
    suffix = 2
    while candidate in used_ids:
        candidate = f"{base}{suffix}"
        suffix += 1

    used_ids.add(candidate)
    _ids_by_name[name] = candidate
    return candidate

In [None]:
# Filter first names / surnames that duplicate part of a full name
def filter_partial_names(person_list):
    """Drop names that are just one word of a longer name in the same list.

    Args:
        person_list: Iterable of name strings; each is stripped of
            surrounding whitespace before comparison.

    Returns:
        The stripped names, in their original order, minus every name that
        equals one whitespace-separated word of a different, multi-word name
        (e.g. "Zweig" is dropped when "Stefan Zweig" is present).
    """
    cleaned = [entry.strip() for entry in person_list]
    words_by_name = [(candidate, candidate.split()) for candidate in cleaned]

    def is_partial(name):
        return any(
            name != longer and len(words) > 1 and name in words
            for longer, words in words_by_name
        )

    return [name for name in cleaned if not is_partial(name)]

In [None]:
# Build a minidom document whose <list> root holds one <item> per person.
doc_xml = Document()
list_elem = doc_xml.createElement("list")
# Drop names that are only one word of a longer collected name.
persons = filter_partial_names(persons)

for person in sorted(persons):

    # Each <item> carries the identifier returned by unique_id as its xml:id.
    item = doc_xml.createElement("item")
    xml_id = unique_id(person)
    item.setAttribute("xml:id", xml_id)

    # <name type="person"> wraps the person's name as text.
    name_elem = doc_xml.createElement("name")
    name_elem.setAttribute("type", "person")
    name_text = doc_xml.createTextNode(person)
    name_elem.appendChild(name_text)

    item.appendChild(name_elem)
    list_elem.appendChild(item)

# Attach the populated <list> as the document's root element.
doc_xml.appendChild(list_elem)

In [None]:
# Rebuild doc_xml and list_elem from scratch: a <list> with one <item> per
# person, each holding a <name type="person"> element.
# NOTE(review): this repeats the loop of the preceding cell (minus the
# filter_partial_names call) and calls unique_id a second time for every
# person — confirm whether both cells are needed.
doc_xml = Document()
list_elem = doc_xml.createElement("list")

for person in sorted(persons):
    item = doc_xml.createElement("item")
    xml_id = unique_id(person)
    item.setAttribute("xml:id", xml_id)

    name_elem = doc_xml.createElement("name")
    name_elem.setAttribute("type", "person")
    name_text = doc_xml.createTextNode(person)
    name_elem.appendChild(name_text)

    item.appendChild(name_elem)
    list_elem.appendChild(item)

doc_xml.appendChild(list_elem)

In [None]:
# Build an empty ElementTree skeleton: TEI > teiHeader, and
# TEI > text > back > listPerson.
# NOTE(review): nothing is added to listPerson and `tei` is not serialised in
# the cells visible here — confirm whether the person list should go into it.
tei = Element('TEI')
teiHeader = SubElement(tei, 'teiHeader')
text_elem = SubElement(tei, 'text')
back = SubElement(text_elem, 'back')
listPerson = SubElement(back, 'listPerson')

In [None]:
# Pretty-print the minidom <list> document with two-space indentation.
print(doc_xml.toprettyxml(indent="  "))