In [76]:
!pip install langdetect



In [77]:
!python -m spacy download de_core_news_md
!python -m spacy download en_core_web_trf

Collecting de-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.8.0/de_core_news_md-3.8.0-py3-none-any.whl (44.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and install

In [78]:
# -*- coding: utf-8 -*-

# Named Entity Recognition and Annotation Pipeline for the Hannah Arendt Digital Edition
# This script performs the following tasks:
# 1. Detects the language of a philosophical text
# 2. Applies spaCy NER to extract 'person' entities (normalized to label 'PER')
# 3. Filters these entities using an authoritative external API
# 4. Annotates the original text with XML <name> tags
# 5. Generates a TEI-compatible <listPerson> for indexing recognized individuals

import re
import requests
from langdetect import detect
from tqdm.autonotebook import tqdm
import spacy
from spacy.tokens import Doc, Span
from spacy import displacy
from xml.dom.minidom import Document
from xml.etree.ElementTree import Element, SubElement

In [79]:

# -- 1. TEXT PROCESSING & NAMED ENTITY EXTRACTION --
# This function loads the appropriate spaCy language model (German or English),
# detects named entities of type PERSON, converts their labels to a unified 'PER' type,
# and returns a new processed document with those entities highlighted.
def process_and_render_person_entities(text):
    lang = detect(text)[:2]  # e.g., 'de' or 'en'

    if lang == "de":
        nlp = spacy.load("de_core_news_md")
    else:
        nlp = spacy.load("en_core_web_trf")

    doc = nlp(text)

    tokens = [token.text for token in doc]
    person_ents = []
    for ent in doc.ents:
        if ent.label_ in ("PER", "PERSON"):
            new_ent = Span(doc, ent.start, ent.end, label=doc.vocab.strings["PER"])
            person_ents.append(new_ent)

    new_doc = Doc(doc.vocab, words=tokens)
    new_doc.set_ents(person_ents, default="unmodified")

    try:
        displacy.render(new_doc, style="ent", jupyter=True)
    except:
        from IPython.core.display import display, HTML
        display(HTML(displacy.render(new_doc, style="ent")))

    return new_doc, lang

In [80]:

# -- 2. API QUERYING & ENTITY VALIDATION --
# These functions retrieve a list of valid historical person names from the
# Hannah Arendt Digital Edition API and expand them into partial tokens to support partial matches.
def fetch_person_labels(api_url):
    resp = requests.get(api_url)
    data = resp.json()
    valid_names = {"en": set(), "de": set()}
    for person in data.get("persons", {}).get("person", []):
        for l in ["en", "de"]:
            name = person.get("label", {}).get("reg", {}).get(l)
            if name:
                valid_names[l].add(name)
    return valid_names

def expand_valid_names(valid_names):
    # Break full names into individual components (tokens) to enable partial matching
    expanded = {"en": set(), "de": set()}
    for lang_code, names_set in valid_names.items():
        for full_name in names_set:
            expanded[lang_code].add(full_name)
            tokens = full_name.replace(",", "").split()
            expanded[lang_code].update(tokens)
    return expanded

In [81]:
# -- 3. ENTITY FILTERING --
# This function filters detected entities by checking if they are plausible (capitalized, no punctuation)
# and whether any token matches entries in the API-sourced valid names set.
def filter_persons_by_api(doc, valid_names, lang):
    filtered_spans = []
    valid_set = valid_names.get(lang, set())
    pattern = re.compile(r"[^a-zA-ZäöüÄÖÜß\s]" if lang == "de" else r"[^a-zA-Z\s]")

    for ent in doc.ents:
        if ent.label_ != "PER":
            continue

        text = ent.text.strip()
        if text.endswith("'s"):
            text = text[:-2].strip()

        if not text or not text[0].isupper():
            continue

        if pattern.search(text):
            continue

        tokens = text.replace(",", "").split()
        if any(t in valid_set for t in tokens):
            filtered_spans.append(ent)

    return filtered_spans

In [82]:
# -- 4. TEXTUAL ANNOTATION --
# Annotate the original text by wrapping detected person names in <name type="person"> tags.
def annotate_text(text, names):
    annotated = text
    placeholder_map = {}

    for i, name in enumerate(sorted(names, key=len, reverse=True)):
        pattern = re.escape(name)
        placeholder = f"__PERSON_{i}__"
        placeholder_map[placeholder] = f'<name type="person">{name}</name>'
        annotated = re.sub(rf'(?<!\w){pattern}(?!\w)', placeholder, annotated)

    for placeholder, tag in placeholder_map.items():
        annotated = annotated.replace(placeholder, tag)

    return annotated

In [83]:
# -- 5. POST-PROCESSING OF NAME LIST --
# Remove name fragments if they are substrings of longer names (e.g., 'Marx' in 'Karl Marx').
# Also deduplicate entries in a case-insensitive manner.
def filter_partial_names(person_list):
    partials_to_remove = set()
    for name in person_list:
        for other in person_list:
            if name != other and name in other.split() and len(other.split()) > 1:
                partials_to_remove.add(name)
                break
    return [name for name in person_list if name not in partials_to_remove]

def deduplicate_names(names):
    seen = set()
    unique_names = []
    for name in names:
        key = name.lower()
        if key not in seen:
            seen.add(key)
            unique_names.append(name)
    return unique_names


In [84]:

# -- 6. GENERATE TEI LISTPERSON XML --
# Create a listPerson TEI structure for storing normalized person references.
used_ids = set()
def unique_id(name):
    base = ''.join([c for c in name if c.isalpha()])[:5].upper()
    i = 1
    candidate = base
    while candidate in used_ids:
        candidate = f"{base}{i}"
        i += 1
    used_ids.add(candidate)
    return candidate

def create_person_list_xml(persons):
    doc_xml = Document()
    list_elem = doc_xml.createElement("list")
    persons = filter_partial_names(persons)
    persons = deduplicate_names(persons)

    for person in sorted(persons):
        item = doc_xml.createElement("item")
        xml_id = unique_id(person)
        item.setAttribute("xml:id", xml_id)

        name_elem = doc_xml.createElement("name")
        name_elem.setAttribute("type", "person")
        name_text = doc_xml.createTextNode(person)
        name_elem.appendChild(name_text)

        item.appendChild(name_elem)
        list_elem.appendChild(item)

    doc_xml.appendChild(list_elem)
    return doc_xml.toprettyxml(indent="  ")

In [85]:
# -- 7. MAIN EXECUTION LOGIC --
# Coordinates the full pipeline: loading input, applying NER, validating against API,
# generating annotated text and person list output.
if __name__ == "__main__":
    with open("txt/Judenfrage.txt", "r", encoding="utf-8") as f:
        articles = f.read()

    doc, lang = process_and_render_person_entities(articles)

    valid_names = fetch_person_labels("https://hannah-arendt-edition.net/~api/index/persons/")
    expanded_valid_names = expand_valid_names(valid_names)

    filtered_ents = filter_persons_by_api(doc, expanded_valid_names, lang)
    doc.ents = filtered_ents

    filtered_names = [ent.text.strip() for ent in filtered_ents]
    print(f"\n✅ Filtered ({lang}): {len(filtered_names)} names\n", filtered_names[:20])



✅ Filtered (de): 68 names
 ['Lessing', 'Lessing', 'Saladin', 'Nathan', 'Lessing', 'Lessing', 'Lessing', 'Mendelssohn', 'Mendelssohn', 'Lessing', 'Mendelssohn', 'Lessing', 'Mendelssohn', 'Mendelssohn', 'Mendelssohn', 'Mendelssohn', 'Lessing', 'Mendelssohn', 'Dohm', 'Mendelssohn']


In [90]:
# Filter PERSON entities in the doc using valid names from the API
# Get all unique entity labels present in the doc
# Set displacy options to highlight entities in yellow
filtered_ents = filter_persons_by_api(doc, expanded_valid_names, lang)
doc.ents = filtered_ents

labels = list(set([ent.label_ for ent in doc.ents]))

options = {
    "ents": labels,
    "colors": {label: "yellow" for label in labels}
}

displacy.render(doc, style="ent", options=options, jupyter=True)



In [91]:
# Annotate the original text by wrapping filtered person names with XML <name> tags
# Generate a TEI-compatible XML list of persons from the filtered names

annotated_text = annotate_text(articles, filtered_names)
with open("annotated_txt.xml", "w", encoding="utf-8") as out:
    out.write(annotated_text)
xml_output = create_person_list_xml(filtered_names)
print(xml_output)

<?xml version="1.0" ?>
<list>
  <item xml:id="DAVID1">
    <name type="person">David Friedländer</name>
  </item>
  <item xml:id="DOHM1">
    <name type="person">Dohm</name>
  </item>
  <item xml:id="HERDE1">
    <name type="person">Herder</name>
  </item>
  <item xml:id="LAVAT1">
    <name type="person">Lavater</name>
  </item>
  <item xml:id="LESSI1">
    <name type="person">Lessing</name>
  </item>
  <item xml:id="MENDE1">
    <name type="person">Mendelssohn</name>
  </item>
  <item xml:id="MIRAB1">
    <name type="person">Mirabeau</name>
  </item>
  <item xml:id="NATHA1">
    <name type="person">Nathan</name>
  </item>
  <item xml:id="SALAD1">
    <name type="person">Saladin</name>
  </item>
  <item xml:id="SCHLE1">
    <name type="person">Schleiermacher</name>
  </item>
  <item xml:id="SOKRA1">
    <name type="person">Sokrates</name>
  </item>
  <item xml:id="TELLE1">
    <name type="person">Teller</name>
  </item>
</list>



In [88]:
# -- 7. Function to highlight occurrences of person names in the given text  --
# It uses regular expressions to find full-word matches of the names
# and replaces them with an XML-tagged version (<name type="person">...</name>).
# The entire XML markup is colored orange for console output using ANSI codes.
import re

COLOR_ORANGE = "\033[33m"
COLOR_RESET = "\033[0m"

def annotate_text(text, names):
    names = sorted(set(names), key=len, reverse=True)
    pattern = re.compile(r'\b(' + '|'.join(map(re.escape, names)) + r')\b')

    def replacer(match):
        tagged = f'<name type="person">{match.group(0)}</name>'
        # Evidenzia tutta la marcatura XML in arancione:
        return COLOR_ORANGE + tagged + COLOR_RESET

    annotated_text = pattern.sub(replacer, text)
    return annotated_text

annotated = annotate_text(articles, filtered_names)
print(annotated)


Aufklärung und Judenfrage




Die moderne Judenfrage datiert aus der Aufklärung; die Aufklärung, d. h. die nichtjüdische Welt hat sie gestellt. Ihre Formulierungen und ihre Antworten haben das Verhalten der Juden, haben die Assimilation der Juden bestimmt. Seit Mendelssohns wirklicher Assimiliertheit und seit Dohms Werk »Über die bürgerliche Verbesserung der Juden« (1781) tauchen in der Diskussion über die Emanzipation immer wieder die gleichen Argumente auf, die in [33m<name type="person">Lessing</name>[0m ihren Kronzeugen haben. Ihm verdankt sie die Propagierung von Menschlichkeit und Toleranz wie die Trennung von Vernunft- und Geschichtswahrheiten. Diese Trennung ist deshalb so überaus wichtig, weil sie die innerhalb der Geschichte zufällige Assimilation legitimieren kann; sie braucht dann nur als fortschreitende Einsicht in die Wahrheit, nicht als Angleichung und Rezeption einer bestimmten Kultur in einem bestimmten und damit zufälligen Geschichtsstadium zu erscheinen.
   Für [3

In [89]:
# -- 8. Script to query a remote API returning person data --
# and generate a TEI-like XML document describing each person
# with attributes, labels, biographical references, and authority IDs (GND).
# The XML document is printed to the console and saved to a file.

import requests
from xml.dom.minidom import Document

def create_person_xml(doc_xml, person):
    person_elem = doc_xml.createElement("person")

    label = person.get("label", {}).get("generic", "UNKNOWN")
    person_id = person.get("id", "UNKNOWN_ID")
    status = person.get("status", "unknown")
    # Ensure vols is a string
    vol = str(person.get("vols", "unknown"))
    role = person.get("role", "default")

    person_elem.setAttribute("ae:label", label)
    person_elem.setAttribute("ae:author", "MCT")  # personalizza se serve
    person_elem.setAttribute("xml:id", person_id)
    person_elem.setAttribute("ae:vol", vol)
    person_elem.setAttribute("role", role)

    label_reg_elem = doc_xml.createElement("ae-form:label")
    label_reg_elem.setAttribute("type", "name-reg")
    label_reg_elem.appendChild(doc_xml.createTextNode("Name (regular):"))
    person_elem.appendChild(label_reg_elem)

    persName_elem = doc_xml.createElement("persName")
    persName_elem.setAttribute("ae:label", label)
    persName_elem.setAttribute("type", "reg")
    persName_elem.appendChild(doc_xml.createTextNode(label))
    person_elem.appendChild(persName_elem)

    btn_persName = doc_xml.createElement("ae-form:button")
    btn_persName.setAttribute("type", "persName-reg_add")
    person_elem.appendChild(btn_persName)

    label_biodata = doc_xml.createElement("ae-form:label")
    label_biodata.setAttribute("type", "biodata")
    label_biodata.appendChild(doc_xml.createTextNode("Biographical data:"))
    person_elem.appendChild(label_biodata)

    btn_biodata = doc_xml.createElement("ae-form:button")
    btn_biodata.setAttribute("type", "biodata_add")
    person_elem.appendChild(btn_biodata)

    label_idno = doc_xml.createElement("ae-form:label")
    label_idno.setAttribute("type", "idno")
    label_idno.appendChild(doc_xml.createTextNode("Authority references:"))
    person_elem.appendChild(label_idno)

    authority = person.get("authority", {})

    gnd_url = authority.get("gnd")
    if gnd_url:
        idno_gnd = doc_xml.createElement("idno")
        idno_gnd.setAttribute("type", "gnd")
        idno_gnd.appendChild(doc_xml.createTextNode(gnd_url))
        person_elem.appendChild(idno_gnd)

    viaf_url = authority.get("viaf")
    if viaf_url:
        idno_viaf = doc_xml.createElement("idno")
        idno_viaf.setAttribute("type", "viaf")
        idno_viaf.appendChild(doc_xml.createTextNode(viaf_url))
        person_elem.appendChild(idno_viaf)

    lccn_url = authority.get("lccn")
    if lccn_url:
        idno_lccn = doc_xml.createElement("idno")
        idno_lccn.setAttribute("type", "lccn")
        idno_lccn.appendChild(doc_xml.createTextNode(lccn_url))
        person_elem.appendChild(idno_lccn)

    btn_idno = doc_xml.createElement("ae-form:button")
    btn_idno.setAttribute("type", "idno_add")
    person_elem.appendChild(btn_idno)

    status_elem = doc_xml.createElement("ae-index:status")
    status_elem.setAttribute("type", status)
    person_elem.appendChild(status_elem)

    return person_elem


def main():
    url = "https://hannah-arendt-edition.net/~api/index/persons/"
    response = requests.get(url)
    response.raise_for_status()

    data = response.json()
    persons = data.get("persons", {}).get("person", [])

    doc_xml = Document()
    root = doc_xml.createElement("persons")
    doc_xml.appendChild(root)

    for person in persons:
        person_elem = create_person_xml(doc_xml, person)
        root.appendChild(person_elem)

    xml_str = doc_xml.toprettyxml(indent="  ")
    print(xml_str)

    # Se vuoi salvare in file:
    with open("persons.xml", "w", encoding="utf-8") as f:
        f.write(xml_str)


if __name__ == "__main__":
    main()

<?xml version="1.0" ?>
<persons>
  <person ae:label="Hensel, Wilhelm" ae:author="MCT" xml:id="ae6668153" ae:vol="02" role="default">
    <ae-form:label type="name-reg">Name (regular):</ae-form:label>
    <persName ae:label="Hensel, Wilhelm" type="reg">Hensel, Wilhelm</persName>
    <ae-form:button type="persName-reg_add"/>
    <ae-form:label type="biodata">Biographical data:</ae-form:label>
    <ae-form:button type="biodata_add"/>
    <ae-form:label type="idno">Authority references:</ae-form:label>
    <idno type="gnd">https://d-nb.info/gnd/118549316</idno>
    <ae-form:button type="idno_add"/>
    <ae-index:status type="candidate_ed"/>
  </person>
  <person ae:label="Ferrone, John" ae:author="MCT" xml:id="ae0708480" ae:vol="02" role="default">
    <ae-form:label type="name-reg">Name (regular):</ae-form:label>
    <persName ae:label="Ferrone, John" type="reg">Ferrone, John</persName>
    <ae-form:button type="persName-reg_add"/>
    <ae-form:label type="biodata">Biographical data:</ae-