## Connect to Google Drive

In [1]:
# Mount Google Drive at /content/drive (Colab-only API).
# NOTE(review): the input/output paths set further down point at /content/,
# not at /content/drive -- confirm whether the mounted drive is actually needed.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Install spaCy models and StandoffConverter

In [64]:
!python -m spacy download en_core_web_trf de_core_news_lg > /dev/null
!pip install -q standoffconverter
!pip install -q spacy-transformers



# Tutorial Start

## Import libraries

In [4]:
from lxml import etree
from standoffconverter import Standoff, View

import spacy
import spacy_transformers
# Language code -> name of the spaCy pipeline package used for NER.
ner_dict = dict(
    en='en_core_web_trf',
    de='de_core_news_lg',
)
import pandas as pd

## Set input and output file paths

In [68]:
# Both paths are derived from a single document id so that the input file and
# its annotated output cannot drift apart when another document is processed.
doc_id = 'IL-MTFN-001-G-F-0353-18'
xml_path = f'/content/{doc_id}.xml'                      # XML-TEI input file
so_ner_result = f'/content/{doc_id}_so_ner_results.xml'  # output file with inline NER tags

## Load XML-TEI and parse it with Standoff

In [69]:
# TEI namespace mapping handed to Standoff together with the parsed tree.
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Parse with lxml's default parser (no custom XMLParser is passed), then wrap
# the tree in a Standoff object that exposes the text via `so.plain`.
tree = etree.parse(xml_path)
so = Standoff(tree, namespaces)

In [70]:
# Preview the first 1000 characters of the plain text held by the Standoff
# object; the tabs and newlines of the source file are still present here.
so.plain[:1000]

'\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\tDuveen Brothers\n\t\t\t\t\t720 Fith Avenue\n\t\t\t\t\tNew York\n\t\t\t\t\tJanuary 5th, 1928\n\t\t\t\t\tMy dear Kr. Miller:\n\t\t\t\t\tThe bearer of this letter is my friend,\n\t\t\t\t\tMr. Hermann Struck, for whose work as an artist I\n\t\t\t\t\thave the greatest admiration. He will explain the\n\t\t\t\t\treason he wishes to see you, and anything that you\n\t\t\t\t\tmay be able to do for him I shall greatly appreciate.\n\t\t\t\t\tBelieve me\n\t\t\t\t\tYours sincerely,\n\t\t\t\t\tJoseph Duveen\n\t\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\tHy. Miller, Esq.\n\t\t\t\t\tMessrs. Arthur Ackerman & Son,\n\t\t\t\t\t50 East 57th Street,\n\t\t\t\t\tNew York City.\n\t\t\t\t\tJD:HF\n\t\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t52 William Street \n\t\t\t\t\tNew York\n\t\t\t\t\tMarch 8. 1928\n\t\t\t\t\tHermann Struck, Esq.,\n\t\t\t\t\tHotel Majestic,\n\t\t\t\t\t72nd Street and Central Park,\n\t\t\t\t\tNew York, N.Y.\n\t\t\t\t\tDear Mr. Struck:\n\

## Preprocessing

### Clear tabs and newlines

In [71]:
# Create a View on the Standoff object and shrink its whitespace; `view` is
# kept because it is used later to map character offsets in `plain` back to
# table positions (`view.get_table_pos`).
so_view = View(so)
view = so_view.shrink_whitespace()

# Plain text as seen through the view; show the first 1000 characters.
plain = view.get_plain()
plain[:1000]

create view: 100%|██████████| 1035/1035 [00:00<00:00, 4437.43it/s]
shrink whitespace: 100%|██████████| 7893/7893 [00:00<00:00, 11596.15it/s]


' Duveen Brothers 720 Fith Avenue New York January 5th, 1928 My dear Kr. Miller: The bearer of this letter is my friend, Mr. Hermann Struck, for whose work as an artist I have the greatest admiration. He will explain the reason he wishes to see you, and anything that you may be able to do for him I shall greatly appreciate. Believe me Yours sincerely, Joseph Duveen Hy. Miller, Esq. Messrs. Arthur Ackerman & Son, 50 East 57th Street, New York City. JD:HF 52 William Street New York March 8. 1928 Hermann Struck, Esq., Hotel Majestic, 72nd Street and Central Park, New York, N.Y. Dear Mr. Struck: Thank you very much for sending me the etchings which I think are as pleasing as the subject permitted you to make. There were not nine, but eight and I have marked two of them as first and second choice. Altogether, I think I would like to have about twenty copies for the time being. Should there be an unexpectedly large demand I will ask for some more prints. Meanwhile I thank you very much for t

## NER

### Load the English transformer model

In [72]:
# Look the pipeline name up in `ner_dict` (defined next to the imports) instead
# of repeating the literal, so the language -> model mapping has one source.
nlp = spacy.load(ner_dict['en'])

### Process text for NER

In [73]:
# Run the loaded spaCy pipeline over the whitespace-shrunk text; the
# recognised entities are read from `doc.ents` in the cells below.
doc = nlp(plain)

In [74]:
# Tabulate the recognised entities: one row per entity, surface text as index.
# The index is not unique (the same text can be recognised more than once).
ents = list(doc.ents)
doc_results = dict(
    entity_name=[e.text for e in ents],
    entity_label=[e.label_ for e in ents],
)
ner_df = pd.DataFrame(doc_results)
ner_df = ner_df.set_index('entity_name')

ner_df

Unnamed: 0_level_0,entity_label
entity_name,Unnamed: 1_level_1
Duveen Brothers,ORG
New York,GPE
"January 5th, 1928",DATE
Kr.,PERSON
Miller,PERSON
...,...
Roger van der Weyden,PERSON
Mellon,PERSON
New York,GPE
Mount Carmel,LOC


## Annotation

### NER Inline annotation
#### ISSUE with `add_inline()`:
Error: `ValueError: no unique context found`\
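This error occurs when we try to add inline tags. The library does not report the underlying reason; a likely cause is an entity whose span crosses an XML element boundary (for example, one that runs over two `<l>` lines), so that a new tag cannot be wrapped around it without breaking the existing markup.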

The current workaround is to surround `add_inline` with a `try/except` block.

In [75]:
# Wrap every recognised entity in an inline element named after its NER label.
# NOTE: this mutates `so` in place, so re-running the cell without rebuilding
# `so` attempts to add the same tags a second time.

# Entities that could not be tagged: (index, entity text, label, error message).
skipped_ents = []

for i, ent in enumerate(doc.ents):
  start_ind = view.get_table_pos(ent.start_char)
  # `ent.end_char` is exclusive: it indexes the character AFTER the entity,
  # which does not exist when the entity ends the text. Map the entity's last
  # character instead and add one to obtain the exclusive end position.
  end_ind = view.get_table_pos(ent.end_char - 1) + 1
  label = ent.label_

  print(f'{i} {start_ind=}\t{end_ind=}\t{label=}')

  try:
    so.add_inline(
        begin=start_ind,
        end=end_ind,
        tag=label,
        depth=None,
        attrib={}
    )
  except ValueError as err:
    # Known failure: "no unique context found". Only this documented error is
    # tolerated; anything else propagates so real bugs are not hidden.
    print(err)
    skipped_ents.append((i, ent.text, label, str(err)))

0 start_ind=22	end_ind=37	label='ORG'
1 start_ind=64	end_ind=72	label='GPE'
2 start_ind=78	end_ind=95	label='DATE'
3 start_ind=109	end_ind=112	label='PERSON'
4 start_ind=113	end_ind=119	label='PERSON'
5 start_ind=175	end_ind=189	label='PERSON'
6 start_ind=434	end_ind=481	label='PERSON'
no unique context found
7 start_ind=501	end_ind=516	label='PERSON'
8 start_ind=529	end_ind=531	label='CARDINAL'
9 start_ind=555	end_ind=568	label='GPE'
10 start_ind=608	end_ind=610	label='CARDINAL'
11 start_ind=632	end_ind=640	label='GPE'
12 start_ind=646	end_ind=653	label='DATE'
13 start_ind=655	end_ind=659	label='DATE'
14 start_ind=665	end_ind=685	label='ORG'
15 start_ind=692	end_ind=706	label='ORG'
16 start_ind=713	end_ind=724	label='FAC'
17 start_ind=729	end_ind=741	label='LOC'
18 start_ind=748	end_ind=756	label='GPE'
19 start_ind=758	end_ind=762	label='GPE'
20 start_ind=777	end_ind=783	label='PERSON'
21 start_ind=931	end_ind=935	label='CARDINAL'
22 start_ind=946	end_ind=951	label='CARDINAL'
23 start

#### Text element output

In [76]:
# Serialise the Standoff object's text element and show the first 1000
# characters to check that the inline entity tags were inserted.
etree.tostring(so.text_el).decode("utf-8")[:1000]

'<text xmlns="http://www.tei-c.org/ns/1.0">\n\t\t<body>\n\t\t\t<pb facs="#facs_4" xml:id="IL-MTFN-001-G-F-0353-18_0001_0004.jpg" n="4"/>\n\t\t\t<p facs="#facs_4_tr_1">\n\t\t\t\t<lg>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_1"><ORG>Duveen Brothers</ORG></l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_2">720 Fith Avenue</l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_3"><GPE>New York</GPE></l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_4"><DATE>January 5th, 1928</DATE></l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_5">My dear <PERSON>Kr.</PERSON> <PERSON>Miller</PERSON>:</l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_6">The bearer of this letter is my friend,</l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_7">Mr. <PERSON>Hermann Struck</PERSON>, for whose work as an artist I</l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_8">have the greatest admiration. He will explain the</l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_9">reason he wishes to see you, and anything that you</l>\n\t\t\t\t\t<l facs="#facs_4_tr_1_tl_10">may be able to do for him I shall g

## Export

In [59]:
# Write the annotated tree to `so_ner_result`. Encoding and XML declaration
# are set explicitly so the file is written as UTF-8 with a declaration
# instead of relying on lxml's defaults.
etree.ElementTree(so.tree.getroot()).write(
    so_ner_result,
    encoding='utf-8',
    xml_declaration=True,
)