developed by Patricia Klinger, modified by Sebastian Gampe, Kerim Gencer, Chrisowalandis Deligio

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
import spacy
from cnt.io import  Database_Connection
import warnings
warnings.filterwarnings('ignore')

### Define the column names for the id and design columns

In [2]:
# Names of the identifier column and the design-text column. They are used
# when loading the designs from the database and are passed on to the
# annotation, training and model-loading calls further down.
id_col = "id"
design_col = "design_de"

In [3]:
# Take the connection URL from the CNT_DB_URL environment variable so that real
# credentials are never stored in (or committed with) the notebook. The
# placeholder below is only a fallback and must be replaced to connect.
# Format: mysql+mysqlconnector://user:password@IP/Database
dc = Database_Connection(os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection"))

In [5]:
# Load the id and design-text columns of the "data_designs" table.
# NOTE(review): presumably returned as a pandas DataFrame - confirm in cnt.io.
designs = dc.load_designs_from_db("data_designs", [id_col, design_col])

## This step is optional - load additional data to save with the model

In [7]:
# One result per entity list table (person, object, animal, plant), each loaded
# with its name, alternative names and link columns.
entity_information = [
    dc.load_designs_from_db(table_name, ["name", "alternativenames", "link"])
    for table_name in (
        "nlp_list_person",
        "nlp_list_obj_ger",
        "nlp_list_animal_ger",
        "nlp_list_plant_ger",
    )
]

In [8]:
# Combine the entity tables into one name -> link lookup frame.
# DataFrame.append was removed in pandas 2.0 (and copies the whole frame on
# every call), so all pieces are concatenated once with pd.concat instead.
# The empty seed frame keeps "name" and "link" as the leading columns, and the
# per-table row indices are preserved exactly as append() did.
optional_info = pd.concat(
    [pd.DataFrame(columns=["name", "link"])]
    + [
        # Missing values are replaced by a single space before splitting.
        split_alternativenames(entity_frame.fillna(" "))
        for entity_frame in entity_information
    ]
)
optional_info

Unnamed: 0,name,link
0,Agrippina minor,http://nomisma.org/id/agrippina_ii
1,Agrippina maior,http://nomisma.org/id/agrippina_i
2,Alexander III,http://nomisma.org/id/alexander_iii
3,Anchialos,http://www.dbis.cs.uni-frankfurt.de/cnt/id/anc...
4,Andromeda,http://www.dbis.cs.uni-frankfurt.de/cnt/id/and...
...,...,...
85,Baumstümpfe,
86,Baumstumpfes,
87,Früchte,
88,Apfelbäume,


In [9]:
# Sanity check: fetch the link stored for one entity name. Series.item()
# raises unless exactly one row matches.
optional_info.loc[optional_info["name"] == "Andromeda", "link"].item()

'http://www.dbis.cs.uni-frankfurt.de/cnt/id/andromeda_person'

### Load and annotate designs

In [10]:
# Map every NER label to the entities loaded from its database table. All four
# tables are queried with the same column and delimiter arguments.
entities = {
    label: dc.load_entities_from_db(
        table_name, ["name", "alternativenames"], ["alternativenames"], ",", True
    )
    for label, table_name in (
        ("PERSON", "nlp_list_person"),
        ("OBJECT", "nlp_list_obj_ger"),
        ("ANIMAL", "nlp_list_animal_ger"),
        ("PLANT", "nlp_list_plant_ger"),
    )
}

In [11]:
# Annotate every design with the known entities, then drop the designs for
# which no entity was found (empty annotation list).
annotated_designs = annotate_designs(entities, designs, id_col, design_col)
annotated_designs = annotated_designs.loc[annotated_designs["annotations"].map(len) > 0]

In [12]:
# (rows, columns) of the annotated designs available for training/evaluation.
annotated_designs.shape

(6895, 3)

In [13]:
# Preview the first five annotated designs.
annotated_designs.head(5)

Unnamed: 0,design_de,id,annotations
0,Kopf des vergöttlichten Alexander des Großen n...,1,"[(0, 4, OBJECT), (24, 44, PERSON), (61, 67, OB..."
1,Flammender und bekränzter Altar.,6,"[(26, 31, OBJECT)]"
3,Amphora mit gerippter Bauchoberfläche und gebo...,9,"[(0, 7, OBJECT), (72, 77, PLANT), (82, 90, PLA..."
4,Brustbild des jugendlichen Anchialos nach rech...,10,"[(0, 9, OBJECT), (27, 36, PERSON), (53, 59, OB..."
5,Umgekehrter Anker; unter der linken Ankerschau...,11,"[(12, 17, OBJECT), (51, 56, ANIMAL)]"


## Train NER

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the annotated designs for evaluation; the fixed random_state
# keeps the split reproducible. Features carry id + design text, targets carry
# id + annotations.
X_train, X_test, y_train, y_test = train_test_split(
    annotated_designs[[id_col, design_col]],
    annotated_designs[[id_col, "annotations"]],
    test_size=0.25,
    random_state=12,
)

# The gold annotations of the test set are exposed under the column name "y".
# NOTE(review): presumably to match the prediction frame's column - confirm.
y_test = y_test.rename(columns={"annotations": "y"})

In [15]:
# Renumber both test frames 0..n-1 so that their rows share the same index.
X_test.index = list(range(len(X_test)))
y_test.index = list(range(len(y_test)))

#### Output directory for the trained model

In [16]:
# Directory and model name handed to DesignEstimator when training and to
# load_ner_model_v2 when the trained model is loaded again.
output_dir =  "../cnt/trained_model/ner/german/"
model_name = "german_cno"

In [17]:
# Configure the estimator on top of the small German spaCy pipeline and attach
# the optional name/link table so it is saved together with the model.
# NOTE(review): the meaning of the leading 4 is not visible here (possibly the
# number of training iterations) - confirm against DesignEstimator.
my_estimator = DesignEstimator(
    4,
    output_dir,
    model_name,
    id_col,
    design_col,
    spacy_model="de_core_news_sm",
    save_optional=True,
    optional_info=optional_info,
)
# Entity labels the model should learn.
my_estimator.set_labels("PERSON", "OBJECT", "ANIMAL", "PLANT")
# Train on the training designs and their annotations.
my_estimator.fit(X_train, y_train.annotations, "cnt")

../cnt/trained_model/ner/german
Saved model to ../cnt/trained_model/ner/german


## Load and evaluate model

In [18]:
# Load the model stored under output_dir/model_name, using the same id and
# design column names as during training.
model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

In [19]:
# Predict entities for the held-out test designs. With as_doc=False the result
# shown below is a frame with an id column and a y column of
# (start, end, label) tuples.
x_predict = model.predict(X_test,as_doc=False)

In [20]:
# Display the predictions (pandas truncates the rendered table).
x_predict

Unnamed: 0,id,y
0,6352,"[(0, 6, OBJECT)]"
1,2026,"[(0, 9, OBJECT), (14, 20, PERSON), (37, 43, OB..."
2,4514,"[(0, 10, ANIMAL)]"
3,5710,"[(0, 7, PERSON), (12, 20, OBJECT), (34, 40, OB..."
4,4981,"[(0, 7, PERSON), (23, 29, OBJECT), (53, 59, OB..."
...,...,...
1719,4512,"[(0, 6, PERSON), (23, 28, OBJECT), (33, 39, OB..."
1720,8257,"[(0, 5, OBJECT)]"
1721,1217,"[(0, 4, PERSON), (28, 34, OBJECT), (92, 97, AN..."
1722,5736,"[(0, 4, OBJECT), (18, 27, PERSON)]"


In [21]:
# Evaluation helper from cnt.evaluate, used below to score the predictions.
metrics = Metrics()

In [22]:
# Per-label score table comparing the gold test annotations (y_test) with the
# model predictions for each label known to the estimator.
scores_frame = metrics.create_score_frame(y_test, x_predict, my_estimator.get_labels())
scores_frame

Unnamed: 0,Unnamed: 1,Total(TP+FN),Hits(TP),Wrongs(FP),%
0,PERSON,1493,1466,27,98.2
1,OBJECT,3997,3979,29,99.5
2,ANIMAL,583,575,5,98.6
3,PLANT,253,248,9,98.0


In [23]:
# Overall precision and recall of the predictions against the gold annotations.
# They are multiplied by 100 when printed, i.e. treated as fractions.
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [24]:
# F1 is the harmonic mean of precision and recall. When both are zero the
# denominator vanishes, so F1 is defined as 0.0 instead of dividing by zero.
F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

In [25]:
# Report each score as a percentage rounded to two decimals.
for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(metric_name, round(metric_value*100,2))

Precision 98.9
Recall 99.08
F1 98.99


## Visualize prediction

In [None]:
# Predict on all loaded designs. The y column of the result is passed to
# displacy below.
# NOTE(review): as_doc=True presumably yields spaCy Doc objects - confirm.
x_predict_as_doc = model.predict(designs, as_doc=True)

In [None]:
from spacy import displacy

# Highlight colour per entity label (CSS colour names). The PLANT colour was
# misspelled as 'salmom', which is not a valid colour name.
colors = {'PERSON': 'mediumpurple','OBJECT': 'greenyellow', 'ANIMAL' : 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}
# displaCy's entity visualizer reads the label filter from the 'ents' option;
# the previous key 'ent' is not a displaCy option and was silently ignored.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}
# Render the predicted entities inline in the notebook.
displacy.render(x_predict_as_doc.y,
                style='ent', jupyter=True, options=options)