developed by Patricia Klinger, modified by Sebastian Gampe, Kerim Gencer, Chrisowalandis Deligio

In [1]:
import os
import sys
sys.path.append('../')
import pandas as pd
import random
import numpy as np
import spacy
import swifter
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
#from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer
from cnt.preprocess import Preprocess
from cnt.io import  Database_Connection



import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas/spaCy warnings that may point at real problems in the cells below.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### Define the column names for the id and design column 

In [2]:
# Column names passed to the load/annotate/train calls further down.
id_col, design_col = "id", "design_en"
# Switch for the optional stem/lemma annotation pass.
use_lemma_stem = False

In [3]:
# Language suffix appended to the name columns of the entity table.
language = "_en"
add_columns = ["id", f"name{language}", f"alternativenames{language}"]

In [4]:
# Read the connection URL from the CNT_DB_URL environment variable so real
# credentials never have to be written into (and committed with) the notebook.
# The original placeholder stays as the fallback.
# Format: mysql+mysqlconnector://user:password@IP/Database
dc = Database_Connection(
    os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection")
)

In [6]:
# Load the table "nlp_training_designs", passing the id and design column names.
designs = dc.load_designs_from_db("nlp_training_designs", [id_col, design_col])

In [7]:
# Preview the first five loaded designs.
designs.head(5)

Unnamed: 0,id,design_en
0,1,Diademed head of deified Alexander the Great w...
1,6,"Altar, lighted and garlanded."
2,8,Prize amphora on ornamental stand; within line...
3,9,Amphora with ribbed surface and crooked handle...
4,10,"Bust of youthful Anchialos, right, wearing tae..."


In [8]:
# (rows, columns) of the loaded designs.
designs.shape

(22112, 2)

## Load and annotate designs

In [9]:
# One entity lookup per NER label; every label is loaded from the same table
# with identical arguments, only the label itself differs.
entity_table = "thrakien_d4n4.nlp_list_entities"
entities = {
    label: dc.load_entities_from_db_v2(
        entity_table, label, add_columns, [add_columns[1]], ",", True
    )
    for label in ("PERSON", "OBJECT", "ANIMAL", "PLANT")
}

In [10]:
# Annotate every design with the entity lookups.
annotated_designs = annotate_designs(entities, designs, id_col, design_col)
# Keep only designs with at least one annotation. The explicit copy makes the
# result an independent frame, so the column assignments in later cells do not
# operate on a slice of the unfiltered frame (SettingWithCopy).
annotated_designs = annotated_designs[
    annotated_designs.annotations.map(len) > 0].copy()

In [11]:
if use_lemma_stem:
    # Imported here because the import at the top of the notebook is commented
    # out; without it, enabling use_lemma_stem raises NameError.
    from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer

    annotater = Stem_Lemma_Annotatizer() # parameter: method="lemma_stem", language="en", backbone="spacy_snowball"
    annotated_designs = annotater.annotate(annotated_designs, entities, id_col, design_col)

In [12]:
# Preview the first five annotated designs.
annotated_designs.head(5)

Unnamed: 0,design_en,id,annotations
0,Diademed head of deified Alexander the Great w...,1,"[(9, 13, OBJECT), (25, 44, PERSON), (50, 54, O..."
1,"Altar, lighted and garlanded.",6,"[(0, 5, OBJECT)]"
2,Prize amphora on ornamental stand; within line...,8,"[(6, 13, OBJECT)]"
3,Amphora with ribbed surface and crooked handle...,9,"[(0, 7, OBJECT), (40, 47, OBJECT), (63, 67, PL..."
4,"Bust of youthful Anchialos, right, wearing tae...",10,"[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OB..."


In [13]:
# (rows, columns) after dropping designs without annotations.
annotated_designs.shape

(21887, 3)

## Preprocessing

In [14]:
# Create the column for the preprocessed design text; it is filled in below.
annotated_designs["design_en_changed"] = ""

In [15]:
# Load the entity list (id, name and alternative-name columns) used to build
# the preprocessing rules.
df_entities = dc.load_from_db("nlp_list_entities", add_columns)

In [16]:
# Add rules for preprocessing
preprocess = Preprocess()
preprocess.add_rule("horseman", "horse man")
preprocess.add_rule("horsemen", "horse men")

# Map every alternative name of an entity onto its standard name.
for _, row in df_entities.iterrows():
    alt_names_raw = row["alternativenames_en"]
    # Missing values can arrive as None or as NaN; only real strings can be split.
    if not isinstance(alt_names_raw, str):
        continue
    standard_name = row["name_en"]
    # Split on "," (the separator also passed to load_entities_from_db_v2) and
    # strip, so both "a, b" and "a,b" yield clean names.
    for alt_name in alt_names_raw.split(","):
        alt_name = alt_name.strip()
        # An empty entry would otherwise register a rule for the empty string.
        if alt_name:
            preprocess.add_rule(alt_name, standard_name)

#### Roman numerals still seem to cause minor problems, so they are handled manually here

In [17]:
# Remove every rule whose key contains a dotted Roman numeral (I. to V.).
# The keys are copied into a list first because entries are deleted while looping.
roman_markers = (" I.", " II.", " III.", " IV.", " V.")
for rule_key in list(preprocess.rules):
    if any(marker in rule_key for marker in roman_markers):
        del preprocess.rules[rule_key]

In [18]:
# Drop the dot after the Roman numerals I-V (" II." -> " II").
# The result is written to annotated_designs, the frame the preprocessing rules
# are applied to next. The previous loop wrote to `designs`, which no later
# cell reads, and restarted every replacement from the unmodified text, so only
# the last matching numeral was actually changed.
# NOTE(review): the annotations computed before this step still carry offsets
# for the dotted text - confirm that nothing downstream relies on them.
annotated_designs["design_en"] = annotated_designs["design_en"].str.replace(
    r" (I|II|III|IV|V)\.", r" \1", regex=True
)

In [19]:
# Apply defined rules
def _apply_rules(row):
    """Return the first element of preprocess_design's result for this row."""
    return preprocess.preprocess_design(row.design_en, row.id)[0]

annotated_designs["design_en_changed"] = annotated_designs.swifter.apply(_apply_rules, axis=1)

Pandas Apply: 100%|███████████████████████| 21887/21887 [07:10<00:00, 50.79it/s]


In [20]:
# Remove question marks and round brackets from the preprocessed text.
_strip_table = str.maketrans("", "", "?()")
annotated_designs["design_en_changed"] = annotated_designs.swifter.apply(
    lambda row: row["design_en_changed"].translate(_strip_table), axis=1
)

Pandas Apply: 100%|███████████████████| 21887/21887 [00:00<00:00, 202286.68it/s]


In [21]:
# Keep the untouched text/annotations under *_orig and promote the preprocessed
# text to "design_en".
column_renames = {
    "design_en": "design_en_orig",
    "design_en_changed": "design_en",
    "annotations": "annotations_orig",
}
annotated_designs = annotated_designs.rename(columns=column_renames)

In [22]:
# Show the first five rows through the Styler (the rendered output below shows
# the full, untruncated cell text).
annotated_designs.head(5).style

Unnamed: 0,design_en_orig,id,annotations_orig,design_en
0,"Diademed head of deified Alexander the Great with horn of Ammon, right. Border of dots.",1,"[(9, 13, 'OBJECT'), (25, 44, 'PERSON'), (50, 54, 'OBJECT')]","Diadem head of deified Alexander the Great with horn of Ammon, right. Border of dots."
1,"Altar, lighted and garlanded.",6,"[(0, 5, 'OBJECT')]","Altar, lighted and garland."
2,Prize amphora on ornamental stand; within linear square and incuse square.,8,"[(6, 13, 'OBJECT')]",Prize amphora on ornamental stand; within linear square and incuse square.
3,Amphora with ribbed surface and crooked handles containing two ears of corn and poppy.,9,"[(0, 7, 'OBJECT'), (40, 47, 'OBJECT'), (63, 67, 'PLANT'), (71, 75, 'PLANT'), (80, 85, 'PLANT')]",Amphora with ribbed surface and crooked handleholding two corn and poppy.
4,"Bust of youthful Anchialos, right, wearing taenia. Border of dots.",10,"[(0, 4, 'OBJECT'), (17, 26, 'PERSON'), (43, 49, 'OBJECT')]","Bust of youthful Anchialos, right, wearing taenia. Border of dots."


In [23]:
# Annotate the preprocessed text, keep designs that still have at least one
# annotation, and attach the new annotations by design id.
reannotation_input = annotated_designs[["id", "design_en"]]
train_designs = annotate_designs(entities, reannotation_input, id_col, design_col)
has_annotations = train_designs.annotations.map(len) > 0
train_designs = train_designs[has_annotations]
annotated_designs = annotated_designs.merge(train_designs[["id", "annotations"]], on="id")

In [24]:
# Preview original vs. preprocessed text together with both annotation columns.
annotated_designs.head(5)

Unnamed: 0,design_en_orig,id,annotations_orig,design_en,annotations
0,Diademed head of deified Alexander the Great w...,1,"[(9, 13, OBJECT), (25, 44, PERSON), (50, 54, O...",Diadem head of deified Alexander the Great wit...,"[(0, 6, OBJECT), (7, 11, OBJECT), (23, 42, PER..."
1,"Altar, lighted and garlanded.",6,"[(0, 5, OBJECT)]","Altar, lighted and garland.","[(0, 5, OBJECT), (19, 26, OBJECT)]"
2,Prize amphora on ornamental stand; within line...,8,"[(6, 13, OBJECT)]",Prize amphora on ornamental stand; within line...,"[(6, 13, OBJECT)]"
3,Amphora with ribbed surface and crooked handle...,9,"[(0, 7, OBJECT), (40, 47, OBJECT), (63, 67, PL...",Amphora with ribbed surface and crooked handle...,"[(0, 7, OBJECT), (58, 62, PLANT), (67, 72, PLA..."
4,"Bust of youthful Anchialos, right, wearing tae...",10,"[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OB...","Bust of youthful Anchialos, right, wearing tae...","[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OB..."


## Train NER

In [25]:
from sklearn.model_selection import train_test_split

# 75/25 train/test split with a fixed seed.
design_features = annotated_designs[[id_col, design_col]]
design_targets = annotated_designs[[id_col, "annotations"]]
X_train, X_test, y_train, y_test = train_test_split(
    design_features, design_targets, test_size=0.25, random_state=12
)
# The test targets are exposed under the column name "y".
y_test = y_test.rename(columns={"annotations": "y"})

In [26]:
# Renumber both test frames 0..n-1.
X_test.index = list(range(len(X_test)))
y_test.index = list(range(len(y_test)))

#### output directory for the trained model

In [27]:
# Directory (relative to the notebook) and name passed to DesignEstimator and
# load_ner_model_v2 below.
output_dir =  "../cnt/trained_model/ner/english/"
model_name = "english_cno"

In [28]:
# Configure the estimator with the four entity labels and fit it on the
# training split.
# NOTE(review): the meaning of the leading 4 is not visible in this notebook -
# confirm against DesignEstimator.
ner_labels = ("PERSON", "OBJECT", "ANIMAL", "PLANT")
my_estimator = DesignEstimator(4, output_dir, model_name, id_col, design_col)
my_estimator.set_labels(*ner_labels)
my_estimator.fit(X_train, y_train.annotations, "cnt")

../cnt/trained_model/ner/english
Saved model to ../cnt/trained_model/ner/english


## Load and evaluate model

In [29]:
# Load the model from output_dir/model_name for evaluation.
model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

In [30]:
# Predict entities for the test designs; the displayed result has the columns
# "id" and "y".
x_predict = model.predict(X_test,as_doc=False)

In [31]:
# Display the predictions (pandas truncates the output).
x_predict

Unnamed: 0,id,y
0,16132,"[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB..."
1,2256,"[(0, 5, ANIMAL)]"
2,22295,"[(30, 36, OBJECT), (64, 68, PLANT)]"
3,498,"[(0, 6, OBJECT), (24, 29, OBJECT), (60, 68, OB..."
4,23685,"[(0, 8, PERSON), (10, 16, OBJECT), (36, 40, OB..."
...,...,...
5466,19811,"[(0, 3, PERSON), (27, 35, OBJECT), (45, 49, OB..."
5467,5247,"[(0, 9, OBJECT), (21, 25, OBJECT)]"
5468,16861,"[(0, 4, OBJECT), (8, 16, PERSON), (18, 21, PLA..."
5469,21935,"[(0, 3, PERSON), (5, 12, OBJECT), (31, 36, OBJ..."


In [32]:
# Helper object used for the score table and precision/recall below.
metrics = Metrics()

In [33]:
# Per-label score table comparing gold annotations (y_test) with predictions.
scores_frame = metrics.create_score_frame(y_test, x_predict, my_estimator.get_labels())
scores_frame

Unnamed: 0,Unnamed: 1,Total(TP+FN),Hits(TP),Wrongs(FP),%
0,PERSON,5995,5973,36,99.6
1,OBJECT,22332,22319,66,99.9
2,ANIMAL,1256,1246,8,99.2
3,PLANT,953,945,9,99.2


In [34]:
# Overall precision and recall; the print cell below multiplies both by 100.
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [35]:
# F1 is the harmonic mean of precision and recall. When both are zero the
# quotient is undefined, so report 0.0 instead of raising ZeroDivisionError.
pr_sum = precision + recall
F1 = (2*precision*recall) / pr_sum if pr_sum else 0.0

In [36]:
# Report each score as a percentage rounded to two decimals.
for score_name, score_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(score_name, round(score_value*100, 2))

Precision 99.61
Recall 99.83
F1 99.72


## Entity scores

In [37]:
# Gold annotations of the first five test designs.
y_test.head(5)

Unnamed: 0,id,y
0,16132,"[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB..."
1,2256,"[(0, 5, ANIMAL)]"
2,22295,"[(30, 36, OBJECT), (64, 68, PLANT)]"
3,498,"[(0, 6, OBJECT), (24, 29, OBJECT), (60, 68, OB..."
4,23685,"[(0, 8, PERSON), (10, 16, OBJECT), (36, 40, OB..."


In [38]:
# Predictions for the same first five test designs, for a side-by-side check.
x_predict.head(5)

Unnamed: 0,id,y
0,16132,"[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB..."
1,2256,"[(0, 5, ANIMAL)]"
2,22295,"[(30, 36, OBJECT), (64, 68, PLANT)]"
3,498,"[(0, 6, OBJECT), (24, 29, OBJECT), (60, 68, OB..."
4,23685,"[(0, 8, PERSON), (10, 16, OBJECT), (36, 40, OB..."


In [39]:
# Put gold annotations and predictions next to the design text (aligned on the index).
X_test = X_test.assign(annotation=y_test["y"], prediction=x_predict["y"])
X_test.head(2)

Unnamed: 0,id,design_en,annotation,prediction
0,16132,"Fortuna, standing left, holding patera in righ...","[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB...","[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB..."
1,2256,"Horse prancing right; below, monogram.","[(0, 5, ANIMAL)]","[(0, 5, ANIMAL)]"


In [40]:
def get_text(design, ent_list):
    """Return the substrings of ``design`` addressed by ``ent_list``.

    Each entry of ``ent_list`` is indexable; its first two items are used as
    the start and end offsets of a slice into ``design``. Any further items
    (e.g. the label) are ignored.
    """
    return [design[span[0]:span[1]] for span in ent_list]

In [41]:
# Turn the span lists into the matching substrings of each design.
for span_col, text_col in (("annotation", "annotation_str"), ("prediction", "prediction_str")):
    X_test[text_col] = [
        get_text(design_text, spans)
        for design_text, spans in zip(X_test["design_en"], X_test[span_col])
    ]

In [42]:
# Check the extracted annotation/prediction strings on two rows.
X_test.head(2)

Unnamed: 0,id,design_en,annotation,prediction,annotation_str,prediction_str
0,16132,"Fortuna, standing left, holding patera in righ...","[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB...","[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB...","[Fortuna, patera, hand, cornucopia]","[Fortuna, patera, hand, cornucopia]"
1,2256,"Horse prancing right; below, monogram.","[(0, 5, ANIMAL)]","[(0, 5, ANIMAL)]",[Horse],[Horse]


In [60]:
# Attach the training annotations to the training designs (aligned on the index).
X_train["annotation"] = y_train["annotations"]

In [61]:
# Substrings of each training design addressed by its annotation spans.
X_train["annotation_str"] = [
    get_text(design_text, spans)
    for design_text, spans in zip(X_train["design_en"], X_train["annotation"])
]

In [81]:
# One zeroed three-slot counter list per entity string seen in the test or
# training annotations.
labels = {
    entity_text: [0, 0, 0]
    for frame in (X_test, X_train)
    for entity_texts in frame["annotation_str"]
    for entity_text in entity_texts
}

In [82]:
# Slot 0 counts test annotations of an entity string; slot 1 counts those whose
# string also occurs among the predicted strings of the same design.
for annotated_texts, predicted_texts in zip(X_test["annotation_str"], X_test["prediction_str"]):
    for entity_text in annotated_texts:
        counters = labels[entity_text]
        counters[0] += 1
        counters[1] += entity_text in predicted_texts

In [83]:
# Slot 2 counts how often each entity string is annotated in the training split.
for train_texts in X_train["annotation_str"]:
    for entity_text in train_texts:
        labels[entity_text][2] += 1


In [84]:
# One row per entity string; the three counter slots become named columns.
score_columns = {0: "Annotation", 1: "Prediction", 2: "Total_in_train"}
label_scores = pd.DataFrame.from_dict(labels, orient="index").rename(columns=score_columns)

In [85]:
# Share of test annotations that were also predicted; entities without any
# test annotation (0/0) come out as NaN.
label_scores["Accuracy"] = label_scores["Prediction"] / label_scores["Annotation"]

In [86]:
# Scores of every entity whose name contains "Alexander".
is_alexander = label_scores.index.str.contains("Alexander")
label_scores.loc[is_alexander]

Unnamed: 0,Annotation,Prediction,Total_in_train,Accuracy
Severus Alexander,43,43,115,1.0
Alexander the Great,2,2,3,1.0
Domitius Alexander,0,0,4,


In [87]:
# Ten entities with the lowest accuracy.
label_scores.sort_values("Accuracy").head(10)

Unnamed: 0,Annotation,Prediction,Total_in_train,Accuracy
laurel tree,2,0,1,0.0
Bona Mens,1,0,0,0.0
Minotaur,1,0,0,0.0
Tempus,1,0,0,0.0
Augean Stables,1,0,0,0.0
Euphratides,2,0,0,0.0
Stoa,1,0,0,0.0
visor,1,0,0,0.0
scale,1,0,0,0.0
Rhoimetalces III,1,0,0,0.0


## Map result back

In [50]:
# Copy the gold annotations into column "y", which the map-back step reads as row.y.
X_test["y"] = y_test["y"]

In [51]:
# Display the test frame (pandas truncates the output).
X_test

Unnamed: 0,id,design_en,annotation,prediction,annotation_str,prediction_str,y
0,16132,"Fortuna, standing left, holding patera in righ...","[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB...","[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB...","[Fortuna, patera, hand, cornucopia]","[Fortuna, patera, hand, cornucopia]","[(0, 7, PERSON), (32, 38, OBJECT), (48, 52, OB..."
1,2256,"Horse prancing right; below, monogram.","[(0, 5, ANIMAL)]","[(0, 5, ANIMAL)]",[Horse],[Horse],"[(0, 5, ANIMAL)]"
2,22295,"Tellus standing left, holding plough and hoe o...","[(30, 36, OBJECT), (64, 68, PLANT)]","[(30, 36, OBJECT), (64, 68, PLANT)]","[plough, corn]","[plough, corn]","[(30, 36, OBJECT), (64, 68, PLANT)]"
3,498,"Galley sailing left, on board, three rowers in...","[(0, 6, OBJECT), (24, 29, OBJECT), (60, 68, OB...","[(0, 6, OBJECT), (24, 29, OBJECT), (60, 68, OB...","[Galley, board, standard, prow, standard]","[Galley, board, standard, prow, standard]","[(0, 6, OBJECT), (24, 29, OBJECT), (60, 68, OB..."
4,23685,"Victoria, draped, standing right on prow, hold...","[(0, 8, PERSON), (10, 16, OBJECT), (36, 40, OB...","[(0, 8, PERSON), (10, 16, OBJECT), (36, 40, OB...","[Victoria, draped, prow, wreath, hand, palm]","[Victoria, draped, prow, wreath, hand, palm]","[(0, 8, PERSON), (10, 16, OBJECT), (36, 40, OB..."
...,...,...,...,...,...,...,...
5466,19811,"Pax standing left, holding caduceus in right h...","[(0, 3, PERSON), (27, 35, OBJECT), (45, 49, OB...","[(0, 3, PERSON), (27, 35, OBJECT), (45, 49, OB...","[Pax, caduceus, hand, corn, ear, poppy]","[Pax, caduceus, hand, corn, ear, poppy]","[(0, 3, PERSON), (27, 35, OBJECT), (45, 49, OB..."
5467,5247,Capricorn left. With head turned back.,"[(0, 9, OBJECT), (21, 25, OBJECT)]","[(0, 9, OBJECT), (21, 25, OBJECT)]","[Capricorn, head]","[Capricorn, head]","[(0, 9, OBJECT), (21, 25, OBJECT)]"
5468,16861,"Head of Augustus, oak-wreath, right","[(0, 4, OBJECT), (8, 16, PERSON), (18, 21, PLA...","[(0, 4, OBJECT), (8, 16, PERSON), (18, 21, PLA...","[Head, Augustus, oak, wreath]","[Head, Augustus, oak, wreath]","[(0, 4, OBJECT), (8, 16, PERSON), (18, 21, PLA..."
5469,21935,"Sol, radiate, naked except for cloak over left...","[(0, 3, PERSON), (5, 12, OBJECT), (31, 36, OBJ...","[(0, 3, PERSON), (5, 12, OBJECT), (31, 36, OBJ...","[Sol, radiate, cloak, shoulder, hand, whip, ha...","[Sol, radiate, cloak, shoulder, hand, whip, ha...","[(0, 3, PERSON), (5, 12, OBJECT), (31, 36, OBJ..."


In [52]:
def _restore_design(row):
    """Map the design text back if rules were applied to this id, else return it unchanged."""
    if row.id in preprocess.rules_applied:
        return preprocess.map_back_design(row.design_en, row.id)
    return row.design_en

X_test["design_en_orig"] = X_test.swifter.apply(_restore_design, axis=1)

Pandas Apply: 100%|██████████████████████| 5471/5471 [00:00<00:00, 51503.30it/s]


In [53]:
def _restore_annotations(row):
    """Map the annotations back if rules were applied to this id, else return them unchanged."""
    if row.id in preprocess.rules_applied:
        return preprocess.map_result_ner(row.design_en, row.y, row.id)
    return row.y

X_test["y_orig"] = X_test.swifter.apply(_restore_annotations, axis=1)

Pandas Apply: 100%|███████████████████████| 5471/5471 [00:01<00:00, 5402.77it/s]


In [54]:
# Test designs whose preprocessed text contains "Alexander III" (the recorded
# output is empty).
X_test.loc[X_test.design_en.str.contains("Alexander III")]

Unnamed: 0,id,design_en,annotation,prediction,annotation_str,prediction_str,y,design_en_orig,y_orig


In [55]:
# Spot check of the map-back step on designs containing "Veil": compare
# design_en / y with design_en_orig / y_orig.
X_test.loc[X_test.design_en.str.contains("Veil")].head(5)

Unnamed: 0,id,design_en,annotation,prediction,annotation_str,prediction_str,y,design_en_orig,y_orig
29,355,"Veil Demeter standing facing, head left, weari...","[(0, 4, OBJECT), (5, 12, PERSON), (30, 34, OBJ...","[(0, 4, OBJECT), (5, 12, PERSON), (30, 34, OBJ...","[Veil, Demeter, head, corn wreath, garment, co...","[Veil, Demeter, head, corn wreath, garment, co...","[(0, 4, OBJECT), (5, 12, PERSON), (30, 34, OBJ...","Veiled Demeter standing facing, head left, wea...","[(0, 6, OBJECT), (7, 14, PERSON), (32, 36, OBJ..."
205,7893,"Veil Hera standing facing, head left, wearing ...","[(0, 4, OBJECT), (5, 9, PERSON), (27, 31, OBJE...","[(0, 4, OBJECT), (5, 9, PERSON), (27, 31, OBJE...","[Veil, Hera, head, garment, patera, scepter, g...","[Veil, Hera, head, garment, patera, scepter, g...","[(0, 4, OBJECT), (5, 9, PERSON), (27, 31, OBJE...","Veiled Hera standing facing, head left, wearin...","[(0, 6, OBJECT), (7, 11, PERSON), (29, 33, OBJ..."
651,492,"Veil woman goddess standing facing, head left,...","[(0, 4, OBJECT), (5, 10, PERSON), (36, 40, OBJ...","[(0, 4, OBJECT), (5, 10, PERSON), (36, 40, OBJ...","[Veil, woman, head, object, hand, scepter, hand]","[Veil, woman, head, object, hand, scepter, hand]","[(0, 4, OBJECT), (5, 10, PERSON), (36, 40, OBJ...","Veiled female goddess standing facing, head le...","[(0, 6, OBJECT), (7, 13, PERSON), (39, 43, OBJ..."
819,7056,"Veil Hera standing left, wearing long garment,...","[(0, 4, OBJECT), (5, 9, PERSON), (38, 45, OBJE...","[(0, 4, OBJECT), (5, 9, PERSON), (38, 45, OBJE...","[Veil, Hera, garment, patera, scepter]","[Veil, Hera, garment, patera, scepter]","[(0, 4, OBJECT), (5, 9, PERSON), (38, 45, OBJE...","Veiled Hera standing left, wearing long garmen...","[(0, 6, OBJECT), (7, 11, PERSON), (40, 47, OBJ..."
829,7897,"Veil Demeter throne left, holding corn and lon...","[(0, 4, OBJECT), (5, 12, PERSON), (13, 19, OBJ...","[(0, 4, OBJECT), (5, 12, PERSON), (13, 19, OBJ...","[Veil, Demeter, throne, corn, torch, basket, s...","[Veil, Demeter, throne, corn, torch, basket, s...","[(0, 4, OBJECT), (5, 12, PERSON), (13, 19, OBJ...","Veiled Demeter enthroned left, holding ears of...","[(0, 6, OBJECT), (7, 14, PERSON), (15, 24, OBJ..."


# Visualize prediction

In [56]:
# Predict again with as_doc=True; the "y" column of the result is passed to
# displacy.render below.
x_predict_as_doc = model.predict(X_test, as_doc=True)

In [None]:
from spacy import displacy
# Highlight colour per entity label. 'salmon' was misspelled as 'salmom', which
# is not a valid CSS colour name.
colors = {'PERSON': 'mediumpurple','OBJECT': 'greenyellow', 'ANIMAL' : 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}
# displaCy's entity visualizer reads the label filter from the key 'ents';
# the former key 'ent' was silently ignored.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}
displacy.render(x_predict_as_doc.y, 
                style='ent', jupyter=True, options=options)

## Upload data to mysql

In [None]:
# Safety switch: set to True to run the upload cell below, which writes the
# table "cnt_pipeline_ner" with if_exists="replace".
upload = False

In [None]:
if upload:
    dc =  Database_Connection("mysql+mysqlconnector://YourConnection")
    cnt_designs = dc.load_designs_from_db("designs", [id_col, design_col])
    cnt_pred = my_estimator.predict_clear(cnt_designs )

    # Flatten to one output row per predicted relation: the design id as a
    # string followed by the items of the relation.
    # NOTE(review): the unpacking assumes cnt_pred has exactly two columns
    # (id, list of relations) and that every relation has two items - confirm
    # against predict_clear.
    ner_rows = [
        (str(designid), *relation)
        for _, (designid, relation_list) in cnt_pred.iterrows()
        for relation in relation_list
    ]
    cnt_ner_output = pd.DataFrame(ner_rows,
            columns=["DesignID", "Entity", "Label_Entity"])

    # Replaces any existing "cnt_pipeline_ner" table.
    cnt_ner_output.to_sql("cnt_pipeline_ner", dc.mysql_connection, 
                           if_exists="replace", index=False)