developed by Patricia Klinger, modified by Sebastian Gampe, Kerim Gencer, Chrisowalandis Deligio

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
#from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer
from cnt.preprocess import Preprocess
import spacy
from cnt.io import  Database_Connection
import warnings
warnings.filterwarnings('ignore')

### Define the column names for the ID and design columns

In [2]:
# Names of the identifier column and of the (English) design-text column.
id_col, design_col = "id", "design_en"
# Switch for the optional stem/lemma annotation step further down.
use_lemma_stem = False

In [3]:
# Take the connection URL from the environment so real credentials are never stored in
# the notebook; the literal below is only a placeholder used when CNT_DB_URL is unset.
# Format: mysql+mysqlconnector://user:password@IP/Database
dc = Database_Connection(os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection"))

In [5]:
# Columns fetched from the training table: id, design text and the free-text comment.
design_columns = [id_col, design_col, "comment"]
designs = dc.load_designs_from_db("nlp_training_designs", design_columns)

In [6]:
# Display the loaded training designs (the notebook truncates the rendering of long frames).
designs

Unnamed: 0,id,design_en,comment
0,1,Diademed head of deified Alexander the Great w...,from CN
1,6,"Altar, lighted and garlanded.",from CN
2,8,Prize amphora on ornamental stand; within line...,from CN
3,9,Amphora with ribbed surface and crooked handle...,from CN
4,10,"Bust of youthful Anchialos, right, wearing tae...",from CN
...,...,...,...
22559,27814,Seleucus I wearing crown juwel.,auto_generated
22560,27815,Athena seated on dolphin.,auto_generated
22561,27816,Rhoemetalces I holding paludamentum.,auto_generated
22562,27817,Equitas seated on bull.,auto_generated


In [7]:
# Preview the first five designs.
designs.head(5)

Unnamed: 0,id,design_en,comment
0,1,Diademed head of deified Alexander the Great w...,from CN
1,6,"Altar, lighted and garlanded.",from CN
2,8,Prize amphora on ornamental stand; within line...,from CN
3,9,Amphora with ribbed surface and crooked handle...,from CN
4,10,"Bust of youthful Anchialos, right, wearing tae...",from CN


## This step is optional - load additional data to save with the model

## Load and annotate designs

In [8]:
# Language suffix appended to the entity-table column names.
language = "_en"
# Localised name column first, alternative-names column second.
add_columns = [base + language for base in ("name", "alternativenames")]

In [9]:
# Entity label -> (database table, [name column, alternative-names column]).
# The person table uses unsuffixed column names; the other tables use the
# language-suffixed ones from add_columns.
person_columns = ["name", "alternativenames"]
entity_sources = {
    "PERSON": ("nlp_list_person", person_columns),
    "OBJECT": ("nlp_list_obj", add_columns),
    "ANIMAL": ("nlp_list_animal", add_columns),
    "PLANT": ("nlp_list_plant", add_columns),
}
# For every label, load the entity list; the second column (alternative names)
# is passed together with "," as separator.
entities = {
    label: dc.load_entities_from_db(table, columns, [columns[1]], ",", True)
    for label, (table, columns) in entity_sources.items()
}

In [10]:
# Annotate every design with the known entities, then keep only the designs
# that received at least one annotation.
annotated_designs = annotate_designs(entities, designs, id_col, design_col)
has_annotations = annotated_designs.annotations.map(len) > 0
annotated_designs = annotated_designs[has_annotations]

In [11]:
if use_lemma_stem:
    # Imported lazily: the top-level import of this optional module is commented out,
    # so without this line enabling use_lemma_stem would fail with a NameError.
    from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer

    annotater = Stem_Lemma_Annotatizer() # parameter: method="lemma_stem", language="en", backbone="spacy_snowball"
    # Re-annotate the already annotated designs with the stem/lemma annotator.
    annotated_designs = annotater.annotate(annotated_designs, entities, id_col, design_col)

In [12]:
# Preview the annotations: each entry is a list of (start, end, label) character spans.
annotated_designs.head(5)

Unnamed: 0,design_en,id,annotations
0,Diademed head of deified Alexander the Great w...,1,"[(0, 8, OBJECT), (9, 13, OBJECT), (25, 44, PER..."
1,"Altar, lighted and garlanded.",6,"[(0, 5, OBJECT), (19, 28, OBJECT)]"
2,Prize amphora on ornamental stand; within line...,8,"[(6, 13, OBJECT)]"
3,Amphora with ribbed surface and crooked handle...,9,"[(0, 7, OBJECT), (40, 47, OBJECT), (63, 75, PL..."
4,"Bust of youthful Anchialos, right, wearing tae...",10,"[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OB..."


In [13]:
# Number of designs that kept at least one annotation (rows, columns).
annotated_designs.shape

(22396, 3)

## Train NER

In [14]:
from sklearn.model_selection import train_test_split

# Inputs (id + design text) and targets (id + annotations) are split together so
# that rows stay paired; 25% is held out and the seed is fixed for reproducibility.
design_frame = annotated_designs[[id_col, design_col]]
annotation_frame = annotated_designs[[id_col, "annotations"]]
X_train, X_test, y_train, y_test = train_test_split(
    design_frame, annotation_frame, test_size=0.25, random_state=12
)
# The gold column of the test targets is renamed to "y".
y_test = y_test.rename(columns={"annotations": "y"})

In [15]:
# Re-number both test frames 0..n-1 so that their rows share the same index labels.
X_test.index = list(range(len(X_test)))
y_test.index = list(range(len(y_test)))

#### Output directory for the trained model

In [16]:
# Directory (relative to this notebook) and name under which the model is stored.
output_dir =  "../cnt/trained_model/ner/english/"
model_name = "english_cno"

In [17]:
# NOTE(review): the first argument (4) presumably corresponds to the four labels set
# below — confirm against DesignEstimator.
my_estimator = DesignEstimator(4, output_dir, model_name, id_col, design_col)
my_estimator.set_labels("PERSON", "OBJECT", "ANIMAL", "PLANT")
# Train on the training split; per the cell output the model is saved under output_dir.
my_estimator.fit(X_train, y_train.annotations, "cnt")

../cnt/trained_model/ner/english
Saved model to ../cnt/trained_model/ner/english


## Load and evaluate model

In [18]:
# Load the model using the same directory, name and column settings as for training.
model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

In [19]:
# Predict entity spans for the held-out designs; with as_doc=False the result is a
# frame with an id column and a "y" column of (start, end, label) tuples.
x_predict = model.predict(X_test,as_doc=False)

In [20]:
# Display the predictions for the test split.
x_predict

Unnamed: 0,id,y
0,17740,"[(10, 16, OBJECT), (18, 27, OBJECT), (53, 61, ..."
1,13637,"[(0, 9, PERSON), (14, 18, PERSON), (28, 43, OB..."
2,23165,"[(0, 6, OBJECT), (37, 43, OBJECT), (49, 54, OB..."
3,12962,"[(0, 4, OBJECT), (8, 14, PERSON), (16, 23, OBJ..."
4,15046,"[(0, 10, PERSON), (12, 18, OBJECT), (38, 46, O..."
...,...,...
5594,82,"[(5, 11, PERSON), (35, 41, OBJECT), (64, 68, O..."
5595,10838,"[(0, 4, OBJECT), (8, 22, PERSON), (37, 43, OBJ..."
5596,4378,"[(0, 7, PERSON), (9, 26, PERSON), (31, 46, OBJ..."
5597,14639,"[(0, 14, PERSON), (20, 28, PERSON), (30, 38, O..."


In [21]:
# Helper object providing the scoring functions used in the following cells.
metrics = Metrics()

In [22]:
# Per-label score table comparing the gold test annotations with the predictions.
scores_frame = metrics.create_score_frame(y_test, x_predict, my_estimator.get_labels())
scores_frame

Unnamed: 0,Unnamed: 1,Total(TP+FN),Hits(TP),Wrongs(FP),%
0,PERSON,6122,6097,39,99.6
1,OBJECT,22978,22949,62,99.9
2,ANIMAL,1268,1254,4,98.9
3,PLANT,939,933,9,99.4


In [23]:
# Overall precision and recall over the test split (scaled by 100 for printing below,
# so presumably returned as fractions — confirm in cnt.evaluate).
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [24]:
# F1 is the harmonic mean of precision and recall. When both are zero the formula
# would divide by zero, so F1 is defined as 0.0 in that case.
F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

In [25]:
# Report each score as a percentage rounded to two decimals.
for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(metric_name, round(metric_value * 100, 2))

Precision 99.64
Recall 99.76
F1 99.7


## Test set

In [None]:
# Preview the gold annotations of the test split.
y_test.head(5)

In [27]:
# Preview the predicted annotations for the same split.
x_predict.head(5)

Unnamed: 0,id,y
0,17740,"[(10, 16, OBJECT), (18, 27, OBJECT), (53, 61, ..."
1,13637,"[(0, 9, PERSON), (14, 18, PERSON), (28, 43, OB..."
2,23165,"[(0, 6, OBJECT), (37, 43, OBJECT), (49, 54, OB..."
3,12962,"[(0, 4, OBJECT), (8, 14, PERSON), (16, 23, OBJ..."
4,15046,"[(0, 10, PERSON), (12, 18, OBJECT), (38, 46, O..."


In [28]:
# Put gold annotations and predictions next to the design text. pandas aligns the
# assigned Series on the index, so this relies on X_test, y_test and x_predict
# sharing the same index labels.
X_test["annotation"] = y_test["y"]
X_test["prediction"] = x_predict["y"]
X_test.head(2)

Unnamed: 0,id,design_en,annotation,prediction
0,17740,"Johannes, draped, cuirassed, standing right, h...","[(10, 16, OBJECT), (18, 27, OBJECT), (53, 61, ...","[(10, 16, OBJECT), (18, 27, OBJECT), (53, 61, ..."
1,13637,"Caracalla and Geta, both in military attire, s...","[(0, 9, PERSON), (14, 18, PERSON), (28, 43, OB...","[(0, 9, PERSON), (14, 18, PERSON), (28, 43, OB..."


In [29]:
def get_text(design, ent_list):
    """Return the substrings of ``design`` covered by the given entity spans.

    Each element of ``ent_list`` is indexed as ``(start, end, ...)``; only the
    first two items are used, as slice bounds into ``design``. The result keeps
    the order of ``ent_list``.
    """
    return [design[span[0]:span[1]] for span in ent_list]

In [30]:
# Resolve every gold and predicted span to its surface text. The design text is read
# through design_col (instead of a hard-coded column name) so the cell keeps working
# when a different design column is configured at the top of the notebook.
X_test["annotation_str"] = X_test.apply(lambda row: get_text(row[design_col], row["annotation"]), axis=1)
X_test["prediction_str"] = X_test.apply(lambda row: get_text(row[design_col], row["prediction"]), axis=1)

In [31]:
# Check that the extracted strings line up with the spans.
X_test.head(2)

Unnamed: 0,id,design_en,annotation,prediction,annotation_str,prediction_str
0,17740,"Johannes, draped, cuirassed, standing right, h...","[(10, 16, OBJECT), (18, 27, OBJECT), (53, 61, ...","[(10, 16, OBJECT), (18, 27, OBJECT), (53, 61, ...","[draped, cuirassed, standard, hand, Victory, g...","[draped, cuirassed, standard, hand, Victory, g..."
1,13637,"Caracalla and Geta, both in military attire, s...","[(0, 9, PERSON), (14, 18, PERSON), (28, 43, OB...","[(0, 9, PERSON), (14, 18, PERSON), (28, 43, OB...","[Caracalla, Geta, military attire, hands, spea...","[Caracalla, Geta, military attire, hands, spea..."


In [32]:
# Attach the gold annotations to the training designs (aligned on the index).
X_train["annotation"] = y_train["annotations"]

In [33]:
# Surface text of every annotated span in the training split. The design text is read
# through design_col rather than a hard-coded column name so a different configured
# design column keeps working.
X_train["annotation_str"] = X_train.apply(lambda row: get_text(row[design_col], row["annotation"]), axis=1)

In [34]:
# One counter triple per distinct entity string seen in either split:
# [annotated in test, also predicted in test, annotated in train].
labels = {}
for split in (X_test, X_train):
    for entity_strings in split["annotation_str"]:
        for entity in entity_strings:
            labels[entity] = [0, 0, 0]

In [35]:
# For every gold entity string in the test split: count the occurrence, and count a
# hit when the same string is among that design's predicted strings.
for gold_strings, predicted_strings in zip(X_test["annotation_str"], X_test["prediction_str"]):
    for entity in gold_strings:
        counters = labels[entity]
        counters[0] += 1
        if entity in predicted_strings:
            counters[1] += 1

In [36]:
# Count how often each entity string was annotated in the training split.
for gold_strings in X_train["annotation_str"]:
    for entity in gold_strings:
        labels[entity][2] += 1


In [37]:
# One row per entity string; the three counters become named columns.
counter_columns = {0: "Annotation", 1: "Prediction", 2: "Total_in_train"}
label_scores = pd.DataFrame.from_dict(labels, orient="index").rename(columns=counter_columns)

In [38]:
# Share of a string's test annotations that were also predicted. Strings that occur
# only in the training split have 0 test annotations and therefore get NaN.
label_scores["Accuracy"] = label_scores["Prediction"] / label_scores["Annotation"]

In [39]:
# Spot check: all entity strings whose text contains "Alexander".
label_scores.loc[label_scores.index.str.contains("Alexander")]

Unnamed: 0,Annotation,Prediction,Total_in_train,Accuracy
Severus Alexander,31,31,130,1.0
Domitius Alexander,1,1,3,1.0
Alexander the Great,1,1,4,1.0


In [40]:
# Ascending sort puts the lowest accuracies first; show up to 200 of them.
label_scores.sort_values("Accuracy").head(200).style

Unnamed: 0,Annotation,Prediction,Total_in_train,Accuracy
Faustina II,1,0,0,0.0
Seleucus I Nikator,1,0,0,0.0
Consul,1,0,0,0.0
City wall,1,0,0,0.0
aquilae,1,0,0,0.0
barley,1,0,2,0.0
donkey,1,0,0,0.0
rudders,1,0,0,0.0
goats,1,0,0,0.0
octopus,1,0,0,0.0


# Visualize prediction

In [42]:
# Predict on the first 20 designs with as_doc=True; the "y" column of the result is
# what gets rendered with displacy below (presumably spaCy Doc objects — confirm).
x_predict_as_doc = model.predict(designs[:20], as_doc=True)

In [43]:
from spacy import displacy

# Highlight colour per entity label ('salmon' was previously misspelled, leaving
# PLANT without a valid CSS colour).
colors = {'PERSON': 'mediumpurple','OBJECT': 'greenyellow', 'ANIMAL' : 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}
# displaCy's entity visualizer reads the label filter from the 'ents' option;
# the former 'ent' key is not a recognised option and was silently ignored.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}
# Render the first ten predicted documents inline in the notebook.
displacy.render(x_predict_as_doc.y[:10],
                style='ent', jupyter=True, options=options)

## Upload data to MySQL

In [None]:
# Safety switch: set to True to run the database upload in the next cell.
upload = False

In [None]:
# Guarded so that "Run All" never writes to the database unless `upload` is switched on.
if upload:
    # Placeholder connection string — replace it with the target database before uploading.
    dc = Database_Connection("mysql+mysqlconnector://YourConnection")
    cnt_designs = dc.load_designs_from_db("designs", [id_col, design_col])
    cnt_pred = my_estimator.predict_clear(cnt_designs)
    cnt_pred_predictions_only = cnt_pred["y"]  # kept for interactive inspection; not used by the upload itself

    # Flatten to one row per predicted entity. Each cnt_pred row must unpack into
    # (design id, list of predictions), and each prediction into the two values that
    # fill the "Entity" and "Label_Entity" columns.
    ner_rows = [
        (str(designid), *relation)
        for designid, relation_list in cnt_pred.itertuples(index=False, name=None)
        for relation in relation_list
    ]
    cnt_ner_output = pd.DataFrame(ner_rows, columns=["DesignID", "Entity", "Label_Entity"])

    # NOTE: if_exists="replace" drops and recreates the target table on every upload.
    cnt_ner_output.to_sql("cnt_pipeline_ner", dc.mysql_connection,
                          if_exists="replace", index=False)