Developed by Patricia Klinger; modified by Sebastian Gampe, Kerim Gencer, and Chrisowalandis Deligio.

In [1]:
import sys
# Put the parent directory on the import path before the `cnt` imports below
# (presumably the project package lives one level above this notebook).
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
# Project modules: NER estimator and model (de)serialisation helpers.
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
# Project modules: annotation helpers used to build the training data.
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, annotate_designs_german, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
import spacy
from cnt.io import  Database_Connection
import warnings
# Silences every warning for the rest of the session. Note that this also hides
# deprecation notices from pandas/spaCy, so remove it when debugging.
warnings.filterwarnings('ignore')

In [2]:
# Connection URL format: mysql+mysqlconnector://user:password@IP/Database
# Supply the real URL through the CNT_DB_URL environment variable so that
# credentials are never saved inside the notebook; the literal below is only
# the placeholder used when the variable is not set.
db_url = os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection")
dc = Database_Connection(db_url)

In [3]:
# Read the DesignID and DesignEng columns of the "designs" table.
design_columns = ["DesignID", "DesignEng"]
designs = dc.load_designs_from_db("designs", design_columns)

## This step is optional - load additional data to save with the model

In [4]:
# One result per entity lookup table, in the order person, object, animal, plant.
# Each query reads the name, its alternative names and the associated link.
entity_tables = ["nlp_list_person", "nlp_list_obj", "nlp_list_animal", "nlp_list_plant"]
entity_information = [
    dc.load_designs_from_db(table, ["name", "alternativenames", "link"])
    for table in entity_tables
]

In [5]:
# Build the name/link table that is later stored together with the model.
# Each entity table has its missing values replaced by " " and is passed through
# split_alternativenames before being stacked.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing a
# frame inside a loop re-copies it on every iteration. Collect the pieces first
# and concatenate once. The empty seed frame keeps the "name", "link" column
# order and covers the case of an empty entity_information list.
entity_frames = [split_alternativenames(df.fillna(" ")) for df in entity_information]
optional_info = pd.concat(
    [pd.DataFrame(columns=["name", "link"]), *entity_frames]
)
optional_info

Unnamed: 0,name,link
0,Agrippina minor,http://nomisma.org/id/agrippina_ii
1,Agrippina maior,http://nomisma.org/id/agrippina_i
2,Alexander III,http://nomisma.org/id/alexander_iii
3,Anchialos,http://www.dbis.cs.uni-frankfurt.de/cnt/id/anc...
4,Andromeda,http://www.dbis.cs.uni-frankfurt.de/cnt/id/and...
...,...,...
55,starflowers,http://www.dbis.cs.uni-frankfurt.de/cnt/id/sta...
56,grain ears,
57,wheat ears,
58,ears of corn,


In [6]:
# Spot check: fetch the link stored for "Andromeda".
# .item() requires exactly one matching row and raises otherwise.
is_andromeda = optional_info["name"] == "Andromeda"
optional_info.loc[is_andromeda, "link"].item()

'http://www.dbis.cs.uni-frankfurt.de/cnt/id/andromeda_person'

### Load and annotate designs

In [7]:
# Entity vocabulary per NER label, each loaded from its own lookup table with
# identical query arguments.
label_tables = {
    "PERSON": "nlp_list_person",
    "OBJECT": "nlp_list_obj",
    "ANIMAL": "nlp_list_animal",
    "PLANT": "nlp_list_plant",
}
entities = {
    label: dc.load_entities_from_db(
        table, ["name", "alternativenames"], ["alternativenames"], ",", True
    )
    for label, table in label_tables.items()
}

In [8]:
# Annotate the "DesignEng" column of every design using the entity lists, then
# keep only the designs whose annotation list is non-empty.
all_annotated = annotate_designs(entities, designs, "DesignEng")
has_annotation = all_annotated["annotations"].map(len) > 0
annotated_designs = all_annotated[has_annotation]

In [9]:
# (rows, columns) of the frame after dropping designs without annotations.
annotated_designs.shape

(5473, 3)

In [10]:
# Preview the first five annotated designs.
annotated_designs.head(5)

Unnamed: 0,DesignEng,DesignID,annotations
0,Diademed head of deified Alexander the Great w...,1,"[(9, 13, OBJECT), (25, 44, PERSON)]"
1,"Altar, lighted and garlanded.",6,"[(0, 5, OBJECT)]"
2,Prize amphora on ornamental stand; within line...,8,"[(6, 13, OBJECT)]"
3,Amphora with ribbed surface and crooked handle...,9,"[(0, 7, OBJECT), (63, 75, PLANT), (80, 85, PLA..."
4,"Bust of youthful Anchialos, right, wearing tae...",10,"[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OB..."


## Train NER

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the annotated designs for evaluation; the fixed random_state
# keeps the split reproducible between runs.
feature_frame = annotated_designs[["DesignID", "DesignEng"]]
target_frame = annotated_designs[["DesignID", "annotations"]]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, target_frame, test_size=0.25, random_state=12
)

# Rename the gold-annotation column to "y" (presumably so it lines up with the
# column name used by the prediction frame when scoring).
y_test = y_test.rename(columns={"annotations": "y"})

In [12]:
# Renumber the held-out rows 0..n-1 in place of the row labels carried over
# from the split.
X_test.index = list(range(len(X_test)))
y_test.index = list(range(len(y_test)))

#### output directory for the trained model

In [13]:
# Directory passed to the estimator for saving and to load_ner_model_v2 for loading.
output_dir =  "../cnt/trained_model/ner/english/"
# Name under which the model is saved and later loaded.
model_name = "english_cno"

In [14]:
# Configure the estimator, register the four entity labels and train it on the
# training split. optional_info is handed over so it is saved with the model.
# NOTE(review): the meaning of the first positional argument (4) is not visible
# in this notebook — confirm against DesignEstimator.
ner_labels = ("PERSON", "OBJECT", "ANIMAL", "PLANT")
my_estimator = DesignEstimator(
    4,
    output_dir,
    model_name,
    "DesignEng",
    save_optional=True,
    optional_info=optional_info,
)
my_estimator.set_labels(*ner_labels)
my_estimator.fit(X_train, y_train.annotations, "cnt")

..\cnt\trained_model\ner\english
Saved model to ..\cnt\trained_model\ner\english


## Load and evaluate model

In [15]:
# Reload the model from output_dir under model_name instead of reusing the
# in-memory estimator.
model = load_ner_model_v2(output_dir, model_name, "DesignEng")

In [16]:
# Predict entities for the held-out designs. With as_doc=False the displayed
# result holds (start, end, label) tuples per DesignID in column "y".
x_predict = model.predict(X_test,as_doc=False)

In [17]:
# Show the predictions for the held-out designs.
x_predict

Unnamed: 0,DesignID,y
0,3429,"[(0, 6, PERSON), (35, 42, OBJECT), (47, 55, OB..."
1,5867,"[(8, 17, PERSON), (19, 25, PERSON), (51, 55, A..."
2,1841,"[(12, 24, OBJECT), (46, 50, PLANT), (71, 79, A..."
3,5309,"[(10, 15, PERSON), (16, 20, OBJECT), (35, 41, ..."
4,108,"[(20, 24, OBJECT), (28, 35, PERSON), (44, 47, ..."
...,...,...
1364,1836,"[(14, 22, PERSON), (47, 56, OBJECT), (142, 148..."
1365,4260,"[(0, 4, ANIMAL), (6, 12, PERSON), (56, 60, PLA..."
1366,4846,"[(5, 13, PERSON), (37, 42, OBJECT), (47, 52, O..."
1367,3695,"[(0, 9, OBJECT), (25, 31, OBJECT), (49, 55, PE..."


In [18]:
# Scoring helper from cnt.evaluate, used by the evaluation cells below.
metrics = Metrics()

In [19]:
# Per-label score table comparing the gold annotations with the predictions.
# NOTE: the label list comes from my_estimator (not the reloaded model), so the
# training cell must have been run in this session.
scores_frame = metrics.create_score_frame(y_test, x_predict, my_estimator.get_labels())
scores_frame

Unnamed: 0,Unnamed: 1,Total(TP+FN),Hits(TP),Wrongs(FP),%
0,PERSON,1354,1338,12,98.8
1,OBJECT,3107,3083,33,99.2
2,ANIMAL,421,418,12,99.3
3,PLANT,297,292,8,98.3


In [20]:
# Overall precision and recall of the predictions against the gold annotations
# (fractions; they are scaled by 100 only when printed).
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [21]:
# F1 is the harmonic mean of precision and recall. When both are zero the
# formula would divide by zero, so report 0.0 in that case instead of raising.
pr_sum = precision + recall
F1 = (2 * precision * recall) / pr_sum if pr_sum else 0.0

In [22]:
# Report each score as a percentage rounded to two decimals.
for score_name, score_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(score_name, round(score_value * 100, 2))

Precision 98.75
Recall 99.07
F1 98.91


## Visualize predictions

In [23]:
# Run the model over all designs (not only the test split). as_doc=True
# presumably yields spaCy Doc objects, since column "y" is passed to displaCy below.
x_predict_as_doc = model.predict(designs, as_doc=True)

In [24]:
from spacy import displacy

# Highlight colour per entity label; the values must be valid CSS colours
# ("salmon" for PLANT — the misspelt "salmom" is not a CSS colour name).
colors = {'PERSON': 'mediumpurple','OBJECT': 'greenyellow', 'ANIMAL' : 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}
# displaCy reads the list of entity types to show from the "ents" key; an
# "ent" key is not a recognised option and would be silently ignored.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}
# Render the predicted entities of every design inline in the notebook.
displacy.render(x_predict_as_doc.y,
                style='ent', jupyter=True, options=options)

## Upload data to MySQL

In [25]:
# Safety switch: set to True to let the next cell write the NER predictions to the database.
upload = False

In [26]:
# Runs only when the `upload` flag has been switched on.
if upload == True:
    dc = Database_Connection("mysql+mysqlconnector://YourConnection")
    cnt_designs = dc.load_designs_from_db("designs", ["DesignID", "DesignEng"])
    cnt_pred = my_estimator.predict_clear(cnt_designs)
    cnt_pred_predictions_only = cnt_pred["y"]

    # Flatten to one row per predicted entity: (DesignID, Entity, Label_Entity).
    # Unpacking each row into two values assumes cnt_pred has exactly two
    # columns (design id, list of predictions) — TODO confirm.
    ner_rows = []
    for _, (designid, relation_list) in cnt_pred.iterrows():
        for relation in relation_list:
            ner_rows.append((str(designid), *relation))
    cnt_ner_output = pd.DataFrame(ner_rows, columns=["DesignID", "Entity", "Label_Entity"])

    # if_exists="replace" drops and recreates the target table on every upload.
    cnt_ner_output.to_sql(
        "cnt_pipeline_ner", dc.mysql_connection, if_exists="replace", index=False
    )