Developed by Patricia Klinger; modified by Sebastian Gampe, Kerim Gencer, and Chrisowalandis Deligio.

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer
from cnt.preprocess import Preprocess
import spacy
from cnt.io import  Database_Connection
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /Users/chris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Define the column names for the id and design column 

In [2]:
# Column names used throughout the notebook: the design identifier and the
# English design text.
id_col, design_col = "id", "design_en"
# Switch for the optional stem/lemma annotation pass further below.
use_lemma_stem = False

In [3]:
# Connection URL format: mysql+mysqlconnector://user:password@IP/Database
# Set the CNT_DB_URL environment variable to supply the real URL so that
# credentials are never saved inside the notebook; the placeholder below is
# only used when the variable is not set.
dc = Database_Connection(os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection"))

In [5]:
# Load the id and design-text columns from "nlp_training_designs".
designs = dc.load_designs_from_db("nlp_training_designs", [id_col, design_col])

In [6]:
# Peek at the first five loaded designs.
designs.head(5)

Unnamed: 0,id,design_en
0,1,"Diademed head of deified Alexander the Great with horn of Ammon, right. Border of dots."
1,6,"Altar, lighted and garlanded."
2,8,Prize amphora on ornamental stand; within linear square and incuse square.
3,9,Amphora with ribbed surface and crooked handles containing two ears of corn and poppy.
4,10,"Bust of youthful Anchialos, right, wearing taenia. Border of dots."


## This step is optional - load additional data to save with the model

## Load and annotate designs

In [7]:
# Language suffix of the entity-list columns, and the two suffixed column
# names (primary name, alternative names) read from the entity tables.
language = "_en"
add_columns = [f"{base}{language}" for base in ("name", "alternativenames")]

In [8]:
# Entity lists per label. PERSON is read from the un-suffixed columns, the
# other three labels from the language-suffixed columns in add_columns.
# The third argument names the alternative-names column; "," is presumably the
# separator used to split it -- TODO confirm against cnt.io.
entities = {
    "PERSON": dc.load_entities_from_db(
        "nlp_list_person", ["name", "alternativenames"], ["alternativenames"], ",", True
    ),
    **{
        label: dc.load_entities_from_db(table, add_columns, [add_columns[1]], ",", True)
        for label, table in (
            ("OBJECT", "nlp_list_obj"),
            ("ANIMAL", "nlp_list_animal"),
            ("PLANT", "nlp_list_plant"),
        )
    },
}

In [9]:
# Annotate every design with the entity lists, then keep only the designs
# for which at least one annotation was found.
annotated_designs = annotate_designs(entities, designs, id_col, design_col)
annotated_designs = annotated_designs.loc[
    lambda frame: frame["annotations"].map(len) > 0
]

In [10]:
if use_lemma_stem:
    # Constructor parameters (left at their defaults here):
    # method="lemma_stem", language="en", backbone="spacy_snowball"
    annotater = Stem_Lemma_Annotatizer()
    annotated_designs = annotater.annotate(
        annotated_designs,
        entities,
        id_col,
        design_col,
    )

In [11]:
# Peek at the first five annotated designs.
annotated_designs.head(5)

Unnamed: 0,design_en,id,annotations
0,"Diademed head of deified Alexander the Great with horn of Ammon, right. Border of dots.",1,"[(9, 13, OBJECT), (25, 44, PERSON)]"
1,"Altar, lighted and garlanded.",6,"[(0, 5, OBJECT)]"
2,Prize amphora on ornamental stand; within linear square and incuse square.,8,"[(6, 13, OBJECT)]"
3,Amphora with ribbed surface and crooked handles containing two ears of corn and poppy.,9,"[(0, 7, OBJECT), (63, 75, PLANT), (80, 85, PLANT)]"
4,"Bust of youthful Anchialos, right, wearing taenia. Border of dots.",10,"[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OBJECT)]"


In [12]:
# (rows, columns) of the annotated designs frame.
annotated_designs.shape

(21883, 3)

## Add preprocessing rules

In [13]:
# Preprocess instance on which the replacement rules below are registered.
preprocess = Preprocess()

In [14]:
# Example rules (not used in the pipeline): each pair maps a multi-word
# phrase to the replacement substituted for it during preprocessing.
for phrase, replacement in (
    ("horn of Ammon", "Ammonhorn"),
    ("Alexander the Great", "Alexander"),
):
    preprocess.add_rule(phrase, replacement)

In [15]:
# Example design containing both phrases covered by the rules above.
design = (
    "Diademed head of deified Alexander the Great "
    "with horn of Ammon, right."
)

In [16]:
# Apply the registered rules to the example design. The second argument (1) is
# presumably the design id used to map results back later -- TODO confirm in
# cnt.preprocess.
pre_design = preprocess.preprocess_design(design,1)
pre_design

'Diademed head of deified Alexander with Ammonhorn, right.'

In [17]:
# Hand-written (start, end, label) spans relative to the preprocessed text:
# "Alexander" at 25-34 and "Ammonhorn" at 40-49.
annot = [
    (25, 34, "PERSON"),
    (40, 49, "OBJECT"),
]

In [18]:
# Restore the original wording of the preprocessed design (same second
# argument, 1, as in the preprocess_design call).
preprocess.map_back_design(pre_design, 1)

'Diademed head of deified Alexander the Great with horn of Ammon, right.'

In [19]:
# Translate the span offsets from the preprocessed text back to offsets in
# the original design text.
preprocess.map_result_ner(pre_design, annot, 1)

[(25, 44, 'PERSON'), (50, 63, 'OBJECT')]

## Train NER

In [20]:
from sklearn.model_selection import train_test_split

# 75/25 split with a fixed random_state so the evaluation is repeatable.
# Both halves keep the id column so predictions can be matched to gold rows.
X_train, X_test, y_train, y_test = train_test_split(
    annotated_designs[[id_col, design_col]],
    annotated_designs[[id_col, "annotations"]],
    test_size=0.25,
    random_state=12,
)
# Name the gold column "y", the same column name the prediction frame uses.
y_test = y_test.rename(columns={"annotations": "y"})

In [21]:
# Renumber the test rows 0..n-1 so features and gold labels share one index.
X_test.index = list(range(len(X_test)))
y_test.index = list(range(len(y_test)))

#### Output directory for the trained model

In [22]:
# Name of the trained model and the directory it is saved to / loaded from.
model_name = "english_cno"
output_dir = "../cnt/trained_model/ner/english/"

In [23]:
# The leading 4 is a positional DesignEstimator setting (presumably the number
# of training iterations -- TODO confirm in cnt.model).
my_estimator = DesignEstimator(
    4,
    output_dir,
    model_name,
    id_col,
    design_col,
)
my_estimator.set_labels("PERSON", "OBJECT", "ANIMAL", "PLANT")
# Train on the annotated training designs; the cell output shows that the
# model is saved to the output directory as part of this call.
my_estimator.fit(X_train, y_train["annotations"], "cnt")

../cnt/trained_model/ner/english
Saved model to ../cnt/trained_model/ner/english


## Load and evaluate model

In [24]:
# Load the model stored under output_dir / model_name.
model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

In [25]:
# Predict entities for the held-out designs; with as_doc=False the "y" column
# holds (start, end, label) tuples (see the output of the next cell).
x_predict = model.predict(X_test,as_doc=False)

In [26]:
# Show the predictions (one row per test design).
x_predict

Unnamed: 0,id,y
0,12350,"[(0, 4, OBJECT), (25, 32, OBJECT), (52, 56, OBJECT), (77, 84, OBJECT)]"
1,8434,"[(0, 6, OBJECT), (16, 21, ANIMAL)]"
2,2273,"[(0, 4, ANIMAL), (27, 32, OBJECT), (33, 37, OBJECT)]"
3,7548,"[(0, 4, ANIMAL), (24, 28, OBJECT), (56, 61, ANIMAL)]"
4,1988,"[(9, 13, OBJECT), (17, 23, PERSON)]"
...,...,...
5466,4153,"[(5, 13, PERSON), (31, 35, OBJECT), (50, 60, OBJECT), (65, 70, OBJECT), (89, 95, PLANT), (136, 143, OBJECT)]"
5467,4964,"[(12, 16, OBJECT), (36, 51, PERSON)]"
5468,2021,"[(0, 4, ANIMAL)]"
5469,24538,"[(7, 18, OBJECT)]"


In [27]:
# Evaluation helper from cnt.evaluate.
metrics = Metrics()

In [28]:
# Per-label table of gold entities, hits and false positives, comparing the
# predictions with the gold annotations in y_test.
scores_frame = metrics.create_score_frame(y_test, x_predict, my_estimator.get_labels())
scores_frame

Unnamed: 0,Unnamed: 1,Total(TP+FN),Hits(TP),Wrongs(FP),%
0,PERSON,5919,5891,67,99.5
1,OBJECT,12710,12681,52,99.8
2,ANIMAL,1281,1267,8,98.9
3,PLANT,980,971,11,99.1


In [29]:
# Overall precision and recall of the predictions against y_test
# (fractions; they are scaled to percent when printed).
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [30]:
# F1 is the harmonic mean of precision and recall. When both are 0 the
# denominator vanishes, so F1 is defined as 0.0 instead of dividing by zero.
F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

In [31]:
# Report the three overall scores as percentages with two decimals.
for metric_name, value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(metric_name, round(value * 100, 2))

Precision 99.34
Recall 99.62
F1 99.48


## Visualize prediction

In [32]:
# Predict on all loaded designs (not only the test split). as_doc=True
# presumably returns spaCy Doc objects for displaCy -- TODO confirm in cnt.model.
x_predict_as_doc = model.predict(designs, as_doc=True)

In [33]:
from spacy import displacy
# Highlight colour per entity label (CSS colour names). 'PLANT' was misspelt
# as 'salmom', which is not a valid colour name.
colors = {'PERSON': 'mediumpurple', 'OBJECT': 'greenyellow', 'ANIMAL': 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}
# displaCy reads the list of labels to highlight from the option key 'ents';
# the previous key 'ent' is not a recognised option, so the filter was ignored.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}
# Render the first ten predicted documents inline with entity highlighting.
displacy.render(
    x_predict_as_doc.y[:10],
    style='ent',
    jupyter=True,
    options=options,
)

## Upload data to MySQL

In [34]:
# Set to True to run the upload cell below, which replaces the
# "cnt_pipeline_ner" table with fresh predictions.
upload = False

In [35]:
if upload:
    # The connection URL is taken from CNT_UPLOAD_DB_URL when that variable is
    # set, so real credentials need not be stored in the notebook; the
    # placeholder is only a fallback.
    dc = Database_Connection(os.environ.get("CNT_UPLOAD_DB_URL", "mysql+mysqlconnector://YourConnection"))
    cnt_designs = dc.load_designs_from_db("designs", [id_col, design_col])
    cnt_pred = my_estimator.predict_clear(cnt_designs)
    cnt_pred_predictions_only = cnt_pred["y"]  # not used in this cell; kept for inspection

    # Flatten to one row per predicted entity. The unpacking relies on
    # cnt_pred having exactly two columns (design id, list of predictions);
    # each prediction is presumably an (entity, label) pair -- TODO confirm.
    cnt_ner_output = pd.DataFrame(
        [(str(designid), *relation)
         for _, (designid, relation_list) in cnt_pred.iterrows()
         for relation in relation_list],
        columns=["DesignID", "Entity", "Label_Entity"])

    # if_exists="replace" drops and recreates the target table.
    cnt_ner_output.to_sql("cnt_pipeline_ner", dc.mysql_connection,
                          if_exists="replace", index=False)