Developed by Patricia Klinger, modified by Sebastian Gampe.

In [1]:
import pandas as pd
import random
import os
from cnt.model import DesignEstimator
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation)
from cnt.io import (load_entities_from_file, load_entities_from_db,
                    load_ocre_designs, load_ocre_designs_obv, load_designs)
from cnt.train_test import train_test_annotate
from cnt.extract_relation import path
from cnt.evaluate import score_precision_recall, score_accuracy
from spacy import displacy

In [2]:
# Load the OCRE designs: load_ocre_designs gives all designs (obv + rev),
# load_ocre_designs_obv gives obverses only (swap the two lines below to switch).
# You need to fill in your own database credentials in the "io.py" file in the "cnt" folder.
#designs = load_ocre_designs_obv()
designs = load_ocre_designs()

# Only the first n_designs entries are used for annotation and training.
n_designs = 5000
english_designs = designs[:n_designs]
english_designs.head()

Unnamed: 0,DesignID,DesignEng
0,http://numismatics.org/ocre/id/ric.3.ant.868#o...,"Head of Antoninus Pius, laureate, right"
1,http://numismatics.org/ocre/id/ric.3.ant.1206A...,"Head of Antoninus Pius, laureate, right"
2,http://numismatics.org/ocre/id/ric.3.ant.1206B...,"Head of Antoninus Pius, laureate, right"
3,http://numismatics.org/ocre/id/ric.3.ant.1208A...,"Head of Antoninus Pius, laureate, right"
4,http://numismatics.org/ocre/id/ric.3.ant.1208B...,"Head of Antoninus Pius, laureate, right"


In [3]:
# Build the entities dictionary: key = label, value = list of entities.
# You need to fill in your own database credentials below.
mysql_connection = "mysql://user:password@localhost:3306/database_name"

# label -> first argument handed to load_entities_from_db
# (presumably the name of the table holding that entity list).
entity_sources = {
    "PERSON": "nlp_list_person",
    "OBJECT": "nlp_list_obj",
    "ANIMAL": "nlp_list_animal",
    "PLANT": "nlp_list_plant",
}
entities = {
    label: load_entities_from_db(source, mysql_connection)
    for label, source in entity_sources.items()
}

# Peek at the first few plant entities as a sanity check.
entities["PLANT"][:5]

  cursor.execute('SELECT @@tx_isolation')


['apple', 'barley', 'berry', 'branch', 'cloverleaf']

In [4]:
# Annotate every design with the known entities, then keep only those designs
# for which at least one annotation was produced.
all_annotated = annotate_designs(entities, english_designs)
has_annotation = all_annotated.annotations.map(len) > 0
annotated_designs = all_annotated[has_annotation]
annotated_designs.head()

Unnamed: 0,DesignEng,DesignID,annotations
0,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.868#o...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
1,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.1206A...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
2,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.1206B...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
3,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.1208A...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
4,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.1208B...,"[(0, 4, OBJECT), (8, 22, PERSON)]"


In [5]:
# sklearn train test split
# X holds the design texts, y the matching annotations; both keep DesignID.
# 25% of the annotated designs are held out for testing. A fixed random_state
# makes the split (and every score computed from it) reproducible across
# kernel restarts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    annotated_designs[["DesignID", "DesignEng"]],
    annotated_designs[["DesignID", "annotations"]],
    test_size=0.25,
    random_state=42,
)

In [6]:
# Train the estimator on the training split and measure accuracy on both splits.
# n_rep is handed to DesignEstimator and "ocre" to fit(); their exact meaning is
# defined in cnt.model (TODO confirm).
n_rep = 3
my_estimator = DesignEstimator(n_rep)
my_estimator.fit(X_train, y_train.annotations, "ocre")

# The scoring helpers are given the ground truth with the column named "y".
y_train_true = y_train.rename(columns={"annotations": "y"})
y_test_true = y_test.rename(columns={"annotations": "y"})

train_score = score_accuracy(y_train_true, my_estimator.predict(X_train))
test_score = score_accuracy(y_test_true, my_estimator.predict(X_test))

# Collected metrics for this run.
res = dict(n_rep=n_rep, train_score=train_score, test_score=test_score)

In [7]:
# Show the metrics collected so far for this run.
res

{'n_rep': 3,
 'test_score': 0.9599679743795037,
 'train_score': 0.9741057127602776}

In [8]:
# Precision and recall of the trained estimator on the held-out split;
# the values are also recorded in the res dictionary.
y_test_true = y_test.rename(columns={"annotations": "y"})
y_test_pred = my_estimator.predict(X_test)
precision, recall = score_precision_recall(y_test_true, y_test_pred)

res["precision"] = precision
res["recall"] = recall
res["split"] = "random"

In [9]:
# Precision on the test split, as returned by score_precision_recall.
precision

0.9925925925925926

In [10]:
# Recall on the test split, as returned by score_precision_recall.
recall

0.9813581890812251

In [11]:
# Keep the predictions for the held-out designs so they can be inspected.
y_pred = my_estimator.predict(X_test)

In [12]:
# Preview the first predictions.
y_pred.head()

Unnamed: 0,DesignID,y
1328,http://numismatics.org/ocre/id/ric.2.hdn.947c#...,"[(0, 4, OBJECT), (8, 15, PERSON)]"
3075,http://numismatics.org/ocre/id/ric.3.ant.1195_...,"[(10, 16, OBJECT), (23, 30, OBJECT)]"
3210,http://numismatics.org/ocre/id/ric.3.ant.479D#...,"[(0, 4, OBJECT), (8, 23, PERSON)]"
3311,http://numismatics.org/ocre/id/ric.3.ant.635b#...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
464,http://numismatics.org/ocre/id/ric.3.ant.680#o...,"[(0, 4, OBJECT), (8, 22, PERSON)]"


In [11]:
# Load designs again from the OCRE database and apply the trained model to them.
# For all OCRE designs (obv + rev) use load_ocre_designs, for obverses only use load_ocre_designs_obv
#ocre_designs = load_ocre_designs()
ocre_designs = load_ocre_designs_obv()

# predict_clear is used here instead of predict; the next cells read its "y"
# column and unpack its rows as (design id, list of relations).
ocre_pred = my_estimator.predict_clear(ocre_designs)


  cursor.execute('SELECT @@tx_isolation')


In [12]:
# Preview the designs that were loaded for prediction.
ocre_designs.head()

Unnamed: 0,DesignID,DesignEng
0,http://numismatics.org/ocre/id/ric.2_1(2).dom....,"Bust of Domitian, laureate, right with aegis"
1,http://numismatics.org/ocre/id/ric.4.gor_iii.1...,"Bust of Gordian III, laureate, draped, cuiras..."
2,http://numismatics.org/ocre/id/ric.4.ss.309_au...,"Head of Septimius Severus, laureate, right"
3,http://numismatics.org/ocre/id/ric.10.zeno(2)_...,"Bust of Zeno, helmeted, pearl-diademed withou..."
4,http://numismatics.org/ocre/id/ric.6.alex.106#...,"Head of Constantine I, laureate, right; ties ..."


In [23]:
# Put each design's id and text next to the entities predicted for it.
ocre_pred_predictions_only = ocre_pred["y"]
ocre_prediction_output = pd.DataFrame(
    dict(
        DesignID=ocre_designs["DesignID"],
        X_test=ocre_designs["DesignEng"],
        y_predict=ocre_pred_predictions_only,
    )
)
ocre_prediction_output.head()

Unnamed: 0,DesignID,X_test,y_predict
0,http://numismatics.org/ocre/id/ric.2_1(2).dom...,"Bust of Domitian, laureate, right with aegis","[(Bust, OBJECT), (Domitian, PERSON), (aegis, O..."
1,http://numismatics.org/ocre/id/ric.4.gor_iii....,"Bust of Gordian III, laureate, draped, cuiras...","[(Bust, OBJECT), (Gordian III, PERSON)]"
2,http://numismatics.org/ocre/id/ric.4.ss.309_a...,"Head of Septimius Severus, laureate, right","[(Head, OBJECT), (Septimius Severus, PERSON)]"
3,http://numismatics.org/ocre/id/ric.10.zeno(2)...,"Bust of Zeno, helmeted, pearl-diademed withou...","[(Bust, OBJECT), (Zeno, PERSON), (spear, OBJEC..."
4,http://numismatics.org/ocre/id/ric.6.alex.106...,"Head of Constantine I, laureate, right; ties ...","[(Head, OBJECT), (truncation, OBJECT)]"


In [13]:
# Flatten the predictions into one row per (design, entity) pair.
# Each row of ocre_pred is unpacked positionally as (design id, list of
# relations); each relation must unpack into exactly two values, which become
# the Entity and Label_Entity columns.
ocre_ner_output = pd.DataFrame(
    [(str(designid), *relation)
     for designid, relation_list in ocre_pred.itertuples(index=False, name=None)
     for relation in relation_list],
    columns=["DesignID", "Entity", "Label_Entity"])

# Write the result table, replacing any previous version of it.
# The connection string configured earlier in the notebook (mysql_connection)
# is reused so that no real credentials are stored in the notebook.
# NOTE: a password was previously hardcoded here; it should be considered
# leaked and rotated.
ocre_ner_output.to_sql("ocre_pipeline_ner_obv",
                       mysql_connection,
                       if_exists="replace", index=False)

  cursor.execute('SELECT @@tx_isolation')
