In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
import swifter

from cnt.model import (DesignEstimator, RelationExtractor, save_pipeline, load_pipeline, predict_re_single_sentence, 
relations_from_adjectives_df, relations_from_adjectives_single, concat_relations)
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from cnt.preprocess import Preprocess


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Connect to the project database.
# The SQLAlchemy URL is taken from the CNT_DB_URL environment variable so that real
# credentials do not have to be written into the notebook; the placeholder below is
# only the fallback. Format: mysql+mysqlconnector://user:password@IP/Database
dc = Database_Connection(os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection"))

In [3]:
# Notebook-wide settings: language suffix, column names and model locations.
language = "_en"

# Columns of the design table and of the entity list
id_col, design_col = "id", "design_en"
add_columns = ["id", f"name{language}", f"alternativenames{language}", "class"]

# Columns of the frames used for relation extraction (RE)
id_col_RE, design_col_RE = "design_id", "design_en"

# Pretrained NER model (loaded only)
ner_model_directory = "../cnt/trained_model/ner/english_new/"
ner_model_name = "english_cno"

# RE model (saved after training, loaded for prediction)
re_model_directory = "../cnt/trained_model/re/"
re_model_name = "english_cno_new"


In [4]:
# Optional: repeats values from the configuration cell; skip if they are already defined.
language = "_en"
id_col, design_col = "id", "design_en"
add_columns = ["id"] + [prefix + language for prefix in ("name", "alternativenames")] + ["class"]

In [4]:
# Get the annotated designs (ground truth) from the database.
# Every row of nlp_relation_extraction_en_v2 yields one relation; the correlated
# subqueries look up the design text in nlp_training_designs and the English name /
# class of the subject, predicate and object in nlp_list_entities.
# NOTE(review): the original comment said "... under investigation" - unclear what is still open.
train = dc.create_own_query("""select design_id, 
(select design_en from nlp_training_designs as nlp where re.design_id=nlp.id) as design_en,
(select name_en from nlp_list_entities as ner where ner.id=re.subject) as s, 
(select class from nlp_list_entities as ner where ner.id=re.subject) as subject_class, 
(select name_en from nlp_list_entities as ner where ner.id=re.predicate) as p, 
(select name_en from nlp_list_entities as ner where ner.id=re.object) as o, 
(select class from nlp_list_entities as ner where ner.id=re.object) as object_class
from nlp_relation_extraction_en_v2 as re;""")

In [5]:
# Preview the first five ground-truth rows (one relation per row)
train.head(5)

Unnamed: 0,design_id,design_en,s,subject_class,p,o,object_class
0,9,Amphora with ribbed surface and crooked handle...,amphora,OBJECT,holding,poppy,PLANT
1,33,"Half-nude Aphrodite standing facing, head left...",Aphrodite,PERSON,holding,apple,PLANT
2,36,"Nude Aphrodite standing facing, head right, co...",Eros,PERSON,seated_on,dolphin,ANIMAL
3,85,"Nude Apollo standing facing, head left, left l...",Apollo,PERSON,holding,patera,OBJECT
4,85,"Nude Apollo standing facing, head left, left l...",serpent staff,OBJECT,coiling,omphalos,OBJECT


In [6]:
# Pack subject, predicate and object (with their classes) into one tuple per row and
# wrap it in a single-element list; the number of rows does not change.
train["y"] = [
    [(subj, subj_class, pred, obj, obj_class)]
    for subj, subj_class, pred, obj, obj_class in zip(
        train["s"], train["subject_class"], train["p"], train["o"], train["object_class"]
    )
]

In [7]:
# Show two rows with the Styler so that the new "y" column is not truncated
train.head(2).style

Unnamed: 0,design_id,design_en,s,subject_class,p,o,object_class,y
0,9,Amphora with ribbed surface and crooked handle...,amphora,OBJECT,holding,poppy,PLANT,"[(amphora, OBJECT, holding, poppy, PLANT)]"
1,33,"Half-nude Aphrodite standing facing, head left...",Aphrodite,PERSON,holding,apple,PLANT,"[(Aphrodite, PERSON, holding, apple, PLANT)]"


In [8]:
# Needed for the later merge: summing the single-element lists in "y" concatenates
# them, so each design_id ends up with one list holding all of its relations.
tmp = train.groupby("design_id").agg({"y": "sum"})

In [9]:
# Spot check: aggregated relation list of design 1706
tmp.loc[tmp.index==1706].style

Unnamed: 0_level_0,y
design_id,Unnamed: 1_level_1
1706,"[('Caracalla', 'PERSON', 'wearing', 'toga', 'OBJECT'), ('Caracalla', 'PERSON', 'holding', 'scroll', 'OBJECT'), ('Geta', 'PERSON', 'wearing', 'toga', 'OBJECT'), ('Geta', 'PERSON', 'holding', 'scroll', 'OBJECT')]"


In [10]:
# X starts out as another name for the ground-truth frame (no copy is made here)
X = train

In [11]:
# Attach the per-design relation lists to every row of that design.
# Both frames have a column "y"; with suffixes ('', 'y') the left one keeps its name
# and the aggregated one from tmp becomes "yy".
X = X.merge(tmp, left_on="design_id", right_on="design_id", suffixes=('', 'y'))

In [12]:
# Keep only id, design text and the aggregated relation list ("yy"), renamed back to "y"
X = X[["design_id", "design_en", "yy"]].rename(columns={"yy":"y"})

In [13]:
# Remove duplicate rows per design: after the merge every row of a design carries the
# complete relation list, so keeping the first one loses no information.
X = X.drop_duplicates("design_id",keep="first")

In [14]:
# Placeholder column for the preprocessed text (preprocessing changes the original
# design text, so the result is kept in a separate column).
X["design_en_changed"] = ""

In [15]:
# Sanity check: number of designs (rows) and columns
X.shape

(1297, 4)

In [16]:
# Load the entity list (columns given by add_columns) used to build the preprocessing rules
df_entities = dc.load_from_db("nlp_list_entities", add_columns)

In [17]:
# Add rules for preprocessing
preprocess = Preprocess()

# Manual rules: write these compounds as two words
preprocess.add_rule("horseman", "horse man")
preprocess.add_rule("horsemen", "horse men")

# Map every alternative name of a non-verb entity to the entity's standard name.
for index, entity in df_entities.iterrows():
    alternatives = entity["alternativenames_en"]
    # pd.isna covers None as well as NaN, which has no .split()
    if pd.isna(alternatives) or entity["class"] == "VERB":
        continue
    standard_name = entity["name_en"]
    for alt_name in alternatives.split(", "):
        # Skip blank entries (e.g. from an empty column value) so that no rule is
        # registered for an empty string.
        if alt_name.strip():
            preprocess.add_rule(alt_name, standard_name)

#### Roman numerals still seem to cause minor problems, so they are handled manually below

In [18]:
# Delete every rule that mentions a dotted Roman numeral (" I." to " V.").
# Iterate over a snapshot because entries are deleted while looping.
roman_markers = (" I.", " II.", " III.", " IV.", " V.")
for rule in list(preprocess.rules):
    if any(marker in rule for marker in roman_markers):
        del preprocess.rules[rule]

In [19]:
# Remove the dot after the Roman numerals I-V (e.g. " II." -> " II") in the design text.
# The replacements are chained on the column itself, so a design containing several
# different numerals gets all of them fixed. Previously every branch started again from
# the unmodified row text, so only the last matching replacement survived.
for numeral in ("I", "II", "III", "IV", "V"):
    # regex=False: the dot is a literal character, not a wildcard
    X["design_en"] = X["design_en"].str.replace(" " + numeral + ".", " " + numeral, regex=False)

## Apply Preprocessing

In [20]:
# Apply the defined rules to every design text.
# Only the first element of what preprocess_design returns is stored in the new column.
X["design_en_changed"] = X.swifter.apply(lambda row: preprocess.preprocess_design(row.design_en, row.design_id)[0], axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1297.0, style=ProgressStyle(descriptio…




In [21]:
# Delete question marks and round brackets from the preprocessed design text
def _drop_marks(row):
    """Return the row's preprocessed design without '?', '(' and ')'."""
    cleaned = row["design_en_changed"]
    for mark in ("?", "(", ")"):
        cleaned = cleaned.replace(mark, "")
    return cleaned

X["design_en_changed"] = X.swifter.apply(_drop_marks, axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1297.0, style=ProgressStyle(descriptio…




In [22]:
# From here on "design_en" is the preprocessed text; the untouched text and the
# untouched annotations are kept under "*_orig" names.
X = X.rename(
    columns={
        "design_en": "design_en_orig",
        "design_en_changed": "design_en",
        "y": "annotations_orig",
    }
)

In [23]:
# Map the ground truth (GT): the preprocessing rules are applied to the annotated
# relations as well, so that they refer to the preprocessed design text.
X["y"] = X.swifter.apply(lambda row: preprocess.preprocess_re(row["annotations_orig"], row.design_id), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1297.0, style=ProgressStyle(descriptio…




### Train the RE model

In [27]:
# Optional: repeats values from the configuration cell; skip if they are already defined.
id_col_RE = "design_id"
design_col_RE = "design_en"

In [24]:
# Classifier and feature pipeline of the relation extractor.
classifier = LogisticRegression(max_iter=1000)
# Alternative classifier that can be swapped in:
#classifier = RandomForestClassifier()
# Path2Str turns the extracted features into strings (pos=True), which are then
# vectorized as counts of 1- to 3-grams.
string_converter = Path2Str(pos=True) 
vectorizer = CountVectorizer(ngram_range=(1,3))
feature = make_pipeline(string_converter, vectorizer)

In [25]:
# 75/25 train/test split with a fixed seed; both halves keep the id column so that
# designs and relation lists can be matched up again.
X_train, X_test, y_train, y_test = train_test_split(X[[id_col_RE, design_col_RE]], X[[id_col_RE, "y"]], test_size=0.25, random_state=33)

In [26]:
# Preview: original vs. preprocessed design text and annotations
X.head()

Unnamed: 0,design_id,design_en_orig,annotations_orig,design_en,y
0,9,Amphora with ribbed surface and crooked handle...,"[(amphora, OBJECT, holding, poppy, PLANT)]",Amphora with ribbed surface and crooked handle...,"[(amphora, OBJECT, holding, poppy, PLANT)]"
1,33,"Half-nude Aphrodite standing facing, head left...","[(Aphrodite, PERSON, holding, apple, PLANT)]","Half-nude Aphrodite standing facing, head left...","[(Aphrodite, PERSON, holding, apple, PLANT)]"
2,36,"Nude Aphrodite standing facing, head right, co...","[(Eros, PERSON, seated_on, dolphin, ANIMAL)]","Nude Aphrodite standing facing, head right, co...","[(Eros, PERSON, seated_on, dolphin, ANIMAL)]"
3,85,"Nude Apollo standing facing, head left, left l...","[(Apollo, PERSON, holding, patera, OBJECT), (s...","Nude Apollo standing facing, head left, left l...","[(Apollo, PERSON, holding, patera, OBJECT), (s..."
5,104,"Artemis advancing right, wearing short flutter...","[(Artemis, PERSON, wearing, chiton, OBJECT), (...","Artemis advancing right, wearing short flutter...","[(Artemis, PERSON, wearing, chiton, OBJECT), (..."


#### load pretrained NER-Model

In [31]:
# Optional: repeats values from the configuration cell; skip if they are already defined.
ner_model_directory = "../cnt/trained_model/ner/english_new/"
ner_model_name = "english_cno"

#### define RE-Model path

In [32]:
# Optional: repeats values from the configuration cell; skip if they are already defined.
re_model_directory = "../cnt/trained_model/re/"
re_model_name = "english_cno_new"

In [28]:
# Training: NER tagging -> feature extraction -> relation classification.
inner_pipeline = make_pipeline(feature, classifier)

ner_step = NERTransformer(ner_model_directory, ner_model_name, id_col_RE, design_col_RE)
feature_step = FeatureExtractor(ner_model_directory, ner_model_name, id_col_RE, design_col_RE)
relation_step = RelationExtractor(inner_pipeline, re_model_directory, re_model_name, id_col_RE)

pipeline = make_pipeline(ner_step, feature_step, relation_step)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('nertransformer',
                 NERTransformer(design_col='design_en', id_col='design_id',
                                model_dir='../cnt/trained_model/ner/english_new/',
                                model_name='english_cno')),
                ('featureextractor',
                 FeatureExtractor(design_col='design_en', id_col='design_id',
                                  model_dir='../cnt/trained_model/ner/english_new/',
                                  model_name='english_cno')),
                ('relationextractor',
                 RelationExtractor(id_col='design_id',
                                   model_name='english_cno_new',
                                   output_dir='../cnt/trained_model/re/',
                                   pipeline=Pipeline(steps=[('pipeline',
                                                             Pipeline(steps=[('path2str',
                                                                              Path2Str(po

## Save and Load model

In [29]:
# Save the fitted pipeline under the configured RE model directory and name
save_pipeline(pipeline, re_model_directory, re_model_name)

In [30]:
# Load the pipeline again; all predictions below use the loaded model
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [31]:
# Predict the relations for the held-out designs
y_pred = model.predict(X_test)

In [32]:
# Helper object for scoring the predictions against the ground truth
metrics = Metrics()

In [33]:
# Score the predictions against the ground truth and derive F1 (harmonic mean).
precision, recall = metrics.score_precision_recall(y_test, y_pred)
# Guard the degenerate case precision == recall == 0, which would otherwise divide by zero.
denominator = precision + recall
F1 = (2*precision*recall) / denominator if denominator else 0.0

In [34]:
# Report the scores as percentages with two decimals
for label, score in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(label, round(score*100, 2))

Precision 84.54
Recall 61.29
F1 71.06


## Map back

In [35]:
# Relations (which are based on the preprocessed designs) are mapped back to the
# original designs in the next cell; here they are first attached to the test frame.
# NOTE(review): this copies the ground-truth relations (y_test), not the model
# output (y_pred) - confirm that this is intended.

X_test["y"] = y_test["y"]

In [37]:
# Map the relations back with preprocess.map_re, per design id.
# (The previous comment about deleting brackets and question marks belonged to another cell.)
X_test["y_mapped"] = X_test.swifter.apply(lambda row: preprocess.map_re(row["y"], row.design_id), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=325.0, style=ProgressStyle(description…




In [38]:
# Compare the relations before ("y") and after ("y_mapped") mapping back
X_test.head(5).style

Unnamed: 0,design_id,design_en,y,y_mapped
1899,1700,"Bare-headed bust of youthful Geta, right, wear...","[(Geta, PERSON, wearing, cuirass, OBJECT), (Ge...","[(Geta, PERSON, wearing, cuirass, OBJECT), (Ge..."
179,778,"Hygieia standing right, feeding serpent held i...","[(Hygieia, PERSON, feeding, serpent staff, OBJ...","[(Hygieia, PERSON, feeding, serpent staff, OBJ..."
364,1620,"Nude bearded Heracles advancing right, graspin...","[(Heracles, PERSON, grasping, Cretan Bull, ANI...","[(Heracles, PERSON, grasping, Cretan Bull, ANI..."
1000,514,"Radiate bust of Gallienus, right, seen from be...","[(Gallienus, PERSON, wearing, cuirass, OBJECT)...","[(Gallienus, PERSON, wearing, cuirass, OBJECT)..."
2255,27717,Ares holding paludamentum.,"[(Ares, PERSON, holding, paludamentum, OBJECT)]","[(Ares, PERSON, holding, paludamentum, OBJECT)]"


## Auto relations

In [40]:
# adding relations based on adjectives - experimental
obj_list = {
"veiled": ("wearing", "Veil", "before"),
"draped": ("wearing", "Clothing", "before"),
"helmeted": ("wearing", "Helmet", "before"),
"diademed": ("wearing", "Diadem", "before"),
"turreted": ("wearing", "Mural crown", "before"),
"enthroned": ("seated_on", "Throne", "after"),

}

In [41]:
# Add the design text to the predictions (aligned on the index) for the adjective rules
y_pred["design_en"] = X_test.design_en

In [42]:
# Derive relations from the adjectives in obj_list, restricted to PERSON entities.
# NOTE(review): this call passes id_col/design_col ("id"), whereas y_pred's id column
# is "design_id" and the single-design call further down uses id_col_RE/design_col_RE -
# confirm which pair relations_from_adjectives_df expects.
y_pred = relations_from_adjectives_df(y_pred, "design_en", "y", ner_model_directory, ner_model_name, id_col, design_col, obj_list, entities_to_consider=["PERSON"])

In [43]:
# Inspect the result of the adjective-based relation step
y_pred

Unnamed: 0,design_id,y,design_en
1899,1700,"[(Geta, PERSON, wearing, cuirass, OBJECT), (Ge...","Bare-headed bust of youthful Geta, right, wear..."
179,778,[],"Hygieia standing right, feeding serpent held i..."
364,1620,[],"Nude bearded Heracles advancing right, graspin..."
1000,514,"[(Gallienus, PERSON, wearing, cuirass, OBJECT)...","Radiate bust of Gallienus, right, seen from be..."
2255,27717,"[(Ares, PERSON, holding, paludamentum, OBJECT)]",Ares holding paludamentum.
...,...,...,...
579,2036,[],"Eagle flying right, wing open, holding a serpe..."
2099,27561,[],Seleucus I holding stone.
1977,1723,"[(Elagabalus, PERSON, wearing, cuirass, OBJECT...",Radiate emperor Elagabalus in military attire ...
2100,27562,[],Olybrius wearing boot.


In [45]:
# For testing a single design: combine rule-based and model-based relations.

design = "Diademed Athena to the left and helmeted Ares to the right, holding sword."
# Rule based: relations derived from the adjectives in obj_list (see above)
auto_relations = relations_from_adjectives_single(design,ner_model_directory, ner_model_name, id_col_RE, design_col_RE, obj_list)
# Relations predicted by the trained model
model_relations = predict_re_single_sentence(model, design, id_col_RE, design_col_RE)
# Merge both results into one list (displayed as the cell output)
concat_relations(auto_relations, model_relations)

[('Ares', 'PERSON', 'wearing', 'Helmet', 'OBJECT'),
 ('Athena', 'PERSON', 'wearing', 'Diadem', 'OBJECT'),
 ('Athena', 'PERSON', 'holding', 'sword', 'OBJECT'),
 ('Ares', 'PERSON', 'holding', 'sword', 'OBJECT')]

In [46]:
# Optional: only needed if the results are to be uploaded to the database.
upload = True
# The SQLAlchemy URL is taken from the CNT_DB_URL environment variable so that real
# credentials do not have to be written into the notebook; the placeholder is the fallback.
dc = Database_Connection(os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection"))
cnt_designs = dc.load_designs_from_db("data_designs", [id_col, design_col])
cnt_designs = cnt_designs.rename(columns={"id": "design_id"})

# Preprocessing: apply the rules, then delete question marks and round brackets.
cnt_designs["design_en_changed"] = cnt_designs.swifter.apply(lambda row: preprocess.preprocess_design(row.design_en, row.design_id)[0], axis=1)
cnt_designs["design_en_changed"] = cnt_designs.swifter.apply(lambda row: row["design_en_changed"].replace("?", "").replace("(", "").replace(")", ""), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=7596.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=7596.0, style=ProgressStyle(descriptio…




In [47]:
# Optional: only needed if the results are to be uploaded to the database.
# Keep the untouched text as "design_en_org" and expose the preprocessed text as "design_en".
cnt_designs = cnt_designs.rename(
    columns={
        "design_en": "design_en_org",
        "design_en_changed": "design_en",
    }
)

In [None]:
# Optional: only needed if the results are to be uploaded to the database.
# TODO: check by Sebastian on BDL-Server
upload = True
if upload:
    cnt_pred = model.predict(cnt_designs)

    # Flatten the predictions: one output row per (design, relation) pair
    output_rows = []
    for row_label, (designid, relation_list) in cnt_pred.iterrows():
        for relation in relation_list:
            output_rows.append((str(designid), *relation))

    output_columns = ["DesignID", "Person", "Label_Person", "Relation", "Object",
                      "Label_Object"]
    cnt_pipeline_output = pd.DataFrame(output_rows, columns=output_columns)

    # if_exists="replace": an existing table with this name is replaced
    cnt_pipeline_output.to_sql("cnt_pipeline_output_org", dc.mysql_connection, if_exists="replace", index=False)

In [None]:
# Preview the designs frame after preprocessing and renaming
cnt_designs.head()