Developed by Patricia Klinger; modified by Sebastian Gampe, Kerim Gencer, and Chrisowalandis Deligio.

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import (DesignEstimator, RelationExtractor, save_pipeline, load_pipeline, predict_re_single_sentence, 
relations_from_adjectives_df, relations_from_adjectives_single, concat_relations)
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

### Define the column names for the ID and design columns

In [2]:
# Name of the column that holds the identifier of each design.
id_col = "design_id"
# Name of the column that holds the design description text.
design_col = "design_en"

In [3]:
# Template connection string; substitute real connection details for "YourConnection".
dc =  Database_Connection("mysql+mysqlconnector://root:YourConnection") # Format user:password@IP/Database

In [4]:
# NOTE(review): this URL embeds a user name, password and database name directly
# in source. Consider reading it from an environment variable or an untracked
# config file, and rotating the password if this file has been shared.
dc =  Database_Connection("mysql+mysqlconnector://root:bigdata3090@localhost/thrakien_d4n4") # Format user:password@IP/Database

## Old

### Load yaml file with annotated data

In [5]:
import yaml

# Annotated data: each key is a design sentence, each value its list of relations.
import_path = "../data/English_RE_data.yaml"
with open(import_path, encoding='utf8') as f:
    dictionary = yaml.safe_load(f)

# Pass every sentence key through replace_left_right; the annotations are kept as-is.
d = {}
for raw_sentence, annotated_relations in dictionary.items():
    d[replace_left_right(raw_sentence)] = annotated_relations

In [6]:
# Tally how often each relation name occurs across all annotated sentences.
relation_counts = {}
labels = []
for annotated_relations in d.values():
    for annotation in annotated_relations:
        # The second field of an annotation is the relation name.
        relation_label = annotation[1]
        relation_counts[relation_label] = relation_counts.get(relation_label, 0) + 1

# Most frequent first; ties are broken alphabetically by relation name.
sorted(relation_counts.items(), key=lambda item: (-item[1], item[0]))

[('holding', 1113),
 ('wearing', 781),
 ('resting_on', 238),
 ('seated_on', 88),
 ('grasping', 36),
 ('standing', 36),
 ('crowning', 14),
 ('feeding', 10),
 ('coiling', 7),
 ('breaking', 4),
 ('pushing', 3),
 ('flying_over', 2),
 ('receiving', 2),
 ('escorted_by', 1)]

## Old

In [7]:
# Suffix that selects the language-specific columns of the entity tables.
language = "_en"
# Columns requested from the entity tables: id, name and alternative names.
add_columns = ["id", f"name{language}", f"alternativenames{language}"]

In [8]:
# Entity lists keyed by entity label, loaded from the database.
# PERSON is queried with unsuffixed column names, unlike the other tables.
entities = {
    "PERSON": dc.load_entities_from_db("nlp_list_person", ["name", "alternativenames"], ["alternativenames"], ",", True),
}
# OBJECT, ANIMAL and PLANT are loaded with identical, language-suffixed arguments.
for entity_label, table_name in (
    ("OBJECT", "nlp_list_obj"),
    ("ANIMAL", "nlp_list_animal"),
    ("PLANT", "nlp_list_plant"),
):
    entities[entity_label] = dc.load_entities_from_db(table_name, add_columns, [add_columns[1]], ",", True)

In [9]:
# Derive two parallel lists from the annotated data and the entity lists:
# X_list (later stored under `design_col`) and y_list (later stored under "y").
X_list, y_list = labeling_eng(d, entities)

In [10]:
# One row per design: the text under `design_col`, the annotated relations under "y".
X = pd.DataFrame({design_col: X_list, "y" : y_list})

In [11]:
# Sanity check of the dataset size as (rows, columns).
X.shape

(1029, 2)

In [12]:
# Use the DataFrame index as the design identifier.
X[id_col] = X.index

## New

## Train the RE model

In [13]:
# Feature side: Path2Str (pos=True) produces strings, CountVectorizer counts
# their 2- and 3-grams.
string_converter = Path2Str(pos=True)
vectorizer = CountVectorizer(ngram_range=(2,3))
feature = make_pipeline(string_converter, vectorizer)

# Classifier side: logistic regression with the iteration cap raised to 1000.
classifier = LogisticRegression(max_iter=1000)

In [14]:
# Hold out 25% of the designs for evaluation. Both the feature frame and the
# target frame keep `id_col`; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X[[id_col, design_col]],
    X[[id_col, "y"]],
    test_size=0.25,
    random_state=33,
)

#### Load the pretrained NER model

In [15]:
# Directory and name of the pretrained NER model used by the pipeline steps.
ner_model_directory = "../cnt/trained_model/ner/english/"
ner_model_name = "english_cno"

#### Define the RE model path

In [16]:
# Directory and name under which the trained RE pipeline is saved and loaded.
re_model_directory = "../cnt/trained_model/re/"
re_model_name = "english_cno"

In [17]:
# Relation classifier: feature extraction followed by the estimator.
inner_pipeline = make_pipeline(feature, classifier)

# Full RE pipeline: NERTransformer -> FeatureExtractor -> RelationExtractor.
ner_step = NERTransformer(ner_model_directory, ner_model_name, id_col, design_col)
feature_step = FeatureExtractor(ner_model_directory, ner_model_name, id_col, design_col)
relation_step = RelationExtractor(inner_pipeline, re_model_directory, re_model_name, id_col)
pipeline = make_pipeline(ner_step, feature_step, relation_step)

# Fit on the training split; as the last expression this also displays the pipeline.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('nertransformer',
                 NERTransformer(design_col='design_en', id_col='design_id',
                                model_dir='../cnt/trained_model/ner/english/',
                                model_name='english_cno')),
                ('featureextractor',
                 FeatureExtractor(design_col='design_en', id_col='design_id',
                                  model_dir='../cnt/trained_model/ner/english/',
                                  model_name='english_cno')),
                ('relationextractor',
                 RelationExtractor(id_col='design_id', model_name='english_cno',
                                   output_dir='../cnt/trained_model/re/',
                                   pipeline=Pipeline(steps=[('pipeline',
                                                             Pipeline(steps=[('path2str',
                                                                              Path2Str(pos=True)),
                                     

## Save and Load model

In [18]:
# Persist the fitted pipeline under the RE model directory and name.
save_pipeline(pipeline, re_model_directory, re_model_name)

In [19]:
# Reload the pipeline from the same location it was saved to.
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [20]:
# Predict relations for the held-out designs with the reloaded pipeline.
y_pred = model.predict(X_test)

In [21]:
# Scoring helper from cnt.evaluate.
metrics = Metrics()

In [22]:
# Score the predictions against the gold relations of the test split.
precision, recall = metrics.score_precision_recall(y_test, y_pred)

# F1 is the harmonic mean of precision and recall. When both are zero the
# harmonic mean is undefined, so report 0.0 instead of dividing by zero.
pr_sum = precision + recall
F1 = (2*precision*recall) / pr_sum if pr_sum else 0.0

In [23]:
# Report each metric as a percentage rounded to two decimals.
for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(metric_name, round(metric_value*100, 2))

Precision 88.46
Recall 75.19
F1 81.29


## Add auto annotation

In [24]:
# Adjectives that imply a relation without an explicit object in the text.
# Each adjective maps to (relation, implied object, position).
# NOTE(review): the third field presumably states where the adjective stands
# relative to the entity it describes -- confirm against cnt.model.
obj_list = {
    "veiled": ("wearing", "Veil", "before"),
    "draped": ("wearing", "Clothing", "before"),
    "helmeted": ("wearing", "Helmet", "before"),
    "diademed": ("wearing", "Diadem", "before"),
    "turreted": ("wearing", "Mural crown", "before"),
    "enthroned": ("seated_on", "Throne", "after"),
}

In [25]:
# Attach the design text to the predictions (aligned on the shared index).
# Uses `design_col` rather than a hard-coded column name so this cell keeps
# working if the design column is renamed in the configuration cell.
y_pred[design_col] = X_test[design_col]

In [26]:
# Add the relations implied by the adjectives in obj_list to the predictions
# in column "y"; only PERSON entities are considered. The design column is
# passed via `design_col` instead of a hard-coded name, for consistency with
# the rest of the notebook.
y_pred = relations_from_adjectives_df(
    y_pred, design_col, "y",
    ner_model_directory, ner_model_name,
    id_col, design_col, obj_list,
    entities_to_consider=["PERSON"],
)

In [27]:
# Display the first five predictions as a styled table.
y_pred.head(5).style

Unnamed: 0,design_id,y,design_en
948,948,"[('Tyche', 'PERSON', 'wearing', 'kalathos', 'OBJECT'), ('Tyche', 'PERSON', 'holding', 'patera', 'OBJECT'), ('Tyche', 'PERSON', 'holding', 'cornucopia', 'OBJECT')]","Tyche standing facing, head left, wearing kalathos, holding in right hand patera over lighted altar and cornucopia in left arm. Ground line. Border of dots."
779,779,"[('Apollo', 'PERSON', 'holding', 'plectrum', 'OBJECT'), ('Apollo', 'PERSON', 'holding', 'lyre', 'OBJECT')]","Apollo standing left, holding plectrum and lyre."
241,241,"[('Cybele', 'PERSON', 'seated_on', 'lion', 'ANIMAL'), ('Cybele', 'PERSON', 'wearing', 'Mural crown', 'OBJECT')]","Turreted Cybele seated, head right, on lion jumping right, holding tympanum in right hand, left resting on long sceptre."
347,347,[],"Heracles kneeling right, wearing lion skin, drawing bow."
388,388,"[('Apollo', 'PERSON', 'holding', 'patera', 'OBJECT')]","Nude Apollo standing left, holding patera in outstretched right hand over altar and laurel branch in left hand."


## Prediction dataframe

In [28]:
# Join the test designs with their predictions on the design identifier.
# Both frames carry the design text; the left (X_test) copy keeps its name
# and the right copy receives the "_y" suffix.
pre_df = X_test.merge(y_pred, on=id_col, suffixes=("", "_y"))
# Keep only the design text and the predicted relations.
pre_df = pre_df[[design_col, "y"]]

In [33]:
# Display the first ten design/prediction pairs as a styled table.
pre_df.head(10).style

Unnamed: 0,design_en,y
0,"Tyche standing facing, head left, wearing kalathos, holding in right hand patera over lighted altar and cornucopia in left arm. Ground line. Border of dots.","[('Tyche', 'PERSON', 'wearing', 'kalathos', 'OBJECT'), ('Tyche', 'PERSON', 'holding', 'patera', 'OBJECT'), ('Tyche', 'PERSON', 'holding', 'cornucopia', 'OBJECT')]"
1,"Apollo standing left, holding plectrum and lyre.","[('Apollo', 'PERSON', 'holding', 'plectrum', 'OBJECT'), ('Apollo', 'PERSON', 'holding', 'lyre', 'OBJECT')]"
2,"Turreted Cybele seated, head right, on lion jumping right, holding tympanum in right hand, left resting on long sceptre.","[('Cybele', 'PERSON', 'seated_on', 'lion', 'ANIMAL'), ('Cybele', 'PERSON', 'wearing', 'Mural crown', 'OBJECT')]"
3,"Heracles kneeling right, wearing lion skin, drawing bow.",[]
4,"Nude Apollo standing left, holding patera in outstretched right hand over altar and laurel branch in left hand.","[('Apollo', 'PERSON', 'holding', 'patera', 'OBJECT')]"
5,"Nude Hermes seated left on rock, holding caduceus in right hand, left resting on rock.","[('Hermes', 'PERSON', 'holding', 'caduceus', 'OBJECT'), ('Hermes', 'PERSON', 'resting_on', 'rock', 'OBJECT')]"
6,"Artemis advancing right, wearing short fluttering chiton and boots, holding bow in left hand and drawing arrow from quiver at shoulder with right hand; at her feet, hound running right.","[('Artemis', 'PERSON', 'wearing', 'chiton', 'OBJECT'), ('Artemis', 'PERSON', 'wearing', 'boots', 'OBJECT'), ('Artemis', 'PERSON', 'holding', 'bow', 'OBJECT'), ('Artemis', 'PERSON', 'holding', 'arrow', 'OBJECT')]"
7,"Nude Eros in attitude of Thanatos standing right, left leg crossed over right, leaning with right hand and left elbow on inverted lit torch. Ground line. Border of dots.",[]
8,"Veiled Demeter seated left on basket, wearing corn wreath, holding two ears of corn in right hand, left resting on long torch.","[('Demeter', 'PERSON', 'seated_on', 'basket', 'OBJECT'), ('Demeter', 'PERSON', 'wearing', 'corn wreath', 'OBJECT'), ('Demeter', 'PERSON', 'resting_on', 'torch', 'OBJECT'), ('Demeter', 'PERSON', 'wearing', 'Veil', 'OBJECT')]"
9,"Laureate bust of Apollo, left, wearing chlamys; in front, laurel branch. Border of dots.","[('Apollo', 'PERSON', 'wearing', 'chlamys', 'OBJECT'), ('Apollo', 'PERSON', 'wearing', 'laurel branch', 'OBJECT')]"


In [32]:
# Display the gold relations of the test split for comparison with the predictions.
y_test

Unnamed: 0,design_id,y
948,948,"[(Tyche, PERSON, wearing, kalathos, OBJECT), (..."
779,779,"[(Apollo, PERSON, holding, plectrum, OBJECT), ..."
241,241,[]
347,347,"[(Heracles, PERSON, wearing, lion skin, OBJECT..."
388,388,"[(Apollo, PERSON, holding, patera, OBJECT), (A..."
...,...,...
781,781,"[(Hygieia, PERSON, wearing, chiton, OBJECT), (..."
37,37,"[(Demeter, PERSON, resting_on, torch, OBJECT)]"
437,437,"[(Caracalla, PERSON, wearing, cuirass, OBJECT)..."
271,271,"[(Lion, ANIMAL, breaking, spear, OBJECT)]"


## Upload to MySQL

In [30]:
# Safety switch: set to True to run the database upload cell.
upload = False

In [31]:
# Predict relations for all designs in the database and write them to a table.
if upload:
    dc = Database_Connection("mysql+mysqlconnector://YourConnection")
    cnt_designs = dc.load_designs_from_db("designs", ["DesignID", "DesignEng"])
    # The pipeline was fitted on frames whose columns are `id_col` and
    # `design_col`, so the database columns are renamed to that schema
    # before predicting.
    cnt_designs.rename(columns={"DesignID": id_col, "DesignEng": design_col}, inplace=True)
    cnt_pred = pipeline.predict(cnt_designs)

    # Flatten to one output row per predicted relation, prefixed by the design id.
    relation_rows = [
        (str(designid), *relation)
        for _, (designid, relation_list) in cnt_pred.iterrows()
        for relation in relation_list
    ]
    cnt_pipeline_output = pd.DataFrame(
        relation_rows,
        columns=["DesignID", "Person", "Label_Person", "Relation", "Object",
                 "Label_Object"],
    )
    # NOTE(review): the dotted name is passed as the table name; if "CNO" is
    # meant to be a schema, pandas expects it via the `schema` argument -- confirm.
    cnt_pipeline_output.to_sql("CNO.cnt_pipeline_output", dc.mysql_connection, if_exists="replace", index=False)