Developed by Patricia Klinger; modified by Sebastian Gampe, Kerim Gencer, and Chrisowalandis Deligio.

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, RelationExtractor, save_pipeline, load_pipeline
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

### Load the YAML file with annotated data

In [2]:
import yaml

# Annotated relation-extraction data: each key is a design sentence, each value its relations.
import_path = "../data/English_RE_data.yaml"
with open(import_path, encoding='utf8') as f:
    dictionary = yaml.safe_load(f)

# Pass every sentence key through replace_left_right; the values are kept exactly as loaded.
d = dict((replace_left_right(key), value) for key, value in dictionary.items())

In [3]:
relation_counts = {}
labels = []
# Tally how often each relation name (second element of a relation entry)
# occurs across all annotated sentences.
for relations in d.values():
    for rel in relations:
        rel_name = rel[1]
        relation_counts[rel_name] = relation_counts.get(rel_name, 0) + 1

# Most frequent relation first; ties are broken alphabetically by name.
sorted(relation_counts.items(), key=lambda item: (-item[1], item[0]))

[('holding', 1113),
 ('wearing', 781),
 ('resting_on', 238),
 ('seated_on', 88),
 ('grasping', 36),
 ('standing', 36),
 ('crowning', 14),
 ('feeding', 10),
 ('coiling', 7),
 ('breaking', 4),
 ('pushing', 3),
 ('flying_over', 2),
 ('receiving', 2),
 ('escorted_by', 1)]

In [4]:
# Connection URL format: mysql+mysqlconnector://user:password@IP/Database
# Prefer the CNT_DB_URL environment variable so that real credentials are never
# saved inside the notebook; the placeholder is only used when it is not set.
dc = Database_Connection(os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection"))

In [6]:
# Entity label -> database table that holds the known names for that label.
entity_tables = {
    "PERSON": "nlp_list_person",
    "OBJECT": "nlp_list_obj",
    "ANIMAL": "nlp_list_animal",
    "PLANT": "nlp_list_plant",
}

# Every table is read with the same arguments: the "name" and "alternativenames"
# columns, with "alternativenames" and "," passed as the remaining list/separator arguments.
entities = {
    label: dc.load_entities_from_db(table, ["name", "alternativenames"], ["alternativenames"], ",", True)
    for label, table in entity_tables.items()
}

In [7]:
# Turn the annotated sentences into two lists, designs (X_list) and their labels (y_list),
# using the entity lists loaded from the database.
X_list, y_list = labeling_eng(d, entities)

In [8]:
# One row per design: the design text and its list of relation tuples.
X = pd.DataFrame({"Design": X_list, "y" : y_list})

In [9]:
# Number of rows (designs) and columns in the labelled data set.
X.shape

(1029, 2)

In [10]:
# Expose the row index as an explicit DesignID column; later cells select and merge on it.
X = X.assign(DesignID=X.index)

In [11]:
# Preview the first five rows.
X.head(5)

Unnamed: 0,Design,y,DesignID
0,Amphora with ribbed surface and crooked handle...,"[(Amphora, OBJECT, holding, poppy, PLANT), (Am...",0
1,"Half-nude Aphrodite standing facing, head left...","[(Aphrodite, PERSON, holding, apple, PLANT)]",1
2,"Nude Aphrodite standing facing, head right, co...","[(Eros, PERSON, seated_on, dolphin, ANIMAL)]",2
3,"Nude Apollo standing facing, head left, left l...","[(Apollo, PERSON, holding, patera, OBJECT), (s...",3
4,"Artemis advancing right, wearing short flutter...","[(Artemis, PERSON, wearing, chiton, OBJECT), (...",4


### Train the RE model

In [12]:
# Relation classifier; the solver may run for up to 1000 iterations.
classifier = LogisticRegression(max_iter=1000)
# NOTE(review): Path2Str(pos=True) presumably renders the extracted path as a string including POS tags — confirm in cnt.vectorize.
string_converter = Path2Str(pos=True) 
# Count features over unigrams, bigrams and trigrams.
vectorizer = CountVectorizer(ngram_range=(1,3))
# Feature pipeline: string conversion followed by vectorisation.
feature = make_pipeline(string_converter, vectorizer)

In [13]:
# 75/25 train/test split with a fixed seed. DesignID is kept on both the input and
# the label side so predictions can be joined back to their designs.
design_inputs = X[["DesignID", "Design"]]
design_labels = X[["DesignID", "y"]]
X_train, X_test, y_train, y_test = train_test_split(
    design_inputs,
    design_labels,
    test_size=0.25,
    random_state=33,
)

#### Load the pretrained NER model

In [17]:
# Directory and name of the pretrained NER model handed to NERTransformer.
ner_model_directory = "../cnt/trained_model/ner/english/"
ner_model_name = "english_cno"

#### Define the RE model path

In [18]:
# Directory and name used when building, saving and loading the RE model.
re_model_directory = "../cnt/trained_model/re/"
re_model_name = "english_cno"

In [19]:
# Inner pipeline: feature extraction followed by the relation classifier.
inner_pipeline = make_pipeline(feature, classifier)

# Full pipeline: NER tagging, feature extraction, then relation extraction
# wrapping the inner pipeline.
pipeline_steps = [
    NERTransformer(ner_model_directory, ner_model_name),
    FeatureExtractor(),
    RelationExtractor(inner_pipeline, re_model_directory, re_model_name),
]
pipeline = make_pipeline(*pipeline_steps)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('nertransformer',
                 NERTransformer(model_dir='../cnt/trained_model/ner/english/',
                                model_name='english_cno')),
                ('featureextractor', FeatureExtractor()),
                ('relationextractor',
                 RelationExtractor(model_name='english_cno',
                                   output_dir='../cnt/trained_model/re/',
                                   pipeline=Pipeline(steps=[('pipeline',
                                                             Pipeline(steps=[('path2str',
                                                                              Path2Str(pos=True)),
                                                                             ('countvectorizer',
                                                                              CountVectorizer(ngram_range=(1,
                                                                                                           3)))])),
          

## Save and Load model

In [20]:
# Store the fitted pipeline via save_pipeline, using the RE model directory and name.
save_pipeline(pipeline, re_model_directory, re_model_name)

In [21]:
# Load it back with the same directory and name; the reloaded model is what gets evaluated.
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [22]:
# Predict relations for the held-out test designs.
y_pred = model.predict(X_test)

In [23]:
# Scoring helper from cnt.evaluate.
metrics = Metrics()

In [24]:
precision, recall = metrics.score_precision_recall(y_test, y_pred)
# F1 is the harmonic mean of precision and recall. When both are 0 the formula
# would divide by zero, so F1 is defined as 0 in that case.
F1 = (2*precision*recall) / (precision + recall) if (precision + recall) else 0.0

In [25]:
# Report each score as a percentage rounded to two decimals.
for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(metric_name, round(metric_value*100, 2))

Precision 89.31
Recall 83.74
F1 86.44


## Prediction dataframe

In [26]:
# Join each test design with its predicted relations on the shared DesignID key.
pre_df = pd.merge(
    X_test,
    y_pred,
    left_on="DesignID",
    right_on="DesignID",
)

In [27]:
# Display the test designs next to their predicted relations.
pre_df

Unnamed: 0,DesignID,Design,y
0,948,"Tyche standing facing, head left, wearing kala...","[(Tyche, PERSON, wearing, kalathos, OBJECT), (..."
1,779,"Apollo standing left, holding plectrum and lyre.","[(Apollo, PERSON, holding, plectrum, OBJECT), ..."
2,241,"Turreted Cybele seated, head right, on lion ju...","[(Cybele, PERSON, seated_on, lion, ANIMAL), (C..."
3,347,"Heracles kneeling right, wearing lion skin, dr...","[(Heracles, PERSON, wearing, lion skin, OBJECT)]"
4,388,"Nude Apollo standing left, holding patera in o...","[(Apollo, PERSON, holding, patera, OBJECT)]"
...,...,...,...
253,781,"Hygieia wearing double chiton standing facing,...","[(Hygieia, PERSON, feeding, serpent, ANIMAL)]"
254,37,"Demeter standing facing, head left, holding ea...","[(Demeter, PERSON, holding, ears of corn, PLAN..."
255,437,"Laureate bust of Caracalla, left, wearing cuir...","[(Caracalla, PERSON, wearing, cuirass, OBJECT)..."
256,271,"Lion crouching left, breaking spear in jaws; u...","[(Lion, ANIMAL, breaking, spear, OBJECT)]"


## Upload to MySQL

In [28]:
# Set to True to predict relations for the designs in the database and write them to MySQL.
upload = False

In [29]:
# Optional export, executed only when `upload` is truthy: predict relations for the
# designs stored in the database and write them back as one row per relation.
if upload:
    # Prefer the CNT_DB_URL environment variable so that real credentials are never
    # saved inside the notebook; the placeholder is only used when it is not set.
    dc = Database_Connection(os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://YourConnection"))
    cnt_designs = dc.load_designs_from_db("designs", ["DesignID", "DesignEng"])
    # Rename the English text column to "Design", the column name used for training (if english).
    cnt_designs = cnt_designs.rename(columns={"DesignEng": "Design"})
    cnt_pred = pipeline.predict(cnt_designs)
    # Flatten the per-design relation lists: every relation tuple becomes one row,
    # prefixed with its DesignID converted to a string.
    cnt_pipeline_output = pd.DataFrame(
        [(str(designid), *relation)
         for _, (designid, relation_list) in cnt_pred.iterrows()
         for relation in relation_list],
        columns=["DesignID", "Person", "Label_Person", "Relation", "Object",
                 "Label_Object"])
    # NOTE(review): pandas treats the first argument as the bare table name, so this
    # likely creates a table literally called "CNO.cnt_pipeline_output". If the CNO
    # schema is intended, use name="cnt_pipeline_output", schema="CNO" — confirm.
    cnt_pipeline_output.to_sql("CNO.cnt_pipeline_output", dc.mysql_connection, if_exists="replace", index=False)