In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
import swifter

from cnt.model import (DesignEstimator, RelationExtractor, save_pipeline, load_pipeline, predict_re_single_sentence, 
relations_from_adjectives_df, relations_from_adjectives_single, concat_relations)
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from cnt.preprocess import Preprocess


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connection string format: mysql+mysqlconnector://user:password@IP/Database
# Prefer the CNT_DB_URL environment variable so real credentials never have to be
# saved inside the notebook; the placeholder is only used when it is not set.
db_url = os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://root:YourConnection")
dc = Database_Connection(db_url)

In [4]:
# Identifier column and English design-text column.
# Both names are re-assigned before training (id_col becomes "design_id").
id_col, design_col = "id", "design_en"

In [5]:
# Language suffix of the entity columns, and the columns requested from the entity table.
language = "_en"
add_columns = ["id"] + [f"{prefix}{language}" for prefix in ("name", "alternativenames")]

In [6]:
# Load the relation-extraction ground truth: one result row per row of
# nlp_relation_extraction_en. The correlated sub-selects resolve the design text
# (nlp_training_designs) and the name/class of subject, predicate and object
# (nlp_list_entities) from their ids.
train = dc.create_own_query("""select design_id, 
(select design_en from nlp_training_designs as nlp where re.design_id=nlp.id) as design_en,
(select name_en from nlp_list_entities as ner where ner.id=re.subject) as s, 
(select class from nlp_list_entities as ner where ner.id=re.subject) as subject_class, 
(select name_en from nlp_list_entities as ner where ner.id=re.predicate) as p, 
(select name_en from nlp_list_entities as ner where ner.id=re.object) as o, 
(select class from nlp_list_entities as ner where ner.id=re.object) as object_class
from nlp_relation_extraction_en as re;""")

In [7]:
# Preview the first rows (head() defaults to five).
train.head()

Unnamed: 0,design_id,design_en,s,subject_class,p,o,object_class
0,9,Amphora with ribbed surface and crooked handle...,amphora,OBJECT,holding,poppy,PLANT
1,33,"Half-nude Aphrodite standing facing, head left...",Aphrodite,PERSON,holding,apple,PLANT
2,36,"Nude Aphrodite standing facing, head right, co...",Eros,PERSON,seated_on,dolphin,ANIMAL
3,85,"Nude Apollo standing facing, head left, left l...",Apollo,PERSON,holding,patera,OBJECT
4,85,"Nude Apollo standing facing, head left, left l...",serpent staff,OBJECT,coiling,omphalos,OBJECT


In [8]:
# Wrap each relation in a one-element list of
# (subject, subject_class, predicate, object, object_class) tuples,
# so the lists can later be concatenated per design.
relation_columns = ["s", "subject_class", "p", "o", "object_class"]
train["y"] = [[relation] for relation in zip(*(train[column] for column in relation_columns))]

In [9]:
# Check the newly added "y" column on the first two rows.
train.head(2)

Unnamed: 0,design_id,design_en,s,subject_class,p,o,object_class,y
0,9,Amphora with ribbed surface and crooked handle...,amphora,OBJECT,holding,poppy,PLANT,"[(amphora, OBJECT, holding, poppy, PLANT)]"
1,33,"Half-nude Aphrodite standing facing, head left...",Aphrodite,PERSON,holding,apple,PLANT,"[(Aphrodite, PERSON, holding, apple, PLANT)]"


In [10]:
# Summing Python lists concatenates them: one combined relation list per design_id.
tmp = train.groupby("design_id").agg(y=("y", "sum"))

In [11]:
# Spot-check the aggregated relation list of design 1706.
tmp.loc[tmp.index==1706].style

Unnamed: 0_level_0,y
design_id,Unnamed: 1_level_1
1706,"[('Caracalla', 'PERSON', 'wearing', 'toga', 'OBJECT'), ('Caracalla', 'PERSON', 'holding', 'scroll', 'OBJECT'), ('Geta', 'PERSON', 'wearing', 'toga', 'OBJECT'), ('Geta', 'PERSON', 'holding', 'scroll', 'OBJECT')]"


In [12]:
# Keep a single row (the first one) per design.
X = train.drop_duplicates(subset=["design_id"], keep="first")

In [13]:
# Attach the aggregated relation lists; the clashing right-hand "y" column is
# suffixed and therefore arrives as "yy".
X = X.merge(tmp, on="design_id", suffixes=("", "y"))

In [14]:
# Keep id, text and the aggregated relations, and give the latter its plain name "y" again.
kept_columns = ["design_id", "design_en", "yy"]
X = X.loc[:, kept_columns].rename(columns={"yy": "y"})

In [15]:
# Placeholder column; it is filled with the preprocessed design text further down.
X = X.assign(design_en_changed="")

In [16]:
# (rows, columns) of the per-design frame.
X.shape

(998, 4)

In [17]:
# Load the entity list, restricted to the columns named in add_columns.
entity_table = "nlp_list_entities"
df_entities = dc.load_from_db(entity_table, add_columns)

In [18]:
# Add rules for preprocessing
preprocess = Preprocess()
# Hand-written rules (presumably rewriting the first string to the second - confirm in Preprocess).
preprocess.add_rule("horseman", "horse man")
preprocess.add_rule("horsemen", "horse men")

# Register every alternative entity name as a rule pointing at its standard name.
for standard_name, alt_names in zip(df_entities["name_en"], df_entities["alternativenames_en"]):
    # Missing values can arrive as None or as NaN; only real strings can be split.
    if not isinstance(alt_names, str):
        continue
    for alt_name in alt_names.split(", "):
        # An empty field would otherwise register a rule for the empty string.
        if alt_name:
            preprocess.add_rule(alt_name, standard_name)

#### Roman numerals still seem to cause minor problems, so they are handled manually once more

In [19]:
# Remove every rule that contains a dotted Roman numeral (" I." through " V.").
dotted_numerals = (" I.", " II.", " III.", " IV.", " V.")
for rule in list(preprocess.rules):  # iterate over a copy because entries are deleted
    if any(numeral in rule for numeral in dotted_numerals):
        del preprocess.rules[rule]

In [20]:
# Remove the trailing dot from the Roman numerals " I." through " V." in the design texts.
roman_numeral_fixes = {" I.": " I", " II.": " II", " III.": " III", " IV.": " IV", " V.": " V"}


def _strip_roman_numeral_dots(text):
    """Return ``text`` with every dotted Roman numeral (I-V) replaced by its undotted form.

    The replacements are chained on the running text. Replacing on the
    unmodified row value each time would let a later replacement overwrite an
    earlier one whenever a design contains more than one dotted numeral.
    """
    for dotted, plain in roman_numeral_fixes.items():
        text = text.replace(dotted, plain)
    return text


X["design_en"] = X["design_en"].map(_strip_roman_numeral_dots)

## Apply Preprocessing

In [21]:
# Apply defined rules
def _apply_rules(row):
    """Run preprocess_design on the row's text and id and keep the first element of its result."""
    return preprocess.preprocess_design(row.design_en, row.design_id)[0]


X["design_en_changed"] = X.swifter.apply(_apply_rules, axis=1)

Pandas Apply: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 998/998 [00:21<00:00, 46.72it/s]


In [22]:
# Delete round brackets and question marks from the preprocessed text.
removed_characters = str.maketrans("", "", "?()")
X["design_en_changed"] = X.swifter.apply(
    lambda row: row["design_en_changed"].translate(removed_characters), axis=1
)

Pandas Apply: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 998/998 [00:00<00:00, 179322.08it/s]


In [23]:
# From here on "design_en" is the preprocessed text; the raw text and raw
# annotations stay available under *_orig names.
column_renames = {
    "design_en": "design_en_orig",
    "design_en_changed": "design_en",
    "y": "annotations_orig",
}
X = X.rename(columns=column_renames)

In [24]:
# Map the ground truth: run the original annotations through preprocess_re.
def _preprocess_annotations(row):
    """Pass the row's original annotations and design id to preprocess.preprocess_re."""
    return preprocess.preprocess_re(row["annotations_orig"], row.design_id)


X["y"] = X.swifter.apply(_preprocess_annotations, axis=1)

Pandas Apply: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 998/998 [00:00<00:00, 101585.09it/s]


### Train the RE model

In [25]:
# Column names used by the training pipeline: design identifier and design text.
id_col, design_col = "design_id", "design_en"

In [26]:
# Feature side: path converted to a string (with POS), then 1- to 3-gram counts.
string_converter = Path2Str(pos=True)
vectorizer = CountVectorizer(ngram_range=(1, 3))
feature = make_pipeline(string_converter, vectorizer)

# Classifier side (alternative that was tried: RandomForestClassifier()).
classifier = LogisticRegression(max_iter=1000)

In [27]:
# 75/25 split with a fixed seed; the id column is kept on both the input and the target side.
split_inputs = X[[id_col, design_col]]
split_targets = X[[id_col, "y"]]
X_train, X_test, y_train, y_test = train_test_split(
    split_inputs, split_targets, test_size=0.25, random_state=33
)

#### load pretrained NER-Model

In [28]:
# Directory and name of the pretrained NER model.
ner_model_directory, ner_model_name = "../cnt/trained_model/ner/english_new/", "english_cno"

#### define RE-Model path

In [29]:
# Directory and name under which the RE model is stored.
re_model_directory, re_model_name = "../cnt/trained_model/re/", "english_cno"

In [30]:
# Inner pipeline: the feature pipeline followed by the classifier.
inner_pipeline = make_pipeline(feature, classifier)

# Both NER-based stages receive the same model location and column names.
ner_stage_args = (ner_model_directory, ner_model_name, id_col, design_col)
pipeline = make_pipeline(
    NERTransformer(*ner_stage_args),
    FeatureExtractor(*ner_stage_args),
    RelationExtractor(inner_pipeline, re_model_directory, re_model_name, id_col),
)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('nertransformer',
                 NERTransformer(design_col='design_en', id_col='design_id',
                                model_dir='../cnt/trained_model/ner/english/',
                                model_name='english_cno')),
                ('featureextractor',
                 FeatureExtractor(design_col='design_en', id_col='design_id',
                                  model_dir='../cnt/trained_model/ner/english/',
                                  model_name='english_cno')),
                ('relationextractor',
                 RelationExtractor(id_col='design_id', model_name='english_cno',
                                   output_dir='../cnt/trained_model/re/',
                                   pipeline=Pipeline(steps=[('pipeline',
                                                             Pipeline(steps=[('path2str',
                                                                              Path2Str(pos=True)),
                                     

## Save and Load model

In [31]:
# Hand the fitted pipeline to save_pipeline together with the RE model directory and name.
save_pipeline(pipeline, re_model_directory, re_model_name)

In [32]:
# Load the pipeline back from the same directory/name; prediction below uses this loaded object.
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [33]:
# Predict relations for the held-out designs with the loaded pipeline.
y_pred = model.predict(X_test)

In [34]:
# Scorer used to compare predicted relations with the ground truth.
metrics = Metrics()

In [35]:
precision, recall = metrics.score_precision_recall(y_test, y_pred)
# F1 is the harmonic mean of precision and recall. When both are 0 the formula
# divides by zero, so F1 is defined as 0 in that case.
precision_recall_sum = precision + recall
F1 = (2 * precision * recall) / precision_recall_sum if precision_recall_sum else 0.0

In [36]:
# Report each score as a percentage with two decimals.
for score_name, score in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(score_name, round(score * 100, 2))

Precision 85.43
Recall 59.69
F1 70.28


## Map back

In [37]:
# Put the ground-truth relations next to the test designs (aligned on the shared index).
X_test = X_test.assign(y=y_test["y"])

In [38]:
# Map the relations back with preprocess.map_re.
# NOTE(review): the earlier comment here ("Deleting brackets and questionmarks")
# was copied from the bracket-stripping cell and did not describe this step.
def _map_relations_back(row):
    """Pass the row's relations and design id to preprocess.map_re."""
    return preprocess.map_re(row["y"], row.design_id)


X_test["y_mapped"] = X_test.swifter.apply(_map_relations_back, axis=1)

Pandas Apply: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 89354.58it/s]


In [39]:
# Compare "y" and "y_mapped" on the first rows (head() defaults to five).
X_test.head()

Unnamed: 0,design_id,design_en,y,y_mapped
557,955,"Radiate bust of Philippus iunior, right, weari...","[(Philippus II, PERSON, wearing, cuirass, OBJE...","[(Philippus II, PERSON, wearing, cuirass, OBJE..."
207,1694,"At left, Demeter standing facing, head right, ...","[(Demeter, PERSON, wearing, corn wreath, OBJEC...","[(Demeter, PERSON, wearing, corn wreath, OBJEC..."
684,24771,"River-god and city goddess; to left, river-god...","[(river-god, PERSON, holding, reed, PLANT), (r...","[(river-god, PERSON, holding, reed, PLANT), (r..."
4,104,"Artemis advancing right, wearing short flutter...","[(Artemis, PERSON, wearing, chiton, OBJECT), (...","[(Artemis, PERSON, wearing, chiton, OBJECT), (..."
826,2277,Façade of a hexastyle temple on crepidoma with...,"[(Tyche, PERSON, wearing, kalathos, OBJECT), (...","[(Tyche, PERSON, wearing, kalathos, OBJECT), (..."


## Auto relations

In [50]:
# Adjective -> (predicate, object, position) used to derive relations from adjectives.
# NOTE(review): "before"/"after" presumably says where the adjective stands relative
# to the entity - confirm against relations_from_adjectives_df.
obj_list = {
    adjective: (predicate, derived_object, position)
    for adjective, predicate, derived_object, position in (
        ("veiled", "wearing", "Veil", "before"),
        ("draped", "wearing", "Clothing", "before"),
        ("helmeted", "wearing", "Helmet", "before"),
        ("diademed", "wearing", "Diadem", "before"),
        ("turreted", "wearing", "Mural crown", "before"),
        ("enthroned", "seated_on", "Throne", "after"),
    )
}

In [51]:
# The adjective rules need the design text, so copy it next to the predictions.
y_pred["design_en"] = X_test["design_en"]

In [52]:
# Run the adjective rules from obj_list over the predictions, restricted to PERSON entities.
# Note: y_pred is overwritten with the result, so re-running this cell works on
# already processed predictions.
y_pred = relations_from_adjectives_df(
    y_pred,
    "design_en",
    "y",
    ner_model_directory,
    ner_model_name,
    id_col,
    design_col,
    obj_list,
    entities_to_consider=["PERSON"],
)

In [58]:
# Display the predictions after the adjective rules were applied.
y_pred

Unnamed: 0,design_id,y,design_en
557,955,[],"Radiate bust of Philippus iunior, right, weari..."
207,1694,"[(Demeter, PERSON, wearing, wreath, OBJECT), (...","At left, Demeter standing facing, head right, ..."
684,24771,"[(city goddess, PERSON, holding, patera, OBJECT)]","River-god and city goddess; to left, river-god..."
4,104,"[(Artemis, PERSON, wearing, chiton, OBJECT), (...","Artemis advancing right, wearing short flutter..."
826,2277,[],Façade of a hexastyle temple on crepidoma with...
...,...,...,...
502,691,"[(Hermes, PERSON, wearing, petasus, OBJECT)]","Head of youthful Hermes, right, wearing a flat..."
501,690,"[(Hermes, PERSON, wearing, petasus, OBJECT)]","Head of youthful Hermes facing, slightly left,..."
260,1932,[],"Nude youth Dioscur standing facing, head right..."
41,383,"[(Dionysus, PERSON, wearing, ivy wreath, OBJECT)]","Head of youthful Dionysus, right, wearing ivy ..."


In [57]:
# Single-sentence example: combine rule-based and model-based relations for one design.
# NOTE(review): the sentence ends in "swo." which looks truncated (perhaps "sword") - confirm.
design = "Diademed Athena to the left and helmeted Ares to the right, holding swo."
# Relations derived from the adjectives in obj_list.
auto_relations = relations_from_adjectives_single(design,ner_model_directory, ner_model_name, id_col, design_col, obj_list)
# Relations predicted by the loaded RE pipeline.
model_relations = predict_re_single_sentence(model, design, id_col, design_col)
concat_relations(auto_relations, model_relations)

[('Athena', 'PERSON', 'wearing', 'Diadem', 'OBJECT')]