In [2]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
import swifter

from cnt.model import (DesignEstimator, RelationExtractor, save_pipeline, load_pipeline, predict_re_single_sentence, 
relations_from_adjectives_df, relations_from_adjectives_single, concat_relations)
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from cnt.preprocess import Preprocess


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

In [3]:
dc =  Database_Connection("mysql+mysqlconnector://root:YourConnection") # Format user:password@IP/Database

In [4]:
id_col = "id"
design_col = "design_en"

In [5]:
language = "_en"
add_columns = ["id", "name"+language, "alternativenames"+language]

In [6]:
train = dc.create_own_query("""select design_id, 
(select design_en from nlp_training_designs as nlp where re.design_id=nlp.id) as design_en,
(select name_en from nlp_list_entities as ner where ner.id=re.subject) as s, 
(select class from nlp_list_entities as ner where ner.id=re.subject) as subject_class, 
(select name_en from nlp_list_entities as ner where ner.id=re.predicate) as p, 
(select name_en from nlp_list_entities as ner where ner.id=re.object) as o, 
(select class from nlp_list_entities as ner where ner.id=re.object) as object_class
from nlp_relation_extraction_en as re;""")

SQL query failed.


In [7]:
train

In [None]:
train["y"] = train.apply(lambda row: [(row.s, row.subject_class, row.p, row.o, row.object_class)], axis=1)

In [None]:
train.head(2)

In [None]:
tmp = train.groupby("design_id").agg({"y": "sum"})

In [None]:
tmp.loc[tmp.index==1706].style

In [None]:
X = train.drop_duplicates("design_id",keep="first")

In [None]:
X = X.merge(tmp, left_on="design_id", right_on="design_id", suffixes=('', 'y'))

In [None]:
X = X[["design_id", "design_en", "yy"]].rename(columns={"yy":"y"})

In [None]:
X["design_en_changed"] = ""

In [None]:
X.shape

In [16]:
df_entities = dc.load_from_db("nlp_list_entities", add_columns)

In [17]:
# Add rules for preprocessing
preprocess = Preprocess()
preprocess.add_rule("horseman", "horse man")
preprocess.add_rule("horsemen", "horse men")

for index, row in df_entities.iterrows():
    if row["alternativenames_en"] is not None:
        standard_name = row["name_en"]
        alt_names = row["alternativenames_en"].split(", ")
        for alt_name in alt_names:
            preprocess.add_rule(alt_name, standard_name)

#### Bei römischen Zahlen scheint es noch kleine Probleme zu geben, daher nochmal extra manuell

In [18]:
for rule in list(preprocess.rules):
    if " I." in rule or " II." in rule or " III." in rule or " IV." in rule or " V." in rule:
        del preprocess.rules[rule]

In [19]:
for index, row in X.iterrows():
    if " I." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" I.", " I")
    if " II." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" II.", " II")
    if " III." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" III.", " III")
    if " IV." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" IV.", " IV")
    if " V." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" V.", " V")

## Apply Preprocessing

In [20]:
# Apply defined rules
X["design_en_changed"] = X.swifter.apply(lambda row: preprocess.preprocess_design(row.design_en, row.design_id)[0], axis=1)

Pandas Apply:   0%|          | 0/849 [00:00<?, ?it/s]

In [21]:
# Deleting brackets and questionmarks
X["design_en_changed"] = X.swifter.apply(lambda row: row["design_en_changed"].replace("?", "").replace("(", "").replace(")", ""), axis=1)

Pandas Apply:   0%|          | 0/849 [00:00<?, ?it/s]

In [22]:
X.rename(columns={"design_en":"design_en_orig", "design_en_changed":"design_en", "y":"annotations_orig"}, inplace=True)

In [23]:
# Mapping GT
X["y"] = X.swifter.apply(lambda row: preprocess.preprocess_re(row["annotations_orig"], row.design_id), axis=1)

Pandas Apply:   0%|          | 0/849 [00:00<?, ?it/s]

### Train the RE model

In [24]:
id_col = "design_id"
design_col = "design_en"

In [25]:
classifier = LogisticRegression(max_iter=1000)
string_converter = Path2Str(pos=True) 
vectorizer = CountVectorizer(ngram_range=(1,3))
feature = make_pipeline(string_converter, vectorizer)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X[[id_col, design_col]], X[[id_col, "y"]], test_size=0.25, random_state=33)

#### load pretrained NER-Model

In [27]:
ner_model_directory = "../cnt/trained_model/ner/english/"
ner_model_name = "english_cno"

#### define RE-Model path

In [28]:
re_model_directory = "../cnt/trained_model/re/"
re_model_name = "english_cno"

In [29]:
inner_pipeline = make_pipeline(feature, classifier)
pipeline = make_pipeline(NERTransformer(ner_model_directory, ner_model_name, id_col, design_col),
                         FeatureExtractor(ner_model_directory, ner_model_name, id_col, design_col),
                         RelationExtractor(inner_pipeline, re_model_directory, re_model_name, id_col))
pipeline.fit(X_train, y_train)

Pipeline(steps=[('nertransformer',
                 NERTransformer(design_col='design_en', id_col='design_id',
                                model_dir='../cnt/trained_model/ner/english/',
                                model_name='english_cno')),
                ('featureextractor',
                 FeatureExtractor(design_col='design_en', id_col='design_id',
                                  model_dir='../cnt/trained_model/ner/english/',
                                  model_name='english_cno')),
                ('relationextractor',
                 RelationExtractor(id_col='design_id', model_name='english_cno',
                                   output_dir='../cnt/trained_model/re/',
                                   pipeline=Pipeline(steps=[('pipeline',
                                                             Pipeline(steps=[('path2str',
                                                                              Path2Str(pos=True)),
                                     

## Save and Load model

In [31]:
save_pipeline(pipeline, re_model_directory, re_model_name)

In [32]:
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [33]:
y_pred = model.predict(X_test)

In [34]:
metrics = Metrics()

In [35]:
precision, recall = metrics.score_precision_recall(y_test, y_pred)
F1 = (2*precision*recall) / (precision + recall)

In [36]:
print("Precision", round(precision*100,2))
print("Recall", round(recall*100,2))
print("F1", round(F1*100,2))

Precision 84.98
Recall 64.61
F1 73.41


## Map back

In [41]:
X_test["y"] = y_test["y"]

In [42]:
# Deleting brackets and questionmarks
X_test["y_mapped"] = X_test.swifter.apply(lambda row: preprocess.map_re(row["y"], row.design_id), axis=1)

Pandas Apply:   0%|          | 0/213 [00:00<?, ?it/s]

In [43]:
X_test.head(5)

Unnamed: 0,design_id,design_en,y,y_mapped
634,6825,"Bearded Heracles standing right, lion skin ove...","[(Heracles, PERSON, resting_on, club, OBJECT),...","[(Heracles, PERSON, resting_on, club, OBJECT),..."
457,702,"Head of Hermes, right, wearing wing petasus; t...","[(Hermes, PERSON, wearing, petasus, OBJECT)]","[(Hermes, PERSON, wearing, petasus, OBJECT)]"
652,6898,"Nude Apollo standing right, holding bow in lef...","[(Apollo, PERSON, holding, bow, OBJECT)]","[(Apollo, PERSON, holding, bow, OBJECT)]"
359,245,"Wreath bust of Caracalla, right, wearing cuira...","[(Caracalla, PERSON, wearing, cuirass, OBJECT)...","[(Caracalla, PERSON, wearing, cuirass, OBJECT)..."
599,6658,"Dionysus standing facing, head left, holding c...","[(Dionysus, PERSON, holding, cantharus, OBJECT...","[(Dionysus, PERSON, holding, cantharus, OBJECT..."


## Auto relations

In [None]:
y_pred["design_en"] = X_test.design_en

In [None]:
y_pred = relations_from_adjectives_df(y_pred, "design_en", "y", ner_model_directory, ner_model_name, id_col, design_col, obj_list, entities_to_consider=["PERSON"])

In [None]:
y_pred.loc[y_pred.design_en.str.contains("Veiled")]

In [None]:
design = "Diademed Alexander the Great to the left and helmeted Athena to the right."
auto_relations = relations_from_adjectives_single(design,ner_model_directory, ner_model_name, id_col, design_col, obj_list)
model_relations = predict_re_single_sentence(model, design, id_col, design_col)
concat_relations(auto_relations, model_relations)