In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
import swifter

from cnt.model import (DesignEstimator, RelationExtractor, save_pipeline, load_pipeline, predict_re_single_sentence, 
relations_from_adjectives_df, relations_from_adjectives_single, concat_relations)
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from cnt.preprocess import Preprocess


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
id_col = "id"
design_col = "design_en"

In [4]:
language = "_en"
add_columns = ["id", "name"+language, "alternativenames"+language]

In [5]:
train = dc.create_own_query("""select design_id, 
(select design_en from nlp_training_designs as nlp where re.design_id=nlp.id) as design_en,
(select name_en from nlp_list_entities as ner where ner.id=re.subject) as s, 
(select class from nlp_list_entities as ner where ner.id=re.subject) as subject_class, 
(select name_en from nlp_list_entities as ner where ner.id=re.predicate) as p, 
(select name_en from nlp_list_entities as ner where ner.id=re.object) as o, 
(select class from nlp_list_entities as ner where ner.id=re.object) as object_class
from nlp_relation_extraction_en as re;""")

In [6]:
train["y"] = train.apply(lambda row: [row.s, row.subject_class, row.p, row.o, row.object_class], axis=1)

In [7]:
tmp = train.groupby("design_id").agg({"y": "sum"})

In [8]:
X = train.drop_duplicates("design_id",keep="first")

In [9]:
X = X.merge(tmp, left_on="design_id", right_on="design_id", suffixes=('', '_y'))

In [10]:
X = X[["design_id", "design_en", "y"]]

In [11]:
X.head(5)

Unnamed: 0,design_id,design_en,y
0,9,Amphora with ribbed surface and crooked handle...,"[amphora, OBJECT, holding, poppy, PLANT]"
1,33,"Half-nude Aphrodite standing facing, head left...","[Aphrodite, PERSON, holding, apple, PLANT]"
2,36,"Nude Aphrodite standing facing, head right, co...","[Eros, PERSON, seated_on, dolphin, ANIMAL]"
3,85,"Nude Apollo standing facing, head left, left l...","[Apollo, PERSON, holding, patera, OBJECT]"
4,104,"Artemis advancing right, wearing short flutter...","[Artemis, PERSON, wearing, chiton, OBJECT]"


In [12]:
X["design_en_changed"] = ""

In [13]:
df_entities = dc.load_from_db("nlp_list_entities", add_columns)

In [14]:
# Add rules for preprocessing
preprocess = Preprocess()
preprocess.add_rule("horseman", "horse man")
preprocess.add_rule("horsemen", "horse men")

for index, row in df_entities.iterrows():
    if row["alternativenames_en"] is not None:
        standard_name = row["name_en"]
        alt_names = row["alternativenames_en"].split(", ")
        for alt_name in alt_names:
            preprocess.add_rule(alt_name, standard_name)

#### Bei römischen Zahlen scheint es noch kleine Probleme zu geben, daher nochmal extra manuell

In [15]:
for rule in list(preprocess.rules):
    if " I." in rule or " II." in rule or " III." in rule or " IV." in rule or " V." in rule:
        del preprocess.rules[rule]

In [16]:
for index, row in X.iterrows():
    if " I." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" I.", " I")
    if " II." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" II.", " II")
    if " III." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" III.", " III")
    if " IV." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" IV.", " IV")
    if " V." in row["design_en"]:
        designs.at[index, "design_en"] = row["design_en"].replace(" V.", " V")

In [19]:
# Apply defined rules
X["design_en_changed"] = X.swifter.apply(lambda row: preprocess.preprocess_design(row.design_en, row.design_id)[0], axis=1)

Pandas Apply: 100%|███████████████████████████| 849/849 [00:18<00:00, 45.71it/s]


In [20]:
# Deleting brackets and questionmarks
X["design_en_changed"] = X.swifter.apply(lambda row: row["design_en_changed"].replace("?", "").replace("(", "").replace(")", ""), axis=1)

Pandas Apply: 100%|███████████████████████| 849/849 [00:00<00:00, 182566.73it/s]


In [21]:
X.rename(columns={"design_en":"design_en_orig", "design_en_changed":"design_en", "annotations":"annotations_orig"}, inplace=True)

In [22]:
X.head(5)

Unnamed: 0,design_id,design_en_orig,y,design_en
0,9,Amphora with ribbed surface and crooked handle...,"[amphora, OBJECT, holding, poppy, PLANT]",Amphora with ribbed surface and crooked handle...
1,33,"Half-nude Aphrodite standing facing, head left...","[Aphrodite, PERSON, holding, apple, PLANT]","Half-nude Aphrodite standing facing, head left..."
2,36,"Nude Aphrodite standing facing, head right, co...","[Eros, PERSON, seated_on, dolphin, ANIMAL]","Nude Aphrodite standing facing, head right, ho..."
3,85,"Nude Apollo standing facing, head left, left l...","[Apollo, PERSON, holding, patera, OBJECT]","Nude Apollo standing facing, head left, left l..."
4,104,"Artemis advancing right, wearing short flutter...","[Artemis, PERSON, wearing, chiton, OBJECT]","Artemis advancing right, wearing short flutter..."
