Developed by Patricia Klinger; modified by Sebastian Gampe, Kerim Gencer, and Chrisowalandis Deligio.

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, RelationExtractor, save_pipeline, load_pipeline
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_ger)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

### Load the YAML file with annotated data

In [2]:
import yaml

# Parse the annotated relation-extraction data from the YAML file.
import_path = "../data/German_RE_data.yaml"
with open(import_path, encoding='utf8') as f:
    dictionary = yaml.safe_load(f)

# Re-key the parsed data: every key is passed through replace_left_right,
# values are kept unchanged. This only needs the parsed dictionary, so it
# runs after the file has been closed.
d = dict(
    (replace_left_right(key), value)
    for key, value in dictionary.items()
)

In [3]:
# Tally how often each relation name occurs across all annotated sentences.
relation_counts = {}
labels = []
for sentence, relations in d.items():
    for rel in relations:
        rel_name = rel[1]  # index 1 of an annotation is used as the relation name
        relation_counts[rel_name] = relation_counts.get(rel_name, 0) + 1

# Most frequent relation first; ties are ordered alphabetically by name.
sorted(relation_counts.items(), key=lambda item: (-item[1], item[0]))

[('halten', 1099),
 ('tragen', 786),
 ('stützen', 216),
 ('sitzen', 68),
 ('bekränzen', 29),
 ('stehen', 26),
 ('winden', 17),
 ('füttern', 11),
 ('ausgießen', 9),
 ('drücken', 5),
 ('hängen', 5),
 ('brechen', 4),
 ('schöpfen', 2),
 ('säugen', 2)]

In [4]:
# Connection string format: mysql+mysqlconnector://user:password@IP/Database
# Read it from the CNT_DB_URI environment variable so that real credentials are
# never written into the notebook; when the variable is unset, the original
# placeholder is used (and must be replaced before connecting).
dc = Database_Connection(os.environ.get("CNT_DB_URI", "mysql+mysqlconnector://root:YourConnection"))

In [5]:
# Entity label -> first argument handed to dc.load_entities_from_db
# (presumably the database table holding that entity list — verify against cnt.io).
entity_tables = {
    "PERSON": "nlp_list_person",
    "OBJECT": "nlp_list_obj_ger",
    "ANIMAL": "nlp_list_animal_ger",
    "PLANT": "nlp_list_plant_ger",
}

# Every entity list is loaded with the same remaining arguments; the lists are
# loaded in the order PERSON, OBJECT, ANIMAL, PLANT.
entities = {
    label: dc.load_entities_from_db(
        table, ["name", "alternativenames"], ["alternativenames"], ",", True
    )
    for label, table in entity_tables.items()
}

In [6]:
# labeling_ger (from cnt.annotate) takes the annotated data and the entity lists
# and returns two sequences, which become the "Design" and "y" columns of X.
X_list, y_list = labeling_ger(d, entities)

In [7]:
# One row per design: the description text in "Design", its labels in "y".
X = pd.DataFrame({"Design": X_list, "y" : y_list})

In [8]:
# (number of rows, number of columns) of the labelled data set.
X.shape

(1048, 2)

In [9]:
# Copy the row index into an explicit DesignID column; the train/test split
# selects this column and the prediction dataframe is merged on it.
X['DesignID'] = X.index

In [10]:
# Preview the first five rows.
X.head(5)

Unnamed: 0,Design,y,DesignID
0,"Halbnackte Aphrodite stehend von vorn, Kopf na...","[(Aphrodite, PERSON, halten, Apfel, PLANT)]",0
1,Brustbild des Apollon nach rechts mit Lorbeerk...,"[(Apollon, PERSON, tragen, Lorbeerkranz, OBJEC...",1
2,Kopf des Apollon mit Lorbeerkranz nach rechts;...,"[(Apollon, PERSON, tragen, Lorbeerkranz, OBJECT)]",2
3,"Nackter Apollon stehend von vorn, Kopf nach li...","[(Apollon, PERSON, halten, Patera, OBJECT)]",3
4,"Artemis im kurzen, wehenden Chiton und in Stie...","[(Artemis, PERSON, tragen, Chiton, OBJECT), (A...",4


### Train the RE model

In [11]:
# Relation classifier: support vector machine with a linear kernel.
classifier = SVC(kernel='linear')
# Path2Str comes from cnt.vectorize; pos=True presumably adds part-of-speech
# information to the generated strings — TODO confirm against cnt.vectorize.
string_converter = Path2Str(pos=True)
# Count word bigrams and trigrams of the strings produced above.
vectorizer = CountVectorizer(ngram_range=(2,3))
# Feature step: string conversion followed by n-gram counting.
feature = make_pipeline(string_converter, vectorizer)

In [12]:
# Hold out 25% of the designs for testing; random_state=2 fixes the shuffle so
# the split is reproducible. DesignID is kept on both the feature and the label
# frame so that rows can be matched again after prediction.
X_train, X_test, y_train, y_test = train_test_split(X[["DesignID", "Design"]], X[["DesignID", "y"]], test_size=0.25, random_state=2)

#### Load the pretrained NER model

In [13]:
# Directory and name of the pretrained NER model; both are passed to
# NERTransformer when the pipeline is built.
ner_model_directory = "../cnt/trained_model/ner/german/"
ner_model_name = "german_cno"

#### Define the RE model path

In [14]:
# Directory and name of the RE model; passed to RelationExtractor and used
# again by save_pipeline / load_pipeline.
re_model_directory = "../cnt/trained_model/re/"
re_model_name = "german_cno"

In [15]:
# Classification stage: the feature step followed by the SVM classifier.
inner_pipeline = make_pipeline(feature, classifier)
# Full chain: NERTransformer -> FeatureExtractor -> RelationExtractor, where the
# RelationExtractor wraps the inner pipeline and receives the RE model location.
pipeline = make_pipeline(NERTransformer(ner_model_directory, ner_model_name),
                         FeatureExtractor(),
                         RelationExtractor(inner_pipeline, re_model_directory, re_model_name))
# Fit on the training split; as the cell's last expression, the value returned
# by fit() is displayed.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('nertransformer',
                 NERTransformer(model_dir='../cnt/trained_model/ner/',
                                model_name='german_cno')),
                ('featureextractor', FeatureExtractor()),
                ('relationextractor',
                 RelationExtractor(model_name='german_cno',
                                   output_dir='../cnt/trained_model/re/',
                                   pipeline=Pipeline(steps=[('pipeline',
                                                             Pipeline(steps=[('path2str',
                                                                              Path2Str(pos=True)),
                                                                             ('countvectorizer',
                                                                              CountVectorizer(ngram_range=(2,
                                                                                                           3)))])),
                    

## Save and load the model

In [16]:
# Persist the fitted pipeline under the RE model directory and name.
save_pipeline(pipeline, re_model_directory, re_model_name)

In [17]:
# Load a pipeline back from the same directory and name it was saved under.
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [18]:
# Predict relations for the held-out designs.
# NOTE(review): this uses the in-memory `pipeline`; the reloaded `model` is not
# used anywhere in the visible notebook — confirm whether `model.predict` was
# intended here to exercise the save/load round trip.
y_pred = pipeline.predict(X_test)

In [19]:
# Metrics comes from cnt.evaluate; it provides score_precision_recall, used next.
metrics = Metrics()

In [20]:
# Precision and recall of the predictions against the test labels.
precision, recall = metrics.score_precision_recall(y_test, y_pred)

# F1 is the harmonic mean of precision and recall. If both are zero the formula
# would divide by zero, so F1 is reported as 0.0 in that case.
precision_plus_recall = precision + recall
F1 = (2 * precision * recall) / precision_plus_recall if precision_plus_recall else 0.0

In [21]:
# Report every score as a percentage rounded to two decimals, one per line.
for score_name, score in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(score_name, round(score * 100, 2))

Precision 86.79
Recall 82.63
F1 84.66


## Prediction dataframe

In [22]:
# Put each test design next to its prediction by joining on the shared
# DesignID key (default inner join).
pre_df = X_test.merge(y_pred, on="DesignID")

In [23]:
# Display the merged designs and predictions.
pre_df

Unnamed: 0,DesignID,Design,y
0,347,"Nackter Jüngling (Dioskur) stehend von vorn, K...",[]
1,614,Athena mit korinthischem Helm nach rechts schr...,"[(Athena, PERSON, tragen, Helm, OBJECT), (Athe..."
2,1042,"Nackter Herakles, stehend von vorn, Kopf nach ...","[(Herakles, PERSON, stützen, Keule, OBJECT), (..."
3,946,"Tyche mit Mauerkrone nach links thronend, auf ...","[(Tyche, PERSON, tragen, Mauerkrone, OBJECT)]"
4,704,Kaiser (Caracalla) in Kriegsbekleidung mit Pan...,"[(Caracalla, PERSON, tragen, Kriegsbekleidung,..."
...,...,...,...
257,986,Kaiser (Geta) in Kriegsbekleidung mit Panzer u...,"[(Geta, PERSON, sitzen, Pferd, ANIMAL), (Geta,..."
258,175,"Löwe nach links hockend, einen Speer mit dem K...",[]
259,1015,Athena nach links mit Schild in der Linken; vo...,"[(Athena, PERSON, halten, Schild, OBJECT)]"
260,374,"Athena (Nikephoros) nach links thronend, im Ch...","[(Athena, PERSON, tragen, Chiton, OBJECT), (At..."
