Developed by Patricia Klinger; modified by Sebastian Gampe, Kerim Gencer, and Chrisowalandis Deligio.

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, RelationExtractor, save_pipeline, load_pipeline
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_ger)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

### Define the column names for the ID and design columns

In [2]:
# Name of the column holding the unique identifier of each design.
id_col: str = "id"
# Name of the column holding the design description text.
design_col: str = "design_de"

### Load the YAML file with annotated data

In [4]:
import yaml

# Annotated data: a YAML mapping whose keys are design sentences and whose
# values are the relations annotated for that sentence.
import_path = "../data/German_RE_data.yaml"
with open(import_path, encoding='utf8') as f:
    dictionary = yaml.safe_load(f)

# Pass every sentence key through replace_left_right; the annotated values
# are carried over unchanged.
d = {}
for raw_sentence, annotations in dictionary.items():
    d[replace_left_right(raw_sentence)] = annotations

In [5]:
# Count how often each relation name (second element of every annotation)
# occurs across all annotated sentences.
relation_counts = {}
labels = []  # not populated in this cell
for sentence, relations in d.items():
    for rel in relations:
        rel_name = rel[1]
        relation_counts[rel_name] = relation_counts.get(rel_name, 0) + 1

# Most frequent relation first; ties are broken alphabetically by name.
sorted(relation_counts.items(), key=lambda pair: (-pair[1], pair[0]))

[('halten', 1099),
 ('tragen', 786),
 ('stützen', 216),
 ('sitzen', 68),
 ('bekränzen', 29),
 ('stehen', 26),
 ('winden', 17),
 ('füttern', 11),
 ('ausgießen', 9),
 ('drücken', 5),
 ('hängen', 5),
 ('brechen', 4),
 ('schöpfen', 2),
 ('säugen', 2)]

In [6]:
# The database connection string is read from a text file so that the
# credentials are not stored in the notebook itself.
# Expected file content, e.g.: "mysql+mysqlconnector://user:password@IP/Database"
# NOTE(review): this absolute path is machine-specific; adjust it to your setup.
db_access_path = "/home/bigdatalab/Projects/D4N4/NLP_release_1.0/db_access.txt"
# A context manager guarantees the file handle is closed, even if reading fails
# (the handle was previously left open for the lifetime of the kernel).
with open(db_access_path, "r") as f:
    access = f.read()
dc = Database_Connection(access)

In [7]:
# Language suffix appended to the base column names of the entity tables.
language = "_ger"
add_columns = [f"{base_name}{language}" for base_name in ("name", "alternativenames")]

In [8]:
# Entity lists keyed by category, loaded from the database.
# NOTE(review): the trailing `",", True` arguments are passed through unchanged;
# their meaning is defined by Database_Connection.load_entities_from_db - confirm there.
alternative_names_col = add_columns[1]
entities = {
    # The person table uses its own column names instead of the language-suffixed ones.
    "PERSON": dc.load_entities_from_db(
        "nlp_list_person", ["name_german", "alternativenames"], ["alternativenames"], ",", True
    ),
}
# The remaining tables all use the column names listed in add_columns.
for entity_label, table_name in (
    ("OBJECT", "nlp_list_obj"),
    ("ANIMAL", "nlp_list_animal"),
    ("PLANT", "nlp_list_plant"),
):
    entities[entity_label] = dc.load_entities_from_db(
        table_name, add_columns, [alternative_names_col], ",", True
    )

In [9]:
# Turn the annotated sentences `d` and the entity lists `entities` into two
# sequences: the model inputs (X_list) and their relation labels (y_list).
X_list, y_list = labeling_ger(d, entities)

In [10]:
# One row per design: the design text and its list of annotated relations.
X = pd.DataFrame(
    {
        design_col: X_list,
        "y": y_list,
    }
)

In [11]:
# Sanity check: (number of designs, number of columns).
X.shape

(1048, 2)

In [12]:
# Use the row index as the design identifier column.
X = X.assign(**{id_col: X.index})

In [13]:
# Preview the first five rows of the labelled data.
X.head(5)

Unnamed: 0,Design,y,DesignID
0,"Halbnackte Aphrodite stehend von vorn, Kopf na...","[(Aphrodite, PERSON, halten, Apfel, PLANT)]",0
1,Brustbild des Apollon nach rechts mit Lorbeerk...,"[(Apollon, PERSON, tragen, Lorbeerkranz, OBJEC...",1
2,Kopf des Apollon mit Lorbeerkranz nach rechts;...,"[(Apollon, PERSON, tragen, Lorbeerkranz, OBJECT)]",2
3,"Nackter Apollon stehend von vorn, Kopf nach li...","[(Apollon, PERSON, halten, Patera, OBJECT)]",3
4,"Artemis im kurzen, wehenden Chiton und in Stie...","[(Artemis, PERSON, tragen, Chiton, OBJECT), (A...",4


### Train the RE model

In [14]:
# Feature extraction: Path2Str (with pos=True) produces a string per sample,
# which is then turned into counts of n-grams of length 1 to 3.
string_converter = Path2Str(pos=True)
vectorizer = CountVectorizer(ngram_range=(1, 3))
# Step names match the ones make_pipeline derives from the class names.
feature = Pipeline(
    steps=[
        ("path2str", string_converter),
        ("countvectorizer", vectorizer),
    ]
)

# Classifier applied on top of the extracted features.
classifier = LogisticRegression(max_iter=1000)

In [15]:
# Inputs and labels both keep the id column so rows can be matched up later.
design_frame = X[[id_col, design_col]]
label_frame = X[[id_col, "y"]]

# 75 % / 25 % split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    design_frame,
    label_frame,
    test_size=0.25,
    random_state=33,
)

#### Load the pretrained NER model

In [16]:
# Directory and name of the pretrained NER model; both are passed to the
# NERTransformer and FeatureExtractor pipeline steps.
ner_model_directory: str = "../cnt/trained_model/ner/german/"
ner_model_name: str = "german_cno"

#### Define the RE model path

In [17]:
# Directory and name under which the relation-extraction (RE) model is saved
# and from which it is loaded again.
re_model_directory: str = "../cnt/trained_model/re/"
re_model_name: str = "german_cno"

In [18]:
# Inner model: feature extraction followed by the classifier.
inner_pipeline = make_pipeline(feature, classifier)

# Outer pipeline steps, in execution order.
ner_step = NERTransformer(ner_model_directory, ner_model_name, id_col, design_col)
feature_step = FeatureExtractor(ner_model_directory, ner_model_name, id_col, design_col)
relation_step = RelationExtractor(inner_pipeline, re_model_directory, re_model_name, id_col)

pipeline = make_pipeline(ner_step, feature_step, relation_step)
# Fit on the training split; the fitted pipeline is displayed as the cell output.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('nertransformer',
                 NERTransformer(design_col='Design', id_col='DesignID',
                                model_dir='../cnt/trained_model/ner/german/',
                                model_name='german_cno')),
                ('featureextractor',
                 FeatureExtractor(design_col='Design', id_col='DesignID',
                                  model_dir='../cnt/trained_model/ner/german/',
                                  model_name='german_cno')),
                ('relationextractor',
                 RelationExtractor(id_col='DesignID', model_name='german_cno',
                                   output_dir='../cnt/trained_model/re/',
                                   pipeline=Pipeline(steps=[('pipeline',
                                                             Pipeline(steps=[('path2str',
                                                                              Path2Str(pos=True)),
                                                   

## Save and load the model

In [19]:
# Persist the fitted pipeline under the RE model directory and name.
save_pipeline(pipeline, re_model_directory, re_model_name)

In [20]:
# Reload the pipeline from the same location it was saved to; `model` is what
# the prediction step uses.
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [21]:
# Predict relations for the held-out test designs with the reloaded model.
y_pred = model.predict(X_test)

In [22]:
# Evaluation helper from cnt.evaluate, used to score the predictions.
metrics = Metrics()

In [23]:
# Score the predictions against the annotated test labels.
precision, recall = metrics.score_precision_recall(y_test, y_pred)

# F1 is the harmonic mean of precision and recall. When both are 0 the
# denominator is 0, so F1 is defined as 0.0 instead of dividing by zero.
precision_recall_sum = precision + recall
F1 = (2 * precision * recall) / precision_recall_sum if precision_recall_sum else 0.0

In [24]:
# Report each metric as a percentage rounded to two decimals.
for metric_name, metric_value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1", F1),
):
    print(metric_name, round(metric_value * 100, 2))

Precision 87.26
Recall 80.79
F1 83.9


## Prediction dataframe

In [25]:
# Attach the predicted relations to the test designs; both frames are joined
# on the shared id column.
pre_df = pd.merge(X_test, y_pred, on=id_col)

In [26]:
# Display the test designs together with their predicted relations.
pre_df

Unnamed: 0,DesignID,Design,y
0,149,"Preistisch, von vorn gesehen, darauf zwei Prei...",[]
1,853,"Demeter und Homonoia, sich die Rechte reichend...","[(Demeter, PERSON, halten, Fackel, OBJECT), (H..."
2,989,"Kopf der Tyche nach rechts, mit Mauerkrone und...","[(Tyche, PERSON, tragen, Mauerkrone, OBJECT), ..."
3,34,"Demeter nach links, in der Rechten Ähren und M...","[(Demeter, PERSON, halten, Mohnkopf, PLANT), (..."
4,964,"Amazone von Smyrna und Tyche von Perinthos, si...","[(Tyche, PERSON, halten, Steuerruder, OBJECT)]"
...,...,...,...
257,41,"Kopf des Dionysos mit Efeukranz nach links, im...","[(Dionysos, PERSON, tragen, Efeukranz, OBJECT)]"
258,666,Kopf der Demeter nach rechts mit Ährenkranz un...,"[(Demeter, PERSON, tragen, Ährenkranz, OBJECT)..."
259,505,Demeter mit Ährenkranz nach links auf Korb sit...,"[(Demeter, PERSON, tragen, Ährenkranz, OBJECT)..."
260,353,Artemis im kurzen Chiton und in Stiefeln nach ...,"[(Artemis, PERSON, tragen, Chiton, OBJECT), (A..."
