developed by Patricia Klinger, modified by Sebastian Gampe - last modified by Chris Deligio and Kerim Gencer (2020)

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, RelationExtractor
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation)
from cnt.io import (load_entities_from_db, load_designs, 
                    load_ocre_designs, replace_left_right, load_designs_german)
from cnt.train_test import train_test_annotate
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import score_precision_recall, score_accuracy
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec, Doc2Vec)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
import spacy
import datetime
from itertools import product
from tqdm import tqdm_notebook
from src.functions import labeling, labeling_german
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")
sns.set_context({"figure.figsize": (15, 7.5)})
import warnings
warnings.filterwarnings('ignore')

### Load yaml file with annotated data

In [2]:
import yaml
my_path = "C:/Users/chris/Documents/Dev/Masterarbeitv3/"
import_path = my_path+ "/data/raw/final_version_german.yaml"
with open(import_path, encoding='utf8') as f:
    dictionary = yaml.load(f)
    d = {replace_left_right(key): value for key, value in dictionary.items()}

In [3]:
relation_counts = {}
for sentence, relations in d.items():
    for rel in relations:
        rel_name = rel[1]
        if rel_name not in relation_counts:
            relation_counts[rel_name] = 1
        else:
            relation_counts[rel_name] += 1
sorted(relation_counts.items(), key= lambda x: (-x[1], x[0]))

[('halten', 1099),
 ('tragen', 786),
 ('stützen', 216),
 ('sitzen', 68),
 ('bekränzen', 29),
 ('stehen', 26),
 ('winden', 17),
 ('füttern', 11),
 ('ausgießen', 9),
 ('drücken', 5),
 ('hängen', 5),
 ('brechen', 4),
 ('schöpfen', 2),
 ('säugen', 2)]

In [4]:
X_list, y_list = labeling_german(d)

In [5]:
X = pd.DataFrame({"Design": X_list, "y" : y_list})

In [6]:
X = X.rename(columns={'Design': 'DesignEng'})

In [7]:
X['DesignID'] = X.index

In [8]:
X.shape

(1048, 3)

### Train the RE model

In [9]:
classifier = SVC(kernel='linear')
string_converter = Path2Str(pos=True)
vectorizer = CountVectorizer(ngram_range=(2,3))
feature = make_pipeline(string_converter, vectorizer)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X[["DesignID", "DesignEng"]], X[["DesignID", "y"]], test_size=0.25, random_state=2)

#### load pretrained NER-Model

In [11]:
path = "C:/Users/chris/Documents/Dev/Masterarbeitv3/cnt/trained_model/cnt/german/"

In [12]:
inner_pipeline = make_pipeline(feature, classifier)
pipeline = make_pipeline(NERTransformer(path, 'Design'),
                         FeatureExtractor(),
                         RelationExtractor(inner_pipeline))
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [13]:
precision, recall = score_precision_recall(y_test, y_pred)

In [14]:
F1 = (2*precision*recall) / (precision + recall)

In [15]:
precision

0.8945147679324894

In [16]:
recall

0.846307385229541

In [17]:
F1

0.8697435897435898

### Analyse dependancyparser

In [18]:
#Single sentence Dep-Tree
from spacy import displacy

path = "C:/Users/chris/Documents/Dev/Masterarbeitv3/cnt/trained_model/cnt/german/"
nlp = spacy.load(path)

doc = nlp("Athena nach links thronend, in der vorgestreckten Rechten Patera haltend, aus der sie eine sich um einen Baum ringelnde Schlange vor ihr füttert, und die Linke am Thronsitz lehnend; hinter ihr frontaler Schild, darauf Eule nach links, Kopf von vorn; Thron mit Sphinx nach links und Löwenfüßen verziert")
displacy.serve(doc, style="dep", options={"distance": 90})


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
