Developed by Patricia Klinger; modified by Sebastian Gampe, Kerim Gencer, and Chrisowalandis Deligio.

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, RelationExtractor, save_pipeline, load_pipeline, predict_re_single_sentence
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng, labeling_single_entity)
from cnt.extract_relation_single_verb import (path, NERTransformer, FeatureExtractor, RelationExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

### Define the column names for the id and design columns

In [2]:
# Names of the identifier column and the English design-text column.
# Both are reassigned further down once the DataFrame columns are renamed.
id_col, design_col = "id", "design_en"

### Load yaml file with annotated data

In [3]:
import yaml

# Annotated relation-extraction data, keyed by sentence.
import_path = "../data/English_RE_data_Subj-Verb.yaml"
with open(import_path, encoding='utf8') as f:
    dictionary = yaml.safe_load(f)

# Re-key the annotations: every sentence is passed through replace_left_right,
# the relation lists are carried over unchanged.
d = {
    replace_left_right(sentence): relations
    for sentence, relations in dictionary.items()
}

In [4]:
from collections import Counter

# Count how often each relation label occurs across all annotated sentences.
# Every annotation `rel` carries its relation label at index 1.
labels = []  # not used in the visible cells; kept so the name stays defined
relation_counts = Counter(
    rel[1] for relations in d.values() for rel in relations
)

# Most frequent labels first; ties are broken alphabetically by label.
sorted(relation_counts.items(), key=lambda x: (-x[1], x[0]))

[('holding', 679),
 ('standing', 564),
 ('wearing', 369),
 ('resting', 189),
 ('seated', 93),
 ('advancing', 71),
 ('enthroned', 68),
 ('curling', 65),
 ('riding', 45),
 ('reclining', 34),
 ('lying', 33),
 ('leaping', 31),
 ('coiling', 26),
 ('prancing', 25),
 ('sailing', 16),
 ('swimming', 15),
 ('drawing', 13),
 ('leaning', 13),
 ('crowning', 12),
 ('raising', 12),
 ('feeding', 11),
 ('flying', 9),
 ('grasping', 8),
 ('strangling', 8),
 ('kneeling', 5),
 ('extending', 4),
 ('pushing', 4),
 ('sitting', 4),
 ('clasping', 3),
 ('crossing', 3),
 ('galloping', 3),
 ('receiving', 3),
 ('running', 3),
 ('setting', 3),
 ('carrying', 2),
 ('creeps', 2),
 ('jumping', 2),
 ('containing', 1),
 ('covering', 1),
 ('ears', 1),
 ('forming', 1),
 ('galopping', 1),
 ('leaned', 1),
 ('playing', 1),
 ('ploughing', 1),
 ('walking', 1)]

In [5]:
# The text file holds the database connection string, e.g.
# "mysql+mysqlconnector://root:YourConnection" -> Format user:password@IP/Database
# Read it inside a context manager so the file handle is closed right away
# instead of staying open for the lifetime of the kernel.
with open("/home/bigdatalab/Projects/D4N4/NLP_release_1.0/db_access.txt", "r") as f:
    access = f.read()
dc = Database_Connection(access)

In [6]:
# Suffix that selects the English variant of the entity-list columns.
language = "_en"
# Language-specific column names: the primary name and the alternative names.
add_columns = [column + language for column in ("name", "alternativenames")]

In [7]:
# Load one entity list per entity type from the database.
# Each row is (entity label, table, columns); the second column of every
# row is additionally passed on its own to load_entities_from_db.
# The person table is read through its language-independent column names,
# the other tables through the language-specific ones in add_columns.
# NOTE(review): "," is presumably the separator of the alternative names and
# True a flag of load_entities_from_db - confirm against cnt.io.
entities = {
    label: dc.load_entities_from_db(table, columns, [columns[1]], ",", True)
    for label, table, columns in (
        ("PERSON", "nlp_list_person", ["name", "alternativenames"]),
        ("OBJECT", "nlp_list_obj", add_columns),
        ("ANIMAL", "nlp_list_animal", add_columns),
        ("PLANT", "nlp_list_plant", add_columns),
    )
}

In [8]:
# Turn the annotated sentences in d, together with the entity lists, into the
# design texts (X_list) and their relation annotations (y_list).
# The labeling logic itself lives in cnt.annotate.labeling_single_entity.
X_list, y_list = labeling_single_entity(d, entities)

In [9]:
# Combine designs and annotations into one DataFrame and expose the design
# text under the fixed column name "Design". The rename key is design_col
# (the name the column was just created with), so the rename still applies
# if design_col is changed.
X = pd.DataFrame({design_col: X_list, "y" : y_list}).rename(columns={design_col: "Design"})

In [10]:
# Show the number of rows and columns of X.
X.shape

(987, 2)

In [11]:
# Use the DataFrame index as the design identifier.
X["DesignID"] = X.index

In [12]:
# Preview the first five rows.
X.head(5)

Unnamed: 0,Design,y,DesignID
0,Amphora with ribbed surface and crooked handle...,"[(Amphora, OBJECT, containing, holding)]",0
1,"Bust of youthful Anchialos, right, wearing tae...","[(Anchialos, PERSON, wearing, wearing)]",1
2,"Bare-headed bust of Antoninus Pius, right, wea...","[(Antoninus Pius, PERSON, wearing, wearing)]",2
3,"Laureate bust of Antoninus Pius, right, wearin...","[(Antoninus Pius, PERSON, wearing, wearing)]",3
4,"Laureate bust of Antoninus Pius, right, wearin...","[(Antoninus Pius, PERSON, wearing, wearing)]",4


### Train the RE model

In [13]:
# Feature side: Path2Str output is fed into a bag of word n-grams of
# length 1 to 3.
# NOTE(review): ent=True presumably keeps entity information in the path
# string - confirm against cnt.vectorize.Path2Str.
string_converter = Path2Str(ent=True)
vectorizer = CountVectorizer(ngram_range=(1,3))
feature = make_pipeline(string_converter, vectorizer)

# Classifier side: support vector classifier with scikit-learn's defaults.
classifier = SVC()

In [14]:
# Switch to the column names of the DataFrame X ("DesignID" / "Design").
id_col = "DesignID"
design_col = "Design"

In [15]:
# Hold out 25% of the designs for evaluation; the fixed random_state keeps the
# split reproducible. Features and targets both carry the id column.
X_train, X_test, y_train, y_test = train_test_split(X[[id_col, "Design"]], X[[id_col, "y"]], test_size=0.25, random_state=33)

#### Load the pretrained NER model

In [16]:
# Name and location of the pretrained English NER model.
ner_model_name = "english_cno"
ner_model_directory = "../cnt/trained_model/ner/english/"

#### Define the RE model path

In [20]:
# Directory and name used when constructing, saving and loading the RE model.
re_model_directory, re_model_name = "../cnt/trained_model/re/", "english_cno"

In [18]:
# Column names handed to the pipeline stages (same values as set before the
# train/test split).
design_col, id_col = "Design", "DesignID"

In [None]:
# Classification stage: the feature pipeline followed by the classifier.
inner_pipeline = make_pipeline(feature, classifier)
# Full pipeline: NER transformer -> feature extractor -> relation extractor.
# NOTE: RelationExtractor is imported from both cnt.model and
# cnt.extract_relation_single_verb; the later import
# (cnt.extract_relation_single_verb) is the one bound here.
pipeline = make_pipeline(NERTransformer(ner_model_directory, ner_model_name, id_col, design_col),
                         FeatureExtractor(ner_model_directory, ner_model_name, id_col, design_col),
                         RelationExtractor(inner_pipeline, re_model_directory, re_model_name, id_col))
# Train on the training split.
pipeline.fit(X_train, y_train)

## Save and Load model

In [21]:
# Save the fitted pipeline using the RE model directory and name.
save_pipeline(pipeline, re_model_directory, re_model_name)

In [21]:
# Load the pipeline back from the same directory and name.
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [23]:
# Predict relations for the held-out test designs with the loaded model.
y_pred = model.predict(X_test)

In [24]:
# Scoring helper from cnt.evaluate, used below for precision and recall.
metrics = Metrics()

In [25]:
# Precision and recall of the predictions on the test split.
precision, recall = metrics.score_precision_recall(y_test, y_pred)
# F1 is the harmonic mean of precision and recall. When both are zero the
# quotient is undefined, so report 0.0 instead of dividing by zero.
F1 = (2*precision*recall) / (precision + recall) if (precision + recall) else 0.0

In [26]:
# Report the scores as percentages rounded to two decimals, one per line.
print("\n".join(
    name + " " + str(round(score*100, 2))
    for name, score in (("Precision", precision), ("Recall", recall), ("F1", F1))
))

Precision 89.58
Recall 79.82
F1 84.42


## Upload to MySQL

In [27]:
# Set to True to write the predictions to the database in the upload cell below.
upload = False

In [28]:
# Load all designs from the database and rename the columns to the names the
# pipeline was trained with.
cnt_designs = dc.load_designs_from_db("data_designs", ["id", "design_en"])
cnt_designs.rename(columns={"design_en":"Design", "id": "DesignID"}, inplace=True) # if english
# NOTE(review): this predicts with the in-memory `pipeline`, not with the
# `model` loaded from disk - confirm that this is intended.
cnt_pred = pipeline.predict(cnt_designs, rdf=True)
# Flatten the predictions to one row per relation. Each prediction row is
# expected to hold a design id and a list of 3-element relations.
# itertuples yields plain tuples and avoids building a Series per row.
cnt_pipeline_output = pd.DataFrame(
    [(str(designid), *relation)
     for designid, relation_list in cnt_pred.itertuples(index=False, name=None)
     for relation in relation_list],
    columns=["DesignID", "Person", "Relation", "Object"])
if upload:
    # NOTE(review): the dotted name is passed as the table name; if "CNO" is
    # meant to be the schema, to_sql's schema= argument may be needed - confirm.
    cnt_pipeline_output.to_sql("CNO.cnt_pipeline_output",dc.mysql_connection,if_exists="replace", index=False)

In [None]:
# Preview up to the first 50 extracted relations.
cnt_pipeline_output.head(50)

Unnamed: 0,DesignID,Person,Relation,Object
0,10,Anchialos,wearing,rdf:nil
1,24,Antoninus Pius,wearing,rdf:nil
2,27,Antoninus Pius,wearing,rdf:nil
3,28,Antoninus Pius,wearing,rdf:nil
4,33,Aphrodite,standing,rdf:nil
5,33,Aphrodite,holding,rdf:nil
6,36,Aphrodite,standing,rdf:nil
7,38,Apollo,advancing,rdf:nil
8,38,Apollo,holding,rdf:nil
9,38,Apollo,holding,rdf:nil
