In [58]:
import sys
sys.path.append("../")
import pandas as pd
import random
import os
import numpy as np
from cnt.model import (DesignEstimator, RelationExtractor, save_pipeline, load_pipeline, predict_re_single_sentence, 
relations_from_adjectives_df, relations_from_adjectives_single, concat_relations)
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Connection URL format: mysql+mysqlconnector://user:password@IP/Database
# Real credentials should not be typed into the notebook: when the CNT_DB_URL
# environment variable is set it takes precedence over the placeholder below.
dc = Database_Connection(os.environ.get("CNT_DB_URL", "mysql+mysqlconnector://..."))

In [60]:
# Column names shared by the cells below: the row identifier and the English
# design text.
id_col, design_col = "id", "design_en"

In [61]:
import yaml

# Annotated relation-extraction data: a mapping from sentence to its relations.
import_path = "../data/English_RE_data.yaml"
with open(import_path, encoding='utf8') as f:
    dictionary = yaml.safe_load(f)

# The file is no longer needed once parsed, so the keys are rewritten outside
# the `with` block. Every key goes through replace_left_right (imported from
# cnt.io); the values are carried over unchanged.
d = dict(
    (replace_left_right(key), value)
    for key, value in dictionary.items()
)

In [62]:
# Tally how often each relation name occurs across all annotated sentences.
relation_counts = {}
labels = []
for relations in d.values():
    for rel in relations:
        # The relation name is the second element of each annotation.
        rel_name = rel[1]
        relation_counts[rel_name] = relation_counts.get(rel_name, 0) + 1

# Most frequent first; ties are broken alphabetically by relation name.
sorted(relation_counts.items(), key=lambda item: (-item[1], item[0]))

[('holding', 1113),
 ('wearing', 781),
 ('resting_on', 238),
 ('seated_on', 88),
 ('grasping', 36),
 ('standing', 36),
 ('crowning', 14),
 ('feeding', 10),
 ('coiling', 7),
 ('breaking', 4),
 ('pushing', 3),
 ('flying_over', 2),
 ('receiving', 2),
 ('escorted_by', 1)]

In [63]:
# Suffix that selects the English variants of the entity-list columns.
language = "_en"
add_columns = ["id", f"name{language}", f"alternativenames{language}"]

In [64]:
# Entity lists per label, loaded from the database.
# The person table is read through its un-suffixed columns; the other tables
# use the language-specific columns collected in add_columns.
# NOTE(review): PERSON passes its alternative-names column as the third
# argument, while the other tables pass add_columns[1], which is the *name*
# column ("name" + language) - confirm that this difference is intended.
entities = {
    "PERSON": dc.load_entities_from_db(
        "nlp_list_person", ["name", "alternativenames"], ["alternativenames"], ",", True
    ),
}
entities.update({
    label: dc.load_entities_from_db(table, add_columns, [add_columns[1]], ",", True)
    for label, table in (
        ("OBJECT", "nlp_list_obj"),
        ("ANIMAL", "nlp_list_animal"),
        ("PLANT", "nlp_list_plant"),
    )
})

In [65]:
# Label the annotated sentences in `d` against the loaded entity lists.
# The two results are used below as the design-text column and the "y" column
# of the frame X.
X_list, y_list = labeling_eng(d, entities)

In [66]:
# One row per labelled design: the design text plus its list of relations.
X = pd.DataFrame(
    {
        design_col: X_list,
        "y": y_list,
    }
)

In [67]:
# Size of the labelled frame before any database ids are attached.
X.shape

(1029, 2)

In [68]:
# Copy the frame's index into an explicit column named by id_col.
X[id_col] = X.index

In [69]:
def get_id(design):
    """Look up the database id of a training design by its exact English text.

    Parameters
    ----------
    design : str
        Design description compared against ``nlp_training_designs.design_en``.

    Returns
    -------
    The ``id`` value of the single matching row, or the string ``"Null"`` when
    the lookup fails (for example no matching row, several matching rows, or
    an error raised by the query).
    """
    try:
        # The text is embedded in the statement as a string literal, so
        # backslashes and single quotes have to be escaped. Without this, a
        # design containing an apostrophe produces invalid SQL and is silently
        # reported as "Null".
        literal = design.replace("\\", "\\\\").replace("'", "''")
        query = "select id from nlp_training_designs where design_en='" + literal + "';"
        # .item() only succeeds when exactly one id came back.
        return dc.create_own_query(query).id.item()
    except Exception:
        # Best-effort lookup: every ordinary failure maps to the "Null" marker.
        # KeyboardInterrupt is deliberately not caught, so a long row-by-row
        # run can still be stopped by the user.
        return "Null"

In [None]:
%%capture
# Attach the database id of every design; get_id issues one query per row and
# yields the string "Null" when the lookup fails. %%capture hides whatever the
# cell would otherwise print.
X["db_id"] = X.apply(lambda row: get_id(row.design_en), axis=1)

In [71]:
# Failed lookups carry the string "Null"; keep only designs with a database id.
X = X.loc[X["db_id"].ne("Null")]

In [72]:
# Size of the frame after dropping designs without a database id.
X.shape

(34, 4)

In [78]:
# Preview the first designs that were matched to a database id.
X.head(5)

Unnamed: 0,design_en,y,id,db_id
20,"Veiled and draped bust of Demeter, right, wear...","[(Demeter, PERSON, wearing, corn wreath, OBJECT)]",20,24667
84,"Heracles advancing left, holding transverse li...","[(Heracles, PERSON, holding, torch, OBJECT), (...",84,24668
103,"Nike standing right in biga, holding palm bran...","[(Nike, PERSON, holding, palm branch, OBJECT)]",103,24670
107,"Perseus and Andromeda; at left, Andromeda stan...","[(Perseus, PERSON, standing, Cetus, ANIMAL)]",107,24671
110,"Prow with naval ram in shape of animal's head,...","[(Marcus Aurelius, PERSON, holding, parazonium...",110,24672


In [79]:
# Load the complete entity table.
# NOTE: this rebinds `entities`, which earlier held the per-label dict passed
# to labeling_eng; from here on it is the query result instead.
entities = dc.create_own_query("select * from nlp_list_entities;")

In [80]:
# Show the id and the English name columns that the id lookup below relies on.
entities[["id", "name_en", "alternativenames_en"]]

Unnamed: 0,id,name_en,alternativenames_en
0,1,Abundantia,
1,2,Actaeon,
2,3,Aemilian,
3,4,Aeneas,
4,5,Aequitas,Equitas
...,...,...,...
844,845,turning,
845,846,sailing,
846,847,escorted_by,escorted by
847,848,wearing,


In [81]:
# Map every entity id to one lower-cased string "name, alternative names",
# which get_id searches to resolve entity strings.
ent_dict = {}
for index, row in entities[["id", "name_en", "alternativenames_en"]].iterrows():
    alt_names = row["alternativenames_en"]
    if pd.isna(alt_names):
        # A missing value (None/NaN) cannot be concatenated to a string.
        # Treat it like the literal "None" that is stripped again below, so
        # the entry ends up holding the name only.
        alt_names = "None"
    ent_dict[row["id"]] = (row["name_en"] + ", " + alt_names).replace(", None", "").lower()

In [82]:
# Empty frame for the resolved relations: each part of a relation is stored
# both as an id and as its original string.
result = pd.DataFrame(
    columns=[
        "design_id",
        "subject",
        "subject_str",
        "predicate",
        "predicate_str",
        "object",
        "object_str",
    ]
)

In [83]:
def get_id(ent):
    """Resolve an entity or relation string to its id in ``ent_dict``.

    NOTE: this rebinds the name ``get_id`` and thereby replaces the
    design-lookup helper of the same name defined in an earlier cell.
    Re-running the ``X["db_id"]`` cell after this one would call this
    function instead.

    Parameters
    ----------
    ent : str
        Name to look up; matching is case-insensitive.

    Returns
    -------
    The key of the matching ``ent_dict`` entry, or ``ent`` itself (unchanged)
    when nothing matches, so callers can tell resolved ids from unresolved
    strings.
    """
    needle = ent.lower()
    if not needle:
        # The empty string is a substring of everything and would otherwise
        # resolve to the first entity.
        return ent

    # First pass: the term must equal one of the comma-separated names. This
    # stops a short name resolving to an unrelated entity that merely
    # contains it.
    for ent_id, names in ent_dict.items():
        if needle in (name.strip() for name in names.split(",")):
            return ent_id

    # Fallback: plain substring search, so every term that resolved before
    # still resolves.
    for ent_id, names in ent_dict.items():
        if needle in names:
            return ent_id
    return ent

In [84]:
# Resolve every labelled relation to database ids and collect one record per
# relation. Each relation tuple holds the subject at position 0, the predicate
# at position 2 and the object at position 3.
records = []
for index, row in X.iterrows():
    for y in row.y:
        subject_str, predicate_str, obj_str = y[0], y[2], y[3]
        subject = get_id(subject_str)
        predicate = get_id(predicate_str)
        obj = get_id(obj_str)
        # get_id hands back the input string when it cannot resolve a name,
        # so only all-integer triples are complete. NumPy integers count too.
        resolved = all(isinstance(value, (int, np.integer)) for value in (subject, predicate, obj))
        if resolved:
            records.append({"design_id": row.db_id,
                            "subject": subject,
                            "subject_str": subject_str,
                            "predicate": predicate,
                            "predicate_str": predicate_str,
                            "object": obj,
                            "object_str": obj_str})
        else:
            # Show the unresolved triple so the missing entity can be added.
            print(subject, predicate, obj)

# Build the frame once instead of growing it row by row through the private
# DataFrame._append, which copies the whole frame on every call.
result = pd.concat([result, pd.DataFrame(records, columns=result.columns)], ignore_index=True)

In [85]:
# Preview the resolved relations before they are written to the database.
result.head(5)

Unnamed: 0,design_id,subject,subject_str,predicate,predicate_str,object,object_str
0,24667,103,Demeter,848,wearing,483,corn wreath
1,24668,165,Heracles,826,holding,650,torch
2,24668,165,Heracles,826,holding,476,club
3,24668,165,Heracles,826,holding,544,lion skin
4,24668,625,statue,826,holding,498,staff


In [86]:
# Write the resolved relations to the database. if_exists="append" adds the
# rows to the existing table, so running this cell again appends them again.
result.to_sql(
    "nlp_relation_extraction_en_v2",
    dc.mysql_connection,
    if_exists="append",
    index=False,
)

64