In [1]:
# imports
import pandas as pd
import spacy
from spacy.matcher import Matcher
import networkx as nx
from tqdm import tqdm
import os, re, csv
from googletrans import Translator # don't forget to run "!pip install googletrans==3.1.0a0" before using this
import spacy_transformers # might seem not being used but it is required to run the transformers
from unidecode import unidecode
from PyPDF2 import PdfReader
import time
import nltk
from nltk.corpus import stopwords
import jaro # requires !pip install jaro-winkler
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dnaen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# functions
def find_folder_with_type(given_path, doc_type): # returns all documents found in path
    """
    List the entries of a folder whose names end with a given suffix.

    The previous implementation built the regular expression ``.*\\<doc_type>$``,
    which only escaped the first character of ``doc_type`` (fine for ".txt", but
    "txt" turned into a tab escape followed by "xt"). A plain suffix comparison
    has no such pitfalls.

    :param given_path: folder to scan (only its direct entries are inspected)
    :param doc_type: file-name suffix to keep, e.g. ".txt"
    :return: sorted list of matching entry names (names only, not full paths)
    """
    # sorted so that the document order is the same on every run / platform
    return sorted(doc for doc in os.listdir(given_path) if doc.endswith(doc_type))

def folder_to_nlp_doc(given_path, list_of_doc_names, given_nlp):
    """
    Read every listed file of a folder and run it through an NLP callable.

    Line breaks are replaced by spaces and double spaces are collapsed once
    before the text is handed to ``given_nlp``.

    :param given_path: folder that contains the files
    :param list_of_doc_names: file names (relative to ``given_path``) to load
    :param given_nlp: callable applied to the cleaned text of each file
                      (in this notebook: a loaded spaCy pipeline)
    :return: list with one ``given_nlp`` result per file, in input order
    """
    processed_docs = []
    for file_name in list_of_doc_names:
        # os.path.join instead of a hard-coded "\\" so the loader also works outside Windows
        file_path = os.path.join(given_path, file_name)

        with open(file_path, "r", encoding='utf8') as my_file:
            all_text = my_file.read()

        all_text = all_text.replace("\n", " ")
        all_text = all_text.replace("  ", " ")

        processed_docs.append(given_nlp(all_text))
    return processed_docs

def folder_list_to_dic(given_path, given_list): # given file names extracts their texts and saves in a dic
    """
    Read the given files of a folder into a ``{file_name: text}`` dictionary.

    The files are opened through their joined path instead of temporarily
    changing the working directory, so the caller's working directory is
    never touched (previously it stayed switched if a file failed to open).

    :param given_path: folder that contains the files
    :param given_list: file names (relative to ``given_path``) to read
    :return: dict mapping each file name to its UTF-8 decoded content
    """
    dic = {}

    for file_name in given_list:
        print('importing', file_name, '...')
        with open(os.path.join(given_path, file_name), "r", encoding='utf8') as my_file:
            dic[file_name] = my_file.read()

    return dic

def get_entities_for_english(the_file):
    """
    Extract one (subject, object) pair from English text in an unsupervised way,
    relying only on the dependency labels produced by the global ``nlp`` pipeline.

    A single-word entity can be read directly from its dependency label. For
    entities spanning several words the function additionally remembers the
    most recent compound and modifier text and glues it in front of the
    subject/object token.

    Dependency label groups used:
    - punctuation: ["punct"]
    - compound: ["compound", "acomp", "ccomp", "pcomp"]
    - modifier: ["acl", "advcl", "advmod", "amod", "appos", "meta", "neg", "nmod", "npadvmod", "nummod", "poss", "prep", "quantmod", "relcl"]
    - subject: ["csubj", "csubjpass", "nsubj", "nsubjpass"]
    - object: ["dobj", "oprd", "pobj"]

    Every subject/object token overwrites the previously found one, so the
    returned pair holds the last subject and the last object of the text.

    :param the_file: text to analyse (the notebook passes one sentence at a time)
    :return: ``[subject, object]`` - stripped, lower-cased, double spaces collapsed once;
             an entry is "" when no matching token was found
    """
    # dependency label groups
    punct_labels = ("punct",)
    compound_labels = ("compound", "acomp", "ccomp", "pcomp")
    modifier_labels = ("acl", "advcl", "advmod", "amod", "appos", "meta", "neg", "nmod", "npadvmod", "nummod", "poss", "prep", "quantmod", "relcl")
    subject_labels = ("csubj", "csubjpass", "nsubj", "nsubjpass")
    object_labels = ("dobj", "oprd", "pobj")

    subject_text = ""
    object_text = ""

    # dependency label and text of the previous non-punctuation token
    last_dep = ""
    last_text = ""

    # text collected for the upcoming subject/object (may span two tokens)
    compound_part = ""
    modifier_part = ""

    for token in nlp(the_file):
        label = token.dep_

        # punctuation is skipped entirely: it neither contributes text nor
        # becomes the "previous token"
        if label in punct_labels:
            continue

        after_compound = last_dep in compound_labels

        if label in compound_labels:
            # chain with the previous token when that one was a compound as well
            compound_part = (last_text + " " + token.text) if after_compound else token.text

        # a modifier gives information about another word (e.g., "blue" in "blue house")
        if label in modifier_labels:
            modifier_part = (last_text + " " + token.text) if after_compound else token.text

        # first entity - subject
        if label in subject_labels:
            subject_text = modifier_part + " " + compound_part + " " + token.text
            compound_part = ""
            modifier_part = ""

        # second entity - object
        if label in object_labels:
            object_text = modifier_part + " " + compound_part + " " + token.text
            compound_part = ""
            modifier_part = ""

        # the previous-token state always ends up as the current token
        last_dep = label
        last_text = token.text

    return [
        subject_text.strip().lower().replace("  ", " "),
        object_text.strip().lower().replace("  ", " "),
    ]

def get_entities_for_french(the_file):
    """
    Extract one (subject, object) pair from French text using the dependency
    labels produced by the global ``nlp`` pipeline.

    Dependency label groups used:
    - punctuation = ["punct"]
    - compound: ["ccomp", "expl:comp", "xcomp", "fixed", "flat:foreign", "flat:name"]
    - modifier = ["acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "nmod", "nummod", "obl:mod"]
    - subject = ["expl:subj", "nsubj", "nsubj:pass"]
    - object = ["iobj", "obj"]

    Every subject/object token overwrites the previously found one, so the
    returned pair holds the last subject and the last object of the text.

    The entity is assembled as "<modifier> <compound> <token>"; when the
    modifier or compound slot is empty this leaves doubled spaces. Like the
    English extractor, the result is therefore whitespace-normalised before
    it is returned.

    :param the_file: text to analyse (the notebook passes one sentence at a time)
    :return: ``[subject, object]`` - lower-cased, single-spaced, "" when nothing was found
    """
    # dependency label groups
    punctuation_parsers = {"punct"}
    compound_parsers = {"ccomp", "expl:comp", "xcomp", "fixed", "flat:foreign", "flat:name"}
    modifier_parsers = {"acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "nmod", "nummod", "obl:mod"}
    subject_parsers = {"expl:subj", "nsubj", "nsubj:pass"}
    object_parsers = {"iobj", "obj"}

    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
    prv_tok_text = ""   # text of the previous non-punctuation token

    # text collected for the upcoming subject/object (can be multiple words)
    prefix = ""
    modifier = ""

    for tok in nlp(the_file):
        if tok.dep_ in punctuation_parsers: # punctuation contributes nothing
            continue

        previous_was_compound = prv_tok_dep in compound_parsers

        if tok.dep_ in compound_parsers:
            # chain with the previous token when that one was a compound as well
            prefix = prv_tok_text + " " + tok.text if previous_was_compound else tok.text

        # a modifier gives information about another word in the same sentence (e.g., blue house)
        if tok.dep_ in modifier_parsers:
            modifier = prv_tok_text + " " + tok.text if previous_was_compound else tok.text

        # first entity - subject
        if tok.dep_ in subject_parsers:
            ent1 = modifier + " " + prefix + " " + tok.text
            prefix = ""
            modifier = ""

        # second entity - object
        if tok.dep_ in object_parsers:
            ent2 = modifier + " " + prefix + " " + tok.text
            prefix = ""
            modifier = ""

        # The previous-token state is refreshed for every non-punctuation token.
        # (The former "reset" of these two variables inside the subject/object
        # branches was immediately overwritten here and had no effect.)
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    # lower-case and collapse every whitespace run to a single space
    return [" ".join(ent1.lower().split()), " ".join(ent2.lower().split())]

def get_entities_for_german(the_file):
    """
    Extract one (subject, object) pair from German text using the dependency
    labels produced by the global ``nlp`` pipeline.

    Dependency label groups used:
    - punctuation = ["punct"]
    - compound = ["adc", "avc", "nmc", "pnc", "uc", "svp", "re", "pm", "par", "dep", "cvc", "app", "ag", "ac"]
    - modifier = ["ams", "mnr", "mo"]
    - subject = ["sb", "sbp"]
    - object = ["oa", "oc", "og", "op"]

    Every subject/object token overwrites the previously found one, so the
    returned pair holds the last subject and the last object of the text.

    The entity is assembled as "<modifier> <compound> <token>"; when the
    modifier or compound slot is empty this leaves doubled spaces. Like the
    English extractor, the result is therefore whitespace-normalised before
    it is returned.

    :param the_file: text to analyse (the notebook passes one sentence at a time)
    :return: ``[subject, object]`` - lower-cased, single-spaced, "" when nothing was found
    """
    # dependency label groups
    punctuation_parsers = {"punct"}
    compound_parsers = {"adc", "avc", "nmc", "pnc", "uc", "svp", "re", "pm", "par", "dep", "cvc", "app", "ag", "ac"}
    modifier_parsers = {"ams", "mnr", "mo"}
    subject_parsers = {"sb", "sbp"}
    object_parsers = {"oa", "oc", "og", "op"}

    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
    prv_tok_text = ""   # text of the previous non-punctuation token

    # text collected for the upcoming subject/object (can be multiple words)
    prefix = ""
    modifier = ""

    for tok in nlp(the_file):
        if tok.dep_ in punctuation_parsers: # punctuation contributes nothing
            continue

        previous_was_compound = prv_tok_dep in compound_parsers

        if tok.dep_ in compound_parsers:
            # chain with the previous token when that one was a compound as well
            prefix = prv_tok_text + " " + tok.text if previous_was_compound else tok.text

        # a modifier gives information about another word in the same sentence (e.g., blue house)
        if tok.dep_ in modifier_parsers:
            modifier = prv_tok_text + " " + tok.text if previous_was_compound else tok.text

        # first entity - subject
        if tok.dep_ in subject_parsers:
            ent1 = modifier + " " + prefix + " " + tok.text
            prefix = ""
            modifier = ""

        # second entity - object
        if tok.dep_ in object_parsers:
            ent2 = modifier + " " + prefix + " " + tok.text
            prefix = ""
            modifier = ""

        # The previous-token state is refreshed for every non-punctuation token.
        # (The former "reset" of these two variables inside the subject/object
        # branches was immediately overwritten here and had no effect.)
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    # lower-case and collapse every whitespace run to a single space
    return [" ".join(ent1.lower().split()), " ".join(ent2.lower().split())]

def get_relation(the_sentence):
    """
    Return the predicate of a sentence, taken to be the ROOT token of the
    dependency parse plus what directly follows it.

    A rule-based ``Matcher`` looks for the ROOT token optionally followed by a
    'prep' token, an 'agent' token and an adjective; the text of the last
    match reported by the matcher is returned.

    :param the_sentence: sentence text, parsed with the global ``nlp`` pipeline
    :return: matched text, or None when the matcher finds nothing
    """
    parsed = nlp(the_sentence)

    # A pattern is a list of token descriptions (dicts); Matcher.add expects a list of patterns,
    # ex: [[{"LOWER": "hello"}, {"LOWER": "world"}], [{"ORTH": "Google"}, {"ORTH": "Maps"}]]
    root_pattern = [
        {'DEP':'ROOT'},             # the root of the sentence
        {'DEP':'prep','OP':"?"},    # optional preposition
        {'DEP':'agent','OP':"?"},   # optional agent word
        {'POS':'ADJ','OP':"?"},     # optional adjective
    ]

    rule_matcher = Matcher(nlp.vocab)
    rule_matcher.add("matching_1", [root_pattern])

    found = rule_matcher(parsed)
    if not found: # no match was found so return null
        return None

    # each match is (match_id, start, end); use the last one
    last_match = found[-1]
    return parsed[last_match[1]:last_match[2]].text

In [3]:
# COPY PASTE FROM VERSION 2!

def create_kg_csv(subjects, predicates, objects, re_type, language, given_id_list, model_used = ""):
    """
    Write the triples to
    ``<cwd>/triples_data/<re_type>[/<model_used>]/kg_of_<re_type>_<language>.csv``.

    [source(subject) --relation(predicate)--> target(object)]

    Every cell is lower-cased and transliterated with ``unidecode``; empty
    strings and None become "-". The output folder is created when missing.

    :param subjects: source
    :param predicates: relation
    :param objects: target
    :param re_type: currently, we have only relation extraction type of "simple" and "predefined-dictionary"
    :param model_used: currently only "stanford_OpenIE"
    :param language: en, fr, de
    :param given_id_list: represents the id of triple, in this case it's from which directive/legislation it came from
    :return: returns nothing only creates the csv file
    """
    # field names
    fields = [language + '_Subject', language + '_Predicate', language + '_Object', "Triple_ID"]

    # os.path.join instead of hard-coded "\\" so the path is valid on every platform
    target_dir = os.path.join(os.getcwd(), "triples_data", re_type)
    if model_used:
        target_dir = os.path.join(target_dir, model_used)
    os.makedirs(target_dir, exist_ok=True) # open() below would fail on a missing folder
    filename = os.path.join(target_dir, "kg_of_" + re_type + "_" + language + ".csv")

    rows = []
    for i in range(len(subjects)):
        raw_row = [subjects[i], predicates[i], objects[i], given_id_list[i]]
        # all four cells (three triple parts + id): "" / None -> "-", then normalise
        rows.append([unidecode(("-" if cell == "" or cell is None else cell).lower()) for cell in raw_row])

    # writing to csv file
    with open(filename, 'w', newline = '') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(fields) # first writing fields
        csv_writer.writerows(rows) # now the remaining record

# German: de, French: fr, but can work for any other languages too
def simple_translator(target_language, re_type, extra_info = "", model_used = ""): # assumes that en kg csv is already created
    """
    Translate the English triple CSV row by row into ``target_language`` and
    write the result next to it
    (``kg_of_<re_type>_<target_language>[_<extra_info>].csv``).

    The translation goes through the googletrans web API and can time out.
    The function is therefore resumable: rows already present in the target
    file are kept and translation continues with the first missing row.
    (The former version opened the target with mode "w+", which truncated it,
    so every run silently restarted from row 0; its skip counter was also off
    by one when resuming.) Delete the target file to force a fresh run.

    :param target_language: language code to translate to (German: de, French: fr, others work too)
    :param re_type: relation extraction type, selects the sub-folder of ``triples_data``
    :param extra_info: optional suffix appended to the target file name
    :param model_used: optional sub-folder below ``re_type``
    :return: nothing, only writes the csv file
    """
    translator = Translator()

    if extra_info: extra_info = "_" + extra_info

    folder = os.path.join(os.getcwd(), "triples_data", re_type)
    if model_used:
        folder = os.path.join(folder, model_used)
    filename_english = os.path.join(folder, "kg_of_" + re_type + "_en.csv")
    filename_assigned_language = os.path.join(folder, "kg_of_" + re_type + "_" + target_language + extra_info + ".csv")

    # number of rows (header included) a previous, interrupted run already wrote
    current_size_of_new_kg = 0
    if os.path.exists(filename_assigned_language):
        with open(filename_assigned_language, mode = "r", newline = '') as existing_file:
            current_size_of_new_kg = sum(1 for _ in csv.reader(existing_file))
    print("Starting from row: " + str(current_size_of_new_kg))

    # "a" keeps the rows that are already there
    with open(filename_english, mode = "r", newline = '') as file_original, open(filename_assigned_language, mode = "a", newline = '') as file_2:
        csv_reader = csv.reader(file_original)
        csv_writer = csv.writer(file_2)

        # first line is headers
        if current_size_of_new_kg == 0:
            csv_writer.writerow([target_language + "_Subject", target_language + "_Predicate", target_language + "_Object", "Triple_ID"])

        # English line k corresponds to target line k; always skip at least the English header
        lines_to_skip = max(current_size_of_new_kg, 1)

        for line_number, lines in enumerate(csv_reader): # each line: source, relation, target, triple id
            if line_number < lines_to_skip:
                continue

            source_en, relation_en, target_en = lines[0], lines[1], lines[2]
            # src(source) = english, dest(destination) = language to translate to
            translated_source = translator.translate(source_en, src = "en", dest = target_language)
            translated_relation = translator.translate(relation_en, src = "en", dest = target_language)
            translated_target = translator.translate(target_en, src = "en", dest = target_language)

            temp_row = [unidecode(translated_source.text), unidecode(translated_relation.text), unidecode(translated_target.text), unidecode(lines[3])]
            csv_writer.writerow(temp_row)
            file_2.flush() # keep finished rows on disk in case a later API call fails

def get_unique_words_from_triples(given_list_of_triples):
    """
    Collect the distinct subjects, relations and objects of a list of triples.

    Order of first appearance is preserved. De-duplication uses dictionaries
    (insertion-ordered) instead of repeated list scans, so the cost grows
    linearly with the number of triples; the elements must be hashable
    (they are strings in this notebook).

    :param given_list_of_triples: iterable of indexable triples (subject, relation, object, ...)
    :return: tuple ``(unique_subjects, unique_relations, unique_objects)`` of lists
    """
    unique_subjects = {}
    unique_relations = {}
    unique_objects = {}

    for triple in given_list_of_triples:
        unique_subjects.setdefault(triple[0], None)
        unique_relations.setdefault(triple[1], None)
        unique_objects.setdefault(triple[2], None)

    return list(unique_subjects), list(unique_relations), list(unique_objects)

def get_neighbouring_groups(given_text):
    """
    Enumerate every run of neighbouring words of a text.

    The text is split on whitespace; the result lists all single words first,
    then all groups of 2 adjacent words, then groups of 3, and so on, each
    joined by a single space.
    ex: "I need to" -> ["I", "need", "to", "I need", "need to", "I need to"]

    :param given_text: text to split into word groups
    :return: list of word groups (empty list for text without words)
    """
    words = given_text.split()
    word_count = len(words)

    return [
        " ".join(words[start:start + size])
        for size in range(1, word_count + 1)          # group length
        for start in range(word_count - size + 1)     # position of the first word
    ]

def create_combined_eurovoc(given_lans):
    """
    Merge the per-language Eurovoc files ``eurovoc_<lan>.tsv`` (read from the
    current working directory) into ``<cwd>/combined_eurovoc.csv``.

    Each file is sorted by its "ID" column; the first column of the first
    language provides the IDs and the second column of every language the
    concept labels (lower-cased and transliterated with ``unidecode``).
    Works for any number of languages (the former version built rows from
    exactly three concept columns).

    :param given_lans: non-empty list of language codes, e.g. ["en", "fr", "de"]
    :return: nothing, only writes the csv file
    :raises ValueError: if the language files do not contain the same number of rows
    """
    fields = ['ID']

    # adding id
    data = pd.read_csv("eurovoc_" + given_lans[0] + ".tsv", sep='\t')
    data = data.sort_values(by=["ID"])
    complete_data = [list(data.iloc[:, 0])]

    # adding the concepts
    for lan in given_lans:
        fields.append(lan.upper())
        data = pd.read_csv("eurovoc_" + lan + ".tsv", sep='\t') # TODO what about updated version?
        data = data.sort_values(by=["ID"])
        complete_data.append([unidecode(x.lower()) for x in data.iloc[:, 1]])

    # rows are built positionally, so columns of different length would misalign concepts
    column_lengths = {len(column) for column in complete_data}
    if len(column_lengths) > 1:
        raise ValueError("eurovoc files have different numbers of rows: " + str(sorted(column_lengths)))

    # one row per concept: [id, label in language 1, label in language 2, ...]
    rows = [list(row) for row in zip(*complete_data)]

    # writing to csv file
    filename = os.path.join(os.getcwd(), "combined_eurovoc.csv")
    with open(filename, 'w', newline = '') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(fields) # first writing fields
        csv_writer.writerows(rows) # now the remaining record

def update_triple_csv_file_with_concepts(re_type, language, given_concept_list, model_used = "", extra_info = ""):
    """
    Add a ``<column>_concept`` column for every triple column of a KG csv file.

    Reads ``kg_of_<re_type>_<language>[_<extra_info>].csv`` from
    ``<cwd>/triples_data/<re_type>[/<model_used>]``, looks up the Eurovoc
    concepts of every cell with ``find_corresponding_eurovoc_concept_jaro_winkler``
    and writes the result to ``kg_of_<re_type>_with_concepts_<language>[_<extra_info>].csv``
    in the same folder.

    Files are addressed by their full path, so the working directory is never
    changed. The csv is read with ``keep_default_na=False``: otherwise pandas
    turns cells such as "null" or "nan" into float NaN, which crashed the
    concept lookup and blanked the cell in the output.

    :param re_type: relation extraction type, selects the sub-folder of ``triples_data``
    :param language: en, fr or de (selects the nltk stop word list; none for other codes)
    :param given_concept_list: list of Eurovoc concept labels to match against
    :param model_used: optional sub-folder below ``re_type``
    :param extra_info: optional suffix of the input/output file names
    :return: nothing, only writes the csv file
    """
    path = os.path.join(os.getcwd(), "triples_data", re_type)
    if model_used:
        path = os.path.join(path, model_used)

    if extra_info: extra_info = "_" + extra_info
    data = pd.read_csv(os.path.join(path, "kg_of_" + re_type + "_" + language + extra_info + ".csv"), keep_default_na=False)

    col_list = list(data.columns)
    col_list.remove("Triple_ID")

    stopword_corpus = {"en": 'english', "fr": 'french', "de": 'german'}
    stopwords_list = stopwords.words(stopword_corpus[language]) if language in stopword_corpus else []

    for col in col_list:
        print("Currently at column: " + col)
        temp_col = []

        for current_index, cell in enumerate(data[col]):
            if current_index % 1000 == 0: print("Row" + str(current_index), end = ", ")

            # "-" is the placeholder for "no value"; missing/empty cells are treated the same way
            if pd.isna(cell) or str(cell) in ("-", ""):
                temp_col.append("[]")
            else:
                temp_col.append(find_corresponding_eurovoc_concept_jaro_winkler(str(cell), given_concept_list, stopwords_list))

        data[col + "_concept"] = temp_col

    # writing to csv file
    file_name = os.path.join(path, "kg_of_" + re_type + "_with_concepts_" + language + extra_info + ".csv")
    data.to_csv(file_name, index=False)

def update_df_according_to_neo4j(re_type, language, model_used = "", extra_info = ""):
    """
    Rewrite the concept columns of a "with_concepts" csv file in place so that
    they no longer contain commas: ``['a', 'b']`` becomes ``a:b`` and an empty
    list / empty cell becomes "-".

    The file ``kg_of_<re_type>_with_concepts_<language>[_<extra_info>].csv`` is
    looked up in ``<cwd>/triples_data/<re_type>[/<model_used>]`` by its full
    path (the working directory is not changed).

    Fixes over the former version: the test ``str(cell) == ("[]" or None)``
    only ever compared against "[]", and a missing cell (float NaN) crashed on
    ``.replace``. The file is read with ``keep_default_na=False`` so that
    literal texts like "null" in the other columns survive the round trip.

    :param re_type: relation extraction type, selects the sub-folder of ``triples_data``
    :param language: language prefix of the concept columns (en, fr, de)
    :param model_used: optional sub-folder below ``re_type``
    :param extra_info: optional suffix of the file name
    :return: nothing, only rewrites the csv file
    """
    path = os.path.join(os.getcwd(), "triples_data", re_type)
    if model_used:
        path = os.path.join(path, model_used)

    if extra_info: extra_info = "_" + extra_info
    file_name = os.path.join(path, "kg_of_" + re_type + "_with_concepts_" + language + extra_info + ".csv")
    data = pd.read_csv(file_name, keep_default_na=False)

    col_list = [language + "_Subject_concept", language + "_Predicate_concept", language + "_Object_concept"]

    for col in col_list:
        print("Currently at column: " + col)
        temp_col = []

        for cell in data[col]:
            if pd.isna(cell) or str(cell).strip() in ("[]", ""):
                temp_col.append("-")
            else:
                # list separator -> ":", then drop brackets and quote characters
                temp_col.append(str(cell).replace(", ", ":").replace("[", "").replace("]", "").replace("\"", "").replace("'", ""))

        data[col] = temp_col

    # writing to csv file
    data.to_csv(file_name, index=False)

def find_corresponding_eurovoc_concept(given_text, given_concept_list, given_stopwords_list):
    """
    Find the Eurovoc concepts a text might be linked to, using word-wise
    substring matching.

    Every group of neighbouring words of ``given_text`` is compared with every
    concept that has the same number of words. A concept matches when, position
    by position, the (lower-cased) word is not a stop word and is contained in
    the transliterated concept word.

    :param given_text: a string - a single word or a compound (several words)
    :param given_concept_list: the original concept list downloaded from Eurovoc (MAKE SURE THIS IS A LIST!)
    :param given_stopwords_list: words that must never match (in our case: stop words)
    :return: list of transliterated concepts, one entry per match (may contain repeats)
    """
    matched_concepts = []

    # brute-force over all neighbouring word groups so that compounds are covered too
    # ex: "I need to" -> ["I", "need", "to", "I need", "need to", "I need to"]
    for candidate in get_neighbouring_groups(given_text):
        candidate_words = candidate.lower().split()

        # TODO change the type of string measure you do
        for raw_concept in given_concept_list:
            ascii_concept = unidecode(raw_concept)
            concept_words = ascii_concept.split()

            # only concepts with the same number of words are considered
            if len(candidate_words) != len(concept_words):
                continue

            # word-by-word check (leaves room for per-word operations such as lemmatization)
            every_word_fits = all(
                current_word not in given_stopwords_list
                and current_word in unidecode(current_concept) # triples of other languages are transliterated too
                for current_word, current_concept in zip(candidate_words, concept_words)
            )

            if every_word_fits:
                matched_concepts.append(unidecode(ascii_concept))

    return matched_concepts

def fix_duplicate(re_type, language, model_used = ""):
    """
    Remove duplicated rows from ``kg_of_<re_type>_<language>.csv`` in place
    (the first occurrence of each row is kept).

    The file is looked up in ``<cwd>/triples_data/<re_type>[/<model_used>]`` by
    its full path, so the working directory is never changed. It is read with
    ``keep_default_na=False``; with pandas' default NA parsing, cells such as
    "null" or "nan" would be read as missing and written back as empty cells.

    :param re_type: relation extraction type, selects the sub-folder of ``triples_data``
    :param language: language code that is part of the file name
    :param model_used: optional sub-folder below ``re_type``
    :return: nothing, only rewrites the csv file
    """
    path = os.path.join(os.getcwd(), "triples_data", re_type)
    if model_used:
        path = os.path.join(path, model_used)

    target_path = os.path.join(path, "kg_of_" + re_type + "_" + language + ".csv")

    data = pd.read_csv(target_path, keep_default_na=False)
    data.drop_duplicates().to_csv(target_path, index = False)

def find_corresponding_eurovoc_concept_jaro_winkler(given_text, given_concept_list, given_stopwords_list, similarity_threshold = 0.95):
    """
    Find the Eurovoc concepts a text might be linked to, using the
    Jaro-Winkler string similarity.

    Every group of neighbouring words of ``given_text`` (lower-cased, stop
    words skipped) is compared with every concept that has the same number of
    words; concepts whose similarity exceeds ``similarity_threshold`` are collected.

    The comparison is case-insensitive: the word groups are lower-cased, so the
    concepts are lower-cased for the comparison as well (previously a
    capitalised concept label was compared against lower-cased text, which
    lowers the score). The returned labels keep their original case.

    :param given_text: a string - a single word or a compound (several words)
    :param given_concept_list: the original concept list downloaded from Eurovoc (MAKE SURE THIS IS A LIST!)
    :param given_stopwords_list: words that are never looked up (in our case: stop words)
    :param similarity_threshold: minimum (exclusive) Jaro-Winkler score for a match
    :return: list of transliterated concepts, one entry per match (may contain repeats)
    """
    # transliteration, case folding and word count do not depend on the text: do them once
    prepared_concepts = []
    for concept in given_concept_list:
        ascii_concept = unidecode(concept)
        prepared_concepts.append((ascii_concept, ascii_concept.lower(), len(ascii_concept.split())))

    elems_concept = []

    # brute-force over all neighbouring word groups so that compounds are covered too
    # ex: "I need to" -> ["I", "need", "to", "I need", "need to", "I need to"]
    for word in get_neighbouring_groups(given_text):
        word = word.lower()
        if word in given_stopwords_list:
            continue

        words_in_group = len(word.split())
        for ascii_concept, folded_concept, words_in_concept in prepared_concepts:
            # only concepts with the same number of words are considered
            if words_in_concept == words_in_group and jaro.jaro_winkler_metric(word, folded_concept) > similarity_threshold:
                elems_concept.append(ascii_concept)

    return elems_concept

# PIPELINE

In [12]:
start_time = time.time() # reference point for the run-time printed later in the notebook

# Project root. Set the environment variable KG_PROJECT_PATH to run the notebook
# on another machine without editing this cell; otherwise the original location is used.
my_path = os.environ.get("KG_PROJECT_PATH", "C:\\Users\\dnaen\\PycharmProjects\\bachelor_thesis_23")

#### 1. Loading Pipeline

In [13]:
# for downloading the pipeline
# !python -m spacy download fr_dep_news_trf
# !python -m spacy download de_dep_news_trf
# !python -m spacy download en_core_web_trf
# !pip install spacy-transformers # needed to install this to run trf

In [20]:
lan = "de" # options: en, fr, de
nlp = None

# language code -> spaCy transformer pipeline
spacy_pipeline_names = {
    "en": "en_core_web_trf", # English transformer pipeline (roberta-base)
    "fr": "fr_dep_news_trf",
    "de": "de_dep_news_trf",
}

# fail right here instead of continuing with nlp = None and crashing in a later cell
if lan not in spacy_pipeline_names:
    raise ValueError("Unsupported language '" + lan + "', options: " + ", ".join(spacy_pipeline_names))

nlp = spacy.load(spacy_pipeline_names[lan])


#### 2. Importing Dataset

In [21]:
### Importing Dataset ###

# automatically extracting path
current_path = os.getcwd()
data_path = os.path.join(my_path, "data", lan, "directives_txt") # portable instead of hard-coded "\\"

doc_list = find_folder_with_type(data_path, '.txt') # names of the .txt files in the folder

# run every file through the pipeline; each result is like a "list" of tokens (each tok is either a word, number, ...)
file_list = folder_to_nlp_doc(data_path, doc_list, nlp)



#### 3. Entities (Nodes) and Relations (Edges) Extraction

In [22]:
list_of_entity_pairs = [] # one [subject, object] pair per kept sentence, over all documents
list_of_relations = []    # one predicate (or None) per kept sentence
triple_id = []            # name of the document each sentence came from
count = 1

# language code -> sentence-level entity extractor
entity_extractors = {
    "en": get_entities_for_english,
    "fr": get_entities_for_french,
    "de": get_entities_for_german,
}

for file, doc_name in zip(file_list, doc_list):
    print("Document number " + str(count))
    count += 1

    # sentences of the document; only those longer than 5 are kept
    sents = list(file.sents)
    sents_pruned = [sentence for sentence in sents if len(sentence) > 5]

    print("Entities (Nodes - subject/object) Extraction")
    extract_entities = entity_extractors.get(lan)
    if extract_entities is None:
        entity_pairs = []
        print("Current chosen language does not exist!")
    else:
        # extracting the entity pair of each sentence
        entity_pairs = [extract_entities(str(sent)) for sent in sents_pruned]

    print("Relations (Edges - predicate) Extraction")
    # the predicate is assumed to be the main verb of the sentence
    relations = [get_relation(str(sent)) for sent in sents_pruned]

    # pd.Series(relations).value_counts()[:10] # to visualize
    triple_id.extend([doc_name] * len(sents_pruned))
    list_of_entity_pairs.extend(entity_pairs)
    list_of_relations.extend(relations)

Document number 1
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 2
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 3
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 4
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 5
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 6
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction


#### 4. Build Knowledge Graph

In [23]:
# every entity pair is [subject, object]: split the pairs into two parallel lists
source = [entity_pair[0] for entity_pair in list_of_entity_pairs] # subjects
target = [entity_pair[1] for entity_pair in list_of_entity_pairs] # objects

In [24]:
# NOTE: a disabled pre-processing step used to strip soft hyphens (and the double spaces
# they leave behind) from source / relation / target. If it is needed again it has to run
# before the transliteration below, because the soft hyphen is not detectable afterwards.

# transliterate everything so that it is usable in english text
source_uni = [unidecode(subject_text) for subject_text in source]
relation_uni = [
    "-" if list_of_relations[row] is None else unidecode(list_of_relations[row]) # replacing None with "-"
    for row in range(len(source))
]
target_uni = [unidecode(target[row]) for row in range(len(source))]

# making sure we get same number of rows in the end
if len(source_uni) == len(relation_uni) == len(target_uni):
    print("Everything's fine")

Everything's fine


In [25]:
# The "main" language (the one the triples were extracted in) is handled first.
# All following cells work relative to <project>\src\main.
data_path = my_path + "\\src\\main"
os.chdir(data_path)

# Eurovoc concept table of the current language
path_to_eurovoc = "eurovoc_%s.tsv" % lan
data_of_lan = pd.read_csv(path_to_eurovoc, sep='\t')

# remember to pass only the unidecode version of the strings to here
create_kg_csv(
    subjects=source_uni,
    predicates=relation_uni,
    objects=target_uni,
    re_type="simple",
    language=lan,
    given_id_list=triple_id,
)

# seconds elapsed since start_time was taken
elapsed_seconds = time.time() - start_time
print(elapsed_seconds)

1561.0669739246368


In [34]:
# drop repeated rows from the csv of the current language
fix_duplicate(re_type="simple", language=lan)

In [35]:
# look up, for every triple, the Eurovoc concepts it is related to
update_triple_csv_file_with_concepts(
    re_type="simple",
    language=lan,
    given_concept_list=list(data_of_lan[lan.upper()]),
)

Currently at column: de_Subject
Currently at column: de_Predicate
Currently at column: de_Object


In [26]:
# store the concept lists as a:b:c:d instead of [a,b,c,d] because "," causes problems in neo4j
update_df_according_to_neo4j(re_type="simple", language=lan)

Currently at column: fr_Subject_concept
Currently at column: fr_Predicate_concept
Currently at column: fr_Object_concept


#### 5. Translate

In [137]:
# CAUTION: the translation goes through a web API and can "timeout"; if that happens just run this cell again
simple_translator(target_language="de", re_type="simple", extra_info="translated")

Starting from row: 0


In [138]:
# same translation step for French
simple_translator(target_language="fr", re_type="simple", extra_info="translated")

Starting from row: 0


#### 6. KG finalization for other languages
(this part has to be run after the "translation" step has finished)

In [15]:
# French
lan = "fr"

# Eurovoc concept table of this language
path_to_eurovoc = "eurovoc_%s.tsv" % lan
data_of_lan = pd.read_csv(path_to_eurovoc, sep='\t')

# NOTE(review): the translation cells write their files with extra_info="translated",
# while these calls read the file without that suffix - confirm which file is meant here
update_triple_csv_file_with_concepts(
    re_type="simple",
    language=lan,
    given_concept_list=list(data_of_lan[lan.upper()]),
)
update_df_according_to_neo4j(re_type="simple", language=lan)

Currently at column: de_Subject
Row0, Row1000, Currently at column: de_Predicate
Row0, Row1000, Currently at column: de_Object
Row0, Row1000, Currently at column: de_Subject_concept
Currently at column: de_Predicate_concept
Currently at column: de_Object_concept


In [4]:
# German, triples of the "predefined_dictionary" extraction done with stanford_OpenIE
lan = "de"

# Eurovoc concept table of this language
path_to_eurovoc = "eurovoc_%s.tsv" % lan
data_of_lan = pd.read_csv(path_to_eurovoc, sep='\t')

update_triple_csv_file_with_concepts(
    re_type="predefined_dictionary",
    language=lan,
    given_concept_list=list(data_of_lan[lan.upper()]),
    model_used="stanford_OpenIE",
)
update_df_according_to_neo4j(
    re_type="predefined_dictionary",
    language=lan,
    model_used="stanford_OpenIE",
)

Currently at column: de_Subject
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, Currently at column: de_Predicate
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, Currently at column: de_Object
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, Currently at column: de_Subject_concept
Currently at column: de_Predicate_concept
Currently at column: de_Object_concept


# STORAGE

In [6]:
# create a directed-graph from a dataframe
# directed_graph = nx.from_pandas_edgelist(kg_df, "source", "target", edge_attr=True, create_using=nx.MultiDiGraph())
#
# from pyvis.network import Network
#
# net = Network(notebook=True, cdn_resources='remote')
#
# net.from_nx(directed_graph)
# net.show("example.html")

In [None]:
# count = 1
#
# for file in file_list:
#     print("Document number " + str(count))
#     count += 1
#
#     sents = list(file.sents) # extract sentences of the document (only checks for ".")
#
#     for i in range(30):
#         print("Sentence number: " + str(i))
#         print("The sentence: " + str(sents[i]))
#         print("Entities: " + str(get_entities(str(sents[i]))))
#         print("Relation: " + str(get_relation(str(sents[i]))))
#         print()

In [None]:
# from PyPDF2 import PdfReader
#
# # creating a pdf reader object
# reader = PdfReader("C:\\Users\\dnaen\\PycharmProjects\\bachelor_thesis_23\\data\\fr\\directives_pdf\\Directive_(EU)_2016_800_fr.pdf")
#
# all_text = ""
# for i in range(len(reader.pages)):
#     text = reader.pages[i].extract_text()
#     text = text.replace("\n", "")
#     all_text = all_text + text
#
# print(all_text)

In [None]:
# count = 1
# for sentence in list(doc.sents):
#     print("Sentence: " + str(count))
#     print(sentence)
#     print()
#     count += 1

In [None]:
# https://spacy.io/models/fr#fr_dep_news_trf
# french_list = ["ROOT", "acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "aux:pass", "aux:tense", "case", "cc", "ccomp", "conj", "cop", "dep", "det", "expl:comp", "expl:pass", "expl:subj", "fixed", "flat:foreign", "flat:name", "iobj", "mark", "nmod", "nsubj", "nsubj:pass", "nummod", "obj", "obl:agent", "obl:arg", "obl:mod", "parataxis", "punct", "vocative", "xcomp"]
#
# # https://spacy.io/models/en#en_core_web_trf
# english_list = ["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj", "predet", "prep", "prt", "punct", "quantmod", "relcl", "xcomp"]
#
# # https://spacy.io/models/de#de_dep_news_trf
# german_list = ["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd", "cj", "cm", "cp", "cvc", "da", "dep", "dm", "ep", "ju", "mnr", "mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par", "pd", "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs", "sb", "sbp", "svp", "uc", "vo"]
#
# for tk in german_list:
#     print(tk)
#     print(spacy.explain(tk))
#     print()

In [None]:
# def get_entities_for_english(the_file):
#     """
#     Here we extract the elements in an unsupervised manner, i.e., we will use the grammar of the sentences. The extraction of a single word entity from a sentence is not a tough task. We can easily do this with the help of parts of speech (POS) tags. However, when an entity spans across multiple words, then POS tags alone are not sufficient. To fix this we basically save our previous text's info.
#
#     requirements:
#     - punctuation: "punct"
#     - compound: "compound"
#     - modifier: ["acl", "advcl", "advmod", "amod", "appos", "meta", "neg", "nmod", "npadvmod", "nummod", "poss", "prep", "quantmod", "relcl"]
#     -
#     """
#
#     # init
#     ent1 = ""
#     ent2 = ""
#
#     prv_tok_dep = ""    # dependency tag (the relationship between any two words is marked by a dependency tag) of previous token in the sentence
#     prv_tok_text = ""   # previous token in the sentence
#
#     # for holding the text that is associated with the current subject/object (can be multiple words)
#     prefix = ""
#     modifier = ""
#
#     # going through each token
#     for tok in nlp(the_file):
#         if "punct" != tok.dep_.lower(): # if punctuation mark skip
#
#             if "compound" == tok.dep_.lower():
#                 prefix = tok.text
#                 if "compound" == prv_tok_dep.lower(): # if the previous word was also a 'compound' then add to current text
#                     prefix = prv_tok_text + " "+ tok.text
#
#             # check if modifier (a modifier gives information about another word in the same sentence e.g., blue house)
#             if tok.dep_.lower().endswith("mod"):
#                 modifier = tok.text
#                 if "compound" == prv_tok_dep.lower(): # if previous word was also a 'compound' then add to current text
#                     modifier = prv_tok_text + " "+ tok.text
#
#             # extract first entity - subject
#             if "subj" in tok.dep_.lower():
#                 ent1 = modifier +" "+ prefix + " "+ tok.text
#                 # reset info
#                 prefix = ""
#                 modifier = ""
#                 prv_tok_dep = ""
#                 prv_tok_text = ""
#
#             # extract second entity - object
#             if "obj" in tok.dep_.lower():
#                 ent2 = modifier +" "+ prefix +" "+ tok.text
#                 # reset info
#                 prefix = ""
#                 modifier = ""
#                 prv_tok_dep = ""
#                 prv_tok_text = ""
#
#             # update variables
#             prv_tok_dep = tok.dep_
#             prv_tok_text = tok.text
#
#     return [ent1.strip().lower(), ent2.strip().lower()]

In [None]:
# for file, doc_name in zip(file_list, doc_list):
#     print("Document number " + str(count))
#     count += 1
#
#     sents = list(file.sents) # extract sentences of the document (only checks for ".")
#     sents_pruned = [sentence for sentence in sents if len(sentence) > 5]
#
#     print("Entities (Nodes - subject/object) Extraction")
#     entity_pairs = [] # here "tqdm" is just used for creating a progress bar
#     # Extracting the entity pairs of each sentence.
#     if lan == "en": entity_pairs = [get_entities_for_english(str(i)) for i in tqdm(sents_pruned, position=0, leave=True)]
#     elif lan == "fr": entity_pairs = [get_entities_for_french(str(i)) for i in tqdm(sents_pruned, position=0, leave=True)]
#     elif lan == "de": entity_pairs = [get_entities_for_german(str(i)) for i in tqdm(sents_pruned, position=0, leave=True)]
#     else: print("Current chosen language does not exist!")
#
#     print("Relations (Edges - predicate) Extraction")
#     relations = []
#     relations = [get_relation(str(i)) for i in tqdm(sents_pruned, position=0, leave=True)] # Here we assume predicate is the main verb in a sentence.
#
#     triple_id.extend([doc_name for i in range(len(sents_pruned))])
#
#     # pd.Series(relations).value_counts()[:10] # to visualize
#     list_of_entity_pairs.append(entity_pairs)
#     list_of_relations.append(relations)

In [None]:
# old
# German: de, French: fr, but can work for any other languages too
# def simple_translator(target_language, re_type, model_used, extra_info = ""): # assumes that en kg csv is already created
#     translator = Translator()
#
#     filename = os.getcwd() + "\\triples_data\\" + re_type + "\\" + model_used
#     filename_english = filename + "\\kg_of_" + re_type + "_en.csv"
#     filename_assigned_language = filename + "\\kg_of_" + re_type + "_GoogleTrans_" + target_language + ".csv"
#
#     with open(filename_english, mode ="r") as file_original, open(filename_assigned_language, mode = "r+", newline = '') as file_2:
#         csv_reader = csv.reader(file_original)
#         csv_reader_file2 = csv.reader(file_2)
#         csv_writer = csv.writer(file_2)
#
#         current_size_of_new_kg = 0
#         for step in csv_reader_file2:
#             current_size_of_new_kg += 1
#         print("Starting from row: " + str(current_size_of_new_kg))
#
#         current_step = 0
#         for lines in csv_reader: # each line is a list of 3 elements (source - relation - target)
#             if current_step > current_size_of_new_kg:
#                 source_en, relation_en, target_en = lines[0], lines[1], lines[2]
#                 # src(source) = english, dest(destination) = language to translate to
#                 translated_source, translated_relation, translated_target = translator.translate(source_en, src = "en", dest = target_language), translator.translate(relation_en, src = "en", dest = target_language), translator.translate(target_en, src = "en", dest = target_language)
#
#                 temp_row = [translated_source.text, translated_relation.text, translated_target.text]
#                 index = 0
#                 for row in temp_row:
#                     temp_row[index] = row.replace(u'\u200b', '') # this "space" character gives error, but it just adds extra space so just removing it
#                     index += 1
#                 csv_writer.writerow(temp_row)
#             else:
#                 current_step += 1

In [None]:
# def create_kg_csv(subjects, predicates, objects, re_type, language, given_id_list):
#     """
#     [source(subject) --relation(predicate)--> target(object)]
#     :param subjects: source
#     :param predicates: relation
#     :param objects: target
#     :param re_type: currently, we have only relation extraction type of "simple" and "predefined-dictionary"
#     :param language: en, fr, de
#     :param given_id_list: represents the id of triple, in this case it's from which directive/legislation it came from
#     :return: returns nothing only creates the csv file
#     """
#     # field names
#     fields = [language + '_Subject', language + '_Predicate', language + '_Object', "Triple_ID"]
#     filename = os.getcwd() + "\\triples_data\\" + re_type + "\\kg_of_" + re_type +  "_" + language + ".csv"
#
#     rows = [[subjects[i], predicates[i], objects[i], given_id_list[i]] for i in range(len(subjects))]
#
#     # find out empty and None strings, replacing it with "-"
#     for i in range(len(rows)):
#         for j in range(len(rows[0])): # so len 4 (subject, predicate, object, triple id)
#             if rows[i][j] == "" or rows[i][j] is None: rows[i][j] = "-"
#             rows[i][j] = unidecode(rows[i][j].lower())
#
#     # writing to csv file
#     with open(filename, 'w', newline = '') as csv_file:
#         csv_writer = csv.writer(csv_file)
#         csv_writer.writerow(fields) # first writing fields
#         csv_writer.writerows(rows) # now the remaining record
#
# # German: de, French: fr, but can work for any other languages too
# def simple_translator(target_language, re_type, model_used = "", extra_info = ""): # assumes that en kg csv is already created
#     translator = Translator()
#
#     if extra_info: extra_info = "_" + extra_info
#     if model_used:
#         filename = os.getcwd() + "\\triples_data\\" + re_type + "\\" + model_used
#         filename_english = filename + "\\kg_of_" + re_type + "_en.csv"
#         filename_assigned_language = filename + "\\kg_of_" + re_type + "_" + target_language + extra_info + ".csv"
#     else:
#         filename = os.getcwd() + "\\triples_data\\" + re_type
#         filename_english = filename + "\\kg_of_" + re_type + "_en.csv"
#         filename_assigned_language = filename + "\\kg_of_" + re_type + "_" + target_language + extra_info + ".csv"
#
#     with open(filename_english, mode ="r") as file_original, open(filename_assigned_language, mode = "r+", newline = '') as file_2:
#         csv_reader = csv.reader(file_original)
#         csv_reader_file2 = csv.reader(file_2)
#         csv_writer = csv.writer(file_2)
#
#         current_size_of_new_kg = 0
#         for step in csv_reader_file2:
#             current_size_of_new_kg += 1
#         print("Starting from row: " + str(current_size_of_new_kg))
#
#         # first line is headers
#         if current_size_of_new_kg == 0: csv_writer.writerow([target_language + "_Subject", target_language + "_Predicate", target_language + "_Object", "Triple_ID"])
#         current_step = 0
#         for lines in csv_reader: # each line is a list of 4 elements (source - relation - target - triple id)
#             if current_step > current_size_of_new_kg:
#                 source_en, relation_en, target_en = lines[0], lines[1], lines[2]
#                 # src(source) = english, dest(destination) = language to translate to
#                 translated_source, translated_relation, translated_target = translator.translate(source_en, src = "en", dest = target_language), translator.translate(relation_en, src = "en", dest = target_language), translator.translate(target_en, src = "en", dest = target_language)
#
#                 temp_row = [unidecode(translated_source.text), unidecode(translated_relation.text), unidecode(translated_target.text), unidecode(lines[3])]
#                 csv_writer.writerow(temp_row)
#             else:
#                 current_step += 1
#
# def fix_duplicate(re_type, model_used, language):
#     path = os.getcwd() + "\\triples_data\\" + re_type + "\\" + model_used
#     os.chdir(path)
#
#     data = pd.read_csv("kg_of_" + re_type + "_" + language + ".csv")
#     temp_data = data[~data.duplicated()]
#     target_path = path + "\\kg_of_" + re_type + "_" + language + ".csv"
#     print(target_path)
#     temp_data.to_csv(target_path, index = False)

In [None]:
# def folder_to_nlp_doc(given_path, list_of_doc_names, given_nlp):
#     temp_list = []
#     # the input should be a list of file contained in a folder
#     for file_name in list_of_doc_names:
#         file_path = given_path + "\\" + file_name
#         reader = PdfReader(file_path)
#
#         all_text = ""
#         for i in range(len(reader.pages)):
#             text = reader.pages[i].extract_text()
#             text = text.replace("\n", "")
#             all_text = all_text + text
#
#         temp_list.append(given_nlp(all_text))
#     return temp_list