In [2]:
# imports
import xml.etree.ElementTree as Xet # for parsing and creating XML data
import pandas as pd
import os, csv, re, nltk
from flair.data import Corpus # in order to use the functions tha flair has
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings # these embeddings helps NER to perform better
from itertools import islice
from nltk.stem import WordNetLemmatizer # previously need to download "nltk.download('wordnet')" and "nltk.download('omw-1.4')". But beware if new version comes out
from tqdm import tqdm # to display loop in a bar
from openie import StanfordOpenIE # for using our OIE tool
from nltk.tokenize import sent_tokenize
from googletrans import Translator # don't forget to run "!pip install googletrans==3.1.0a0" before using this
import nltk
from nltk.corpus import stopwords
from unidecode import unidecode
import time
import jaro # requires !pip install jaro-winkler
import numpy as np
from scipy import spatial
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dnaen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# run this cell if you aim to use semantic distance
# from gensim.models import Word2Vec
# import gensim.downloader as model
# corpus_of_word2vec = model.load("word2vec-google-news-300") # have to run it once to download to pc, in later runs it just takes the downloaded file from pc (so takes much shorter after second and future times)

### NER (Named-Entity Recognition)

In [2]:
# functions
# -*- coding: utf-8 -*-

# English: en, German: de, French: fr, ... -> creates the tsv of given descriptor of any language
def create_tsv_of_language(given_language):
    """
    Convert the EuroVoc descriptor XML of one language into "eurovoc_<language>.tsv".

    Before running this function, the "desc_<language>.xml" file (downloaded from the EuroVoc website)
    needs to be placed in "data/<language>/descriptors/".
    PS: Even after this function finishes it takes some time for the new file to appear.

    :param given_language: language code used in the file names, e.g. "en", "de", "fr"
    :return: nothing; a two-column TSV (ID, <LANGUAGE>) is written to the current working directory
    """
    language_column = given_language.upper()

    # The XML location is derived from the working directory by swapping its "src\main" part
    # for the descriptor path inside the data folder.
    descriptor_path = os.getcwd().replace(
        "src\\main",
        "data\\" + given_language + "\\descriptors\\desc_" + given_language + ".xml",
    )
    descriptor_root = Xet.parse(descriptor_path).getroot()

    # one record per child element of the XML root: its id and its label
    records = [
        {"ID": descriptor.find("DESCRIPTEUR_ID").text, language_column: descriptor.find("LIBELLE").text}
        for descriptor in descriptor_root
    ]

    # sep='\t' produces a tsv instead of a csv
    frame = pd.DataFrame(records, columns=['ID', language_column])
    frame.to_csv("eurovoc_" + given_language + ".tsv", sep='\t', index=False)

def create_tsv_of_any_given_concept(given_concept_dict, given_language):
    """
    Write a {concept: id} dictionary to "updated_eurovoc_<language>.tsv" in the working directory.

    :param given_concept_dict: dictionary whose keys are the concept terms and whose values are their ids
    :param given_language: en: english, fr: français, de: deutsch
    :return: nothing; the tsv has the columns "ID" and the upper-cased language code
    """
    language_column = given_language.upper()

    # each dictionary entry becomes one row: value -> ID column, key -> term column
    records = [
        {"ID": concept_id, language_column: concept_term}
        for concept_term, concept_id in given_concept_dict.items()
    ]

    # sep='\t' produces a tsv instead of a csv
    frame = pd.DataFrame(records, columns=['ID', language_column])
    frame.to_csv("updated_eurovoc_" + given_language + ".tsv", sep='\t', index=False)

# this function assumes we get the text annotated as [entity_value](entity_name), and assigns the prefixes B- and I- (or the tag O for non-entities) to each token
def get_tokens_with_entities(raw_text: str):
    """
    Turn text annotated as "[entity value](entity_name)" into a list of (token, tag) pairs.

    Tokens of an annotated span are tagged "B-<entity_name>" (first token) or "I-<entity_name>"
    (following tokens); every other whitespace-separated token is tagged "O".

    :param raw_text: the annotated text
    :return: list of (token, tag) tuples in text order
    """
    # split on whitespace, except whitespace inside square brackets, so that a multi-word entity value
    # stays together with its "(entity_name)" suffix for now
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # matches our notation [entity_value](entity_name); flags: re.IGNORECASE and re.MULTILINE
    annotation_pattern = re.compile(r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)", flags=re.I | re.M)

    tokens_with_entities = []
    for raw_token in raw_tokens:
        # .match anchors at the start of the token, so only tokens that begin with "[" can be annotations
        match = annotation_pattern.match(raw_token)
        if match is None:
            tokens_with_entities.append((raw_token, "O"))
            continue

        entity_name = match.group("entity")
        # raw string: "\s" in a normal literal is an invalid escape sequence (SyntaxWarning on Python 3.12+)
        for position, entity_token in enumerate(re.split(r"\s", match.group("value"))):
            # B- marks the beginning of an entity, I- a continuation of the current one
            prefix = "B" if position == 0 else "I"
            tokens_with_entities.append((entity_token, f"{prefix}-{entity_name}"))

    return tokens_with_entities

# NLTK VERSION
# TODO add automatic noun-verb-... identifier to aid lemmatization
def regex_from_term_nltk(term, lemmatizer):
    """
    Build the search regex for a concept term.

    The term is tokenized with nltk, every token is lower-cased and lemmatized via token_cleaning,
    and the tokens are chained with a "between words" pattern. The result has two groups: group 1 is
    the matched term (plus up to five trailing word characters, to allow inflected endings), group 2
    is the single non-word character that must follow it.

    :param term: the concept (one word or several words)
    :param lemmatizer: object with a lemmatize(str) method, passed on to token_cleaning
    :return: the regex as a string
    """
    cleaned_tokens = [token_cleaning(token, lemmatizer) for token in nltk.word_tokenize(term)]

    # between two tokens: rest of the previous word, one non-word char, optional word chars, optional non-word chars
    between_words = r'\w*\W\w*\W*'

    # NOTE(review): tokens are inserted verbatim (not re.escape'd), so punctuation tokens such as
    # "(" or "+" act as regex syntax - confirm whether that is intended before escaping them.
    # Raw strings are used because "\w"/"\W" in a normal literal are invalid escape sequences.
    return r"\b(" + between_words.join(cleaned_tokens) + r"\w{0,5})(\W)"

def token_cleaning(token, lemmatizer):
    """
    Normalize one token: lower-case it, then lemmatize it.

    :param token: the token to clean
    :param lemmatizer: object providing a lemmatize(str) method
    :return: the lemmatized, lower-cased token
    """
    return lemmatizer.lemmatize(token.lower())

# Inspired from @https://github.com/shashankmc/eurovoc_entity_link/blob/master/EurovocTagger.py
def tsv_dic_processing(path):
    """
    Load a two-column ([ID], [term]) TSV file into lookup structures.

    The first line is treated as the header and skipped. Rows with fewer than two cells (e.g. blank
    lines) are ignored instead of silently reusing the values of the previous row; only the first
    two cells of a row are used.

    :param path: the name of the eurovoc.tsv file
    :return: Dic: Dictionary in style of {ID: Word}
    :return: RevDic: Dictionary in style of {Word: ID}
    :return: list1: list of IDs
    :return: list2: list of words (concepts)
    """
    Dic = {}
    RevDic = {}
    list1 = []
    list2 = []
    # newline='' is what the csv module expects for files handed to csv.reader
    with open(path, 'rt', encoding='utf8', newline='') as csvfile:
        myreader = csv.reader(csvfile, delimiter='\t')
        next(myreader, None)  # skip the header line (no-op on an empty file)
        for row in myreader:
            if len(row) < 2:
                continue  # malformed / blank line: nothing reliable to store
            key, value = row[0], row[1]
            list1.append(key)
            list2.append(value)
            Dic[key] = value
            RevDic[value] = key
    return Dic, RevDic, list1, list2

def find_folder_with_type(given_path, doc_type): # returns all documents found in path
    """
    List the entries of a folder whose names end with the given extension.

    The extension is compared literally. (Previously it was pasted into a regular expression with
    only its first character escaped, so an extension without a leading dot such as "txt" turned
    into a tab escape and later dots such as in ".tar.gz" matched any character.)

    :param given_path: folder to scan (not recursive)
    :param doc_type: file-name ending to keep, e.g. ".txt"
    :return: list of matching entry names, in os.listdir order
    """
    return [doc for doc in os.listdir(given_path) if doc.endswith(doc_type)]


def folder_list_to_dic(given_path, given_list): # given file names extracts their texts and saves in a dic
    """
    Read the given files of a folder into a dictionary.

    The files are opened through their joined path instead of changing the working directory, so
    the process-wide working directory is never touched (previously it stayed switched to
    given_path when a file could not be read).

    :param given_path: folder that contains the files
    :param given_list: list of file names inside that folder
    :return: dictionary in style of {file_name: file_text}
    """
    dic = {}
    for file_name in given_list:
        print('importing', file_name, '...')
        with open(os.path.join(given_path, "%s" % file_name), "r", encoding='utf8') as my_file:
            dic[file_name] = my_file.read()
    return dic

# tagging by researching concept-regexed as a substring of the text (by using NLTK)
def tagging_document(path_of_tagged, given_doc_list, given_doc_dic, given_concept_list, given_eurovoc_reverse_dic):
    """
    Tag every concept occurrence in the given documents and write "<doc_name>_TAGGED.txt" files.

    For each document the lower-cased text is searched with the regex of every concept
    (see regex_from_term_nltk). Every match is replaced by "[concept](id) " in the tagged copy.
    In addition, the first surface form found for a concept is recorded as a new vocabulary entry
    with the id of that concept, unless it already is one of the given concepts.

    Changes compared with the first version: the tagged files are written through their joined path
    (the working directory is no longer changed), the file handle is closed by a with-block, each
    regex is searched once, the tag is inserted literally (no backslash/group processing by re.sub),
    and a surface form is compared with the concept list after clean-up so that an existing concept
    can no longer be overwritten with the id of another concept.

    :param path_of_tagged: the location (dir) of the tagged folder
    :param given_doc_list: a list of names of the documents
    :param given_doc_dic: a dic that contains the contents of the document i.e. {doc_name: doc_text}
    :param given_concept_list: the original concept list downloaded from Eurovoc
    :param given_eurovoc_reverse_dic: dictionary in style of {concept: id}
    :return: new_concept_dic: copy of given_eurovoc_reverse_dic extended with the newly found forms
    """
    lemmatizer = WordNetLemmatizer()
    new_concept_dic = given_eurovoc_reverse_dic.copy() # using the reverse eurovoc dict instead because can't add words with same id
    known_concepts = set(given_concept_list) # constant-time membership checks inside the loops

    for doc_name in given_doc_list:
        print('tagging', doc_name,'...')
        text = given_doc_dic[doc_name].lower()
        tagged_text = text # document's initial text; matches are searched in `text`, replaced in `tagged_text`

        for concept in given_concept_list:
            if concept == "": # an empty concept would tag everything
                continue

            regex = regex_from_term_nltk(concept, lemmatizer)
            found = re.search(regex, text)
            if found is None:
                continue

            concept_id = given_eurovoc_reverse_dic[concept]
            match_in_text = found.group()
            if match_in_text not in known_concepts:
                # the raw match includes the trailing non-word character of the regex, so clean it up
                cleaned_match = match_in_text.replace("\n", "").strip().strip(".,-")
                if cleaned_match not in known_concepts:
                    new_concept_dic[cleaned_match] = concept_id

            # the tag is "[concept](identifier) "; a function replacement keeps it literal
            tag = "[" + concept + "](" + concept_id + ") "
            tagged_text = re.sub(regex, lambda _match, _tag=tag: _tag, tagged_text)

        # create a new file with the tagged text
        tagged_path = os.path.join(path_of_tagged, "%s_TAGGED.txt" % doc_name)
        with open(tagged_path, "w", encoding='utf8') as tagged_file:
            tagged_file.write(tagged_text)

    return new_concept_dic

# updating the concept list by going through the given documents (by using NLTK)
def update_concept_list(given_doc_list, given_doc_dic, given_concept_list, given_eurovoc_reverse_dic):
    """
    Extend the concept vocabulary with the surface forms found in the given documents.

    For each document the lower-cased text is searched with the regex of every concept
    (see regex_from_term_nltk). The first surface form found for a concept is stored with the id of
    that concept, unless it already is one of the given concepts. No files are written.

    Changes compared with the first version: each regex is searched once instead of twice, and a
    surface form is compared with the concept list after clean-up so that an existing concept can no
    longer be overwritten with the id of another concept.

    :param given_doc_list: a list of names of the documents
    :param given_doc_dic: a dic that contains the contents of the document i.e. {doc_name: doc_text}
    :param given_concept_list: the original concept list downloaded from Eurovoc
    :param given_eurovoc_reverse_dic: dictionary in style of {concept: id}
    :return: new_concept_dic: copy of given_eurovoc_reverse_dic extended with the newly found forms
    """
    lemmatizer = WordNetLemmatizer()
    new_concept_dic = given_eurovoc_reverse_dic.copy() # using the reverse eurovoc dict instead because can't add words with same id
    known_concepts = set(given_concept_list) # constant-time membership checks inside the loops

    for doc_name in given_doc_list:
        print('Going through ', doc_name,'...')
        text = given_doc_dic[doc_name].lower()

        for concept in given_concept_list:
            if concept == "": # an empty concept would match everything
                continue

            found = re.search(regex_from_term_nltk(concept, lemmatizer), text)
            if found is None:
                continue

            match_in_text = found.group()
            if match_in_text in known_concepts:
                continue

            # the raw match includes the trailing non-word character of the regex, so clean it up
            cleaned_match = match_in_text.replace("\n", "").strip().strip(".,-")
            if cleaned_match not in known_concepts:
                new_concept_dic[cleaned_match] = given_eurovoc_reverse_dic[concept]

    return new_concept_dic

def find_corresponding_eurovoc_concept_basic_overlap(given_text, given_concept_list, given_stopwords_list):
    """
    Find the concepts the given text might be linked to, using word-by-word substring overlap.

    Every group of neighbouring words of the text (see get_neighbouring_groups) is compared with
    every concept that has the same number of words. A concept is kept when, position by position,
    the (lower-cased) word of the group is not a stopword and is a substring of the corresponding
    ASCII-transliterated concept word.

    :param given_text: takes in a string (a single word or several words)
    :param given_concept_list: the original concept list downloaded from Eurovoc (MAKE SURE THIS IS A LIST!)
    :param given_stopwords_list: a list of words we don't want to look into (in our case it's stopwords)
    :return: a list of (transliterated) concepts it might be related to; a concept can appear several times
    """
    matched_concepts = []

    # brute force over all neighbouring groups, ex: "I need to" -> ["I", "need", "to", "I need", "need to", "I need to"]
    for candidate in get_neighbouring_groups(given_text):
        candidate_words = candidate.lower().split()

        for raw_concept in given_concept_list:
            ascii_concept = unidecode(raw_concept)
            concept_words = ascii_concept.split()

            # only compare groups and concepts with the same number of words
            if len(candidate_words) != len(concept_words):
                continue

            # the triples of other languages are transliterated, so the concept words are too
            every_word_fits = all(
                candidate_word not in given_stopwords_list and candidate_word in unidecode(concept_word)
                for candidate_word, concept_word in zip(candidate_words, concept_words)
            )
            if every_word_fits:
                matched_concepts.append(unidecode(ascii_concept))

    return matched_concepts

def find_corresponding_eurovoc_concept_jaro_winkler(given_text, given_concept_list, given_stopwords_list):
    """
    Find the concepts the given text might be linked to, using Jaro-Winkler string similarity.

    Every group of neighbouring words of the text (see get_neighbouring_groups) that is not itself
    a stopword is compared with every concept that has the same number of words; a concept is kept
    when the similarity is above 0.95.

    :param given_text: takes in a string (a single word or several words)
    :param given_concept_list: the original concept list downloaded from Eurovoc (MAKE SURE THIS IS A LIST!)
    :param given_stopwords_list: a list of words we don't want to look into (in our case it's stopwords)
    :return: a list of (transliterated) concepts it might be related to; a concept can appear several times
    """
    matched_concepts = []

    # brute force over all neighbouring groups, ex: "I need to" -> ["I", "need", "to", "I need", "need to", "I need to"]
    for candidate in get_neighbouring_groups(given_text):
        candidate = candidate.lower()
        if candidate in given_stopwords_list:
            continue

        candidate_word_count = len(candidate.split())
        for raw_concept in given_concept_list:
            ascii_concept = unidecode(raw_concept)

            # only compare groups and concepts with the same number of words
            if len(ascii_concept.split()) != candidate_word_count:
                continue

            if jaro.jaro_winkler_metric(candidate, ascii_concept) > 0.95:
                matched_concepts.append(unidecode(ascii_concept))

    return matched_concepts

def find_corresponding_eurovoc_concept_jaro_winkler_with_english_semantic(given_text, given_concept_list, given_stopwords_list):
    """
    Find the concepts the given text might be linked to, using semantic and string similarity.

    Every group of neighbouring words of the text (see get_neighbouring_groups) that is not itself
    a stopword is compared with every concept that has the same number of words. A concept is kept
    when the cosine similarity of the summed word vectors (see get_vector) is above 0.7, or
    otherwise when the Jaro-Winkler similarity is above 0.8. When a word is unknown to the
    embedding (KeyError) only the string similarity is used.

    :param given_text: takes in a string (a single word or several words)
    :param given_concept_list: the original concept list downloaded from Eurovoc (MAKE SURE THIS IS A LIST!)
    :param given_stopwords_list: a list of words we don't want to look into (in our case it's stopwords)
    :return: a list of (transliterated) concepts it might be related to; a concept can appear several times
    """
    matched_concepts = []

    # brute force over all neighbouring groups, ex: "I need to" -> ["I", "need", "to", "I need", "need to", "I need to"]
    for candidate in get_neighbouring_groups(given_text):
        candidate = candidate.lower()
        if candidate in given_stopwords_list:
            continue

        candidate_word_count = len(candidate.split())
        for raw_concept in given_concept_list:
            ascii_concept = unidecode(raw_concept)

            # only compare groups and concepts with the same number of words
            if len(ascii_concept.split()) != candidate_word_count:
                continue

            try:
                semantic_similarity = 1 - spatial.distance.cosine(get_vector(candidate), get_vector(ascii_concept))
            except KeyError:
                # a word is missing from the embedding, so fall back to the string measure alone
                semantic_similarity = None
            string_similarity = jaro.jaro_winkler_metric(candidate, ascii_concept)

            # the semantic measure has priority; the string measure is the second chance
            if semantic_similarity is not None and semantic_similarity > 0.7:
                matched_concepts.append(unidecode(ascii_concept))
            elif string_similarity > 0.8:
                matched_concepts.append(unidecode(ascii_concept))

    return matched_concepts

def preprocess(s):
    """
    Split a string on whitespace and lower-case every piece.

    :param s: the text to split
    :return: list of lower-cased words (empty list for an empty or whitespace-only string)
    """
    lowered_words = []
    for word in s.split():
        lowered_words.append(word.lower())
    return lowered_words

def get_vector(s, vectors=None):
    """
    Sum the embedding vectors of all words of a string.

    :param s: the text; it is split on whitespace and lower-cased (same tokenization as preprocess)
    :param vectors: optional word -> vector mapping to look the words up in; when omitted, the
        notebook-level name `model` is used as before
    :return: the element-wise sum of the word vectors (0.0 for a string without words)
    :raises KeyError: passed through from the lookup when a word is not in the mapping
    """
    if vectors is None:
        # NOTE(review): the optional setup cell binds `model` to the gensim.downloader module and the
        # loaded vectors to `corpus_of_word2vec`; `model` must be the subscriptable embedding here -
        # confirm, or pass the embedding explicitly through `vectors`.
        vectors = model
    return np.sum(np.array([vectors[word.lower()] for word in s.split()]), axis=0)

### Relation Extraction

In [18]:
# functions
def get_triples_stanford_openie(given_client, given_sentence):
    """
    Extract (subject, relation, object) triples from a sentence.

    :param given_client: object whose annotate(text) yields dictionaries in the style of
        {'subject': 'Obama', 'relation': 'was born in', 'object': 'Hawaii'}
    :param given_sentence: the text handed to annotate
    :return: list of [subject, relation, object] lists; a missing key yields None
    """
    # wrap the annotate call in tqdm(...) when a progress bar is wanted for debugging
    return [
        [annotation.get(part) for part in ("subject", "relation", "object")]
        for annotation in given_client.annotate(given_sentence)
    ]

def split_text_into_sentence(given_doc_names, given_docs):
    """
    Split the text of every named document into sentences.

    :param given_doc_names: names of the documents to process
    :param given_docs: dictionary in style of {doc_name: doc_text}
    :return: dictionary in style of {doc_name: list of sentences}
    """
    def _sentences_of(doc_text):
        # newlines become spaces, then a soft hyphen followed by a space (a line-break hyphenation) is dropped
        flattened_text = doc_text.replace('\n',' ').replace('\xad ','')
        return sent_tokenize(flattened_text)

    return {doc_name: _sentences_of(given_docs[doc_name]) for doc_name in given_doc_names}

# this function is modified from "version1_simple" file
def create_kg_csv(subjects, predicates, objects, re_type, language, given_id_list, model_used = ""):
    """
    Write the triples to "triples_data/<re_type>[/<model_used>]/kg_of_<re_type>_<language>.csv".

    [source(subject) --relation(predicate)--> target(object)]
    Every cell (including the triple id) is written lower-cased and ASCII-transliterated; empty
    strings and None become "-". The path is built with os.path.join, the target folder is created
    when it does not exist yet, and non-string cells are converted with str() instead of crashing.

    :param subjects: source
    :param predicates: relation
    :param objects: target
    :param re_type: currently, we have only relation extraction type of "simple" and "predefined-dictionary"
    :param language: en, fr, de
    :param given_id_list: represents the id of triple, in this case it's from which directive/legislation it came from
    :param model_used: currently only "stanford_OpenIE"; adds one more folder level when given
    :return: returns nothing only creates the csv file
    """
    # field names
    fields = [language + '_Subject', language + '_Predicate', language + '_Object', "Triple_ID"]

    folder_parts = [os.getcwd(), "triples_data", re_type]
    if model_used:
        folder_parts.append(model_used)
    target_folder = os.path.join(*folder_parts)
    os.makedirs(target_folder, exist_ok=True)
    filename = os.path.join(target_folder, "kg_of_" + re_type + "_" + language + ".csv")

    def _normalise(cell):
        # empty and None cells are replaced with "-"
        if cell == "" or cell is None:
            cell = "-"
        return unidecode(str(cell).lower())

    # one row per subject; all four columns go through the same normalisation
    rows = [
        [_normalise(subjects[i]), _normalise(predicates[i]), _normalise(objects[i]), _normalise(given_id_list[i])]
        for i in range(len(subjects))
    ]

    # writing to csv file
    with open(filename, 'w', newline = '') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(fields) # first writing fields
        csv_writer.writerows(rows) # now the remaining record

# German: de, French: fr, but can work for any other languages too
def simple_translator(target_language, re_type, extra_info = "", model_used = ""): # assumes that en kg csv is already created
    """
    Translate the English triple csv into the target language, resuming a previous partial run.

    Reads "kg_of_<re_type>_en.csv" and appends the translated rows to
    "kg_of_<re_type>_<target_language><extra_info>.csv" in the same folder
    ("triples_data/<re_type>[/<model_used>]" below the working directory). The rows already present
    in the target file are counted first and that many English rows are skipped. When the target
    file is empty (or missing, in which case it is created) the header row is written first.

    Fix compared with the first version: on resume exactly the rows already written are skipped;
    previously one additional English row was skipped, so that triple was never translated.

    :param target_language: language to translate to, e.g. fr, de, ... (German: de, French: fr, ...)
    :param re_type: relation extraction type, part of the folder and file names
    :param extra_info: text appended to the target file name (before ".csv")
    :param model_used: optional extra folder level
    :return: nothing; the target csv is extended
    """
    translator = Translator()

    folder_parts = [os.getcwd(), "triples_data", re_type]
    if model_used:
        folder_parts.append(model_used)
    folder = os.path.join(*folder_parts)
    filename_english = os.path.join(folder, "kg_of_" + re_type + "_en.csv")
    filename_assigned_language = os.path.join(folder, "kg_of_" + re_type + "_" + target_language + extra_info + ".csv")

    # "a+" creates a missing target file and always writes at its end
    with open(filename_english, mode ="r", newline = '') as file_original, open(filename_assigned_language, mode = "a+", newline = '') as file_2:
        file_2.seek(0) # append mode starts at the end; rewind to count the existing rows
        current_size_of_new_kg = sum(1 for _ in csv.reader(file_2))
        print("Starting from row: " + str(current_size_of_new_kg))
        file_2.seek(0, os.SEEK_END)

        csv_writer = csv.writer(file_2)
        # first line is headers
        if current_size_of_new_kg == 0:
            csv_writer.writerow([target_language + "_Subject", target_language + "_Predicate", target_language + "_Object", "Triple_ID"])

        # English row 0 is the header; target row k corresponds to English row k,
        # so the next row to translate is the one right after the rows already written
        rows_to_skip = max(current_size_of_new_kg, 1)
        for row_number, lines in enumerate(csv.reader(file_original)): # each line is source - relation - target - id
            if row_number < rows_to_skip:
                continue

            source_en, relation_en, target_en = lines[0], lines[1], lines[2]
            # src(source) = english, dest(destination) = language to translate to
            translated_source = translator.translate(source_en, src = "en", dest = target_language)
            translated_relation = translator.translate(relation_en, src = "en", dest = target_language)
            translated_target = translator.translate(target_en, src = "en", dest = target_language)

            csv_writer.writerow([
                unidecode(translated_source.text),
                unidecode(translated_relation.text),
                unidecode(translated_target.text),
                unidecode(lines[3]),
            ])

# German: de, French: fr, but can work for any other languages too
def simple_translator_with_concept(target_language, re_type, given_concept_en, given_concept_na, extra_info = "", model_used = ""): # assumes that en kg csv is already created
    """
    Translate the English triple csv, preferring the official concept term where one exists.

    Every subject, relation and object is machine translated. When the English text is exactly an
    English concept, the translation is replaced by the term that the target-language concept table
    lists for the same id. Rows are appended to
    "kg_of_<re_type>_<target_language><extra_info>.csv" next to the English file
    ("triples_data/<re_type>[/<model_used>]" below the working directory).

    Changes compared with the first version: the three copy-pasted lookups share one helper, the id
    is read with .iloc[0] (int() on a one-element Series is deprecated and failed on duplicate
    terms), a concept id that is missing from the target table falls back to the machine
    translation instead of raising, and the target file is created when it does not exist.

    :param target_language: language to translate to, e.g. fr, de, ...
    :param re_type: currently, we have only relation extraction type of "simple" and "predefined_dictionary"
    :param model_used: currently we only have "stanford_OpenIE" model
    :param given_concept_en: concept dataframe (as ["ID", "EN"]) of english
    :param given_concept_na: concept dataframe (as ["ID", ".."]) of target language
    :param extra_info: default is empty, but can be used to pass any information about the "file" to be initialized
    :return: creates a new translated KG with the use of concepts
    """
    translator = Translator()

    folder_parts = [os.getcwd(), "triples_data", re_type]
    if model_used:
        folder_parts.append(model_used)
    folder = os.path.join(*folder_parts)
    filename_english = os.path.join(folder, "kg_of_" + re_type + "_en.csv")
    filename_assigned_language = os.path.join(folder, "kg_of_" + re_type + "_" + target_language + extra_info + ".csv")

    english_concepts = set(given_concept_en["EN"]) # constant-time "is this text a concept?" checks
    target_language_capitalized = target_language.upper()
    testing = False # set to True to print every concept replacement

    def _translate_field(english_text, label):
        # src(source) = english, dest(destination) = language to translate to
        machine_translation = translator.translate(english_text, src = "en", dest = target_language).text
        if english_text not in english_concepts:
            return machine_translation

        # the text is a concept: look up its id, then the term of that id in the target language
        concept_id = int(given_concept_en.loc[given_concept_en["EN"] == english_text, "ID"].iloc[0])
        official_terms = given_concept_na.loc[given_concept_na["ID"] == concept_id, target_language_capitalized]
        if official_terms.empty:
            return machine_translation

        if testing:
            print(label)
            print("English version: " + english_text)
            print("Pre: " + machine_translation)
            print("Pro: " + official_terms.iloc[0])
        return official_terms.iloc[0]

    # "a+" creates a missing target file and always writes at its end
    with open(filename_english, mode ="r", newline = '') as file_original, open(filename_assigned_language, mode = "a+", newline = '') as file_2:
        file_2.seek(0) # append mode starts at the end; rewind to count the existing rows
        current_size_of_new_kg = sum(1 for _ in csv.reader(file_2))
        print("Starting from row: " + str(current_size_of_new_kg))
        file_2.seek(0, os.SEEK_END)

        csv_writer = csv.writer(file_2)

        # NOTE(review): unlike simple_translator, no header row and no Triple_ID column are written
        # here, and the first (current size + 1) English rows are skipped. That skip lines up with a
        # header-less target file; with a pre-initialised header one row would be lost - confirm.
        for row_number, lines in enumerate(csv.reader(file_original)): # each line is source - relation - target
            if row_number <= current_size_of_new_kg:
                continue

            source_en, relation_en, target_en = lines[0], lines[1], lines[2]
            temp_row = [
                _translate_field(source_en, "Source:"),
                _translate_field(relation_en, "Relation:"),
                _translate_field(target_en, "Target"),
            ]

            # the zero-width space character gives an error when written, so it is removed
            csv_writer.writerow([cell.replace(u'\u200b', '') for cell in temp_row])

def get_translation_through_eurovoc(given_text, given_concept, given_concept_target):
    """
    Translate a concept term through its EuroVoc id.

    Both concept tables need to be dataframes as ["ID", ".."] (id column named "ID", term in the
    second column). The id of the first row whose term equals given_text is looked up in the target
    table and the term of the first matching row is returned.

    The lookup is done with a boolean mask, so it no longer depends on the dataframe having the
    default 0..n-1 index, and a missing id in the target table yields None instead of an IndexError.

    :param given_text: the term to translate
    :param given_concept: concept dataframe of the language given_text is written in
    :param given_concept_target: concept dataframe of the language to translate to
    :return: the corresponding term of the target table, or None when there is no match
    """
    source_terms = given_concept.iloc[:, 1]
    matching_ids = given_concept.loc[source_terms == given_text, "ID"]
    if matching_ids.empty:
        return None

    target_rows = given_concept_target.loc[given_concept_target["ID"] == matching_ids.iloc[0]]
    if target_rows.empty:
        return None
    return target_rows.iloc[0, 1]

def get_unique_words_from_triples(given_list_of_triples):
    """
    Collect the distinct subjects, relations and objects of a list of triples.

    :param given_list_of_triples: iterable of triples, each indexable as [subject, relation, object]
    :return: (unique_subjects, unique_relations, unique_objects), each in order of first appearance
    """
    def _unique_in_order(values):
        try:
            # dict keys keep insertion order, so duplicates are dropped in linear time
            return list(dict.fromkeys(values))
        except TypeError:
            # unhashable entries: fall back to the pairwise comparison
            unique_values = []
            for value in values:
                if value not in unique_values:
                    unique_values.append(value)
            return unique_values

    triples = list(given_list_of_triples)
    unique_subjects = _unique_in_order([triple[0] for triple in triples])
    unique_relations = _unique_in_order([triple[1] for triple in triples])
    unique_objects = _unique_in_order([triple[2] for triple in triples])

    return unique_subjects, unique_relations, unique_objects

def get_neighbouring_groups(given_text):
    """
    List every run of neighbouring words of a text.

    Example: "I need to" -> ["I", "need", "to", "I need", "need to", "I need to"]

    :param given_text: the text; it is split on whitespace
    :return: all groups of 1 word, then all groups of 2 words, ... each joined with single spaces
    """
    words = given_text.split()
    word_total = len(words)

    # group sizes grow from 1 to the full length; within one size the window slides left to right
    return [
        " ".join(words[start:start + size])
        for size in range(1, word_total + 1)
        for start in range(word_total - size + 1)
    ]

def create_combined_eurovoc(given_lans):
    """
    Combine the "eurovoc_<language>.tsv" files of several languages into "combined_eurovoc.csv".

    Every language file is sorted by ID; the ids are taken from the first language and the second
    column of each file becomes one column (lower-cased and ASCII-transliterated) of the result.
    The rows are combined by position, so the files are expected to contain the same ids.
    Works for any number of languages (previously exactly three were required).

    :param given_lans: list of language codes, e.g. ["en", "fr", "de"]
    :return: nothing; the csv is written to the current working directory
    """
    fields = ['ID']

    # adding id
    id_data = pd.read_csv("eurovoc_" + given_lans[0] + ".tsv",sep='\t').sort_values(by=["ID"])
    complete_data = [list(id_data.iloc[:, 0])]

    # adding the concepts
    for lan in given_lans:
        fields.append(lan.upper())
        lan_data = pd.read_csv("eurovoc_" + lan + ".tsv",sep='\t').sort_values(by=["ID"]) # TODO what about updated version?
        complete_data.append([unidecode(term.lower()) for term in lan_data.iloc[:, 1]])

    # one output row per id, taking the entry at the same position from every column
    rows = [[column[i] for column in complete_data] for i in range(len(complete_data[0]))]

    # writing to csv file
    filename = os.path.join(os.getcwd(), "combined_eurovoc.csv")
    with open(filename, 'w', newline = '') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(fields) # first writing fields
        csv_writer.writerows(rows) # now the remaining record

def update_triple_csv_file_with_concepts(re_type, language, given_concept_list, model_used = "", extra_info = ""):
    """
    Adds a "<column>_concept" column for every column of the triple csv file (except "Triple_ID") and stores the
    result as "kg_of_<re_type>_with_concepts_<language>[_<extra_info>].csv" in the same folder as the input file.

    The input file "kg_of_<re_type>_<language>[_<extra_info>].csv" is read from the folder
    "triples_data/<re_type>[/<model_used>]" below the current working directory. A cell that equals "-" gets the
    placeholder "[]"; for every other cell the result of "find_corresponding_eurovoc_concept_jaro_winkler" is stored.

    re_type: name of the relation extraction type (folder name and part of the file name)
    language: language code; stopwords are only loaded for "en", "fr" and "de", otherwise an empty list is used
    given_concept_list: list of concepts that is handed to the matching function
    model_used: optional sub folder of the model that produced the triples
    extra_info: optional suffix of the file name (added as "_<extra_info>")

    The working directory is always restored, even if reading or matching fails.
    """
    old_path = os.getcwd()
    path = os.path.join(old_path, "triples_data", re_type)
    if model_used: path = os.path.join(path, model_used)

    if extra_info: extra_info = "_" + extra_info

    nltk_language_names = {"en": 'english', "fr": 'french', "de": 'german'}
    stopwords_list = []
    if language in nltk_language_names: stopwords_list = stopwords.words(nltk_language_names[language])

    # NOTE(review): the folder change is kept because the matching function is called while it is active;
    # it is unclear from here whether that function relies on the working directory
    os.chdir(path)
    try:
        data = pd.read_csv("kg_of_" + re_type + "_" + language + extra_info + ".csv")

        col_list = list(data.columns)
        col_list.remove("Triple_ID")

        for col in col_list:
            print("Currently at column: " + col)
            temp_col = []

            for current_index, cell in enumerate(data[col]):
                if current_index % 1000 == 0: print("Row" + str(current_index), end = ", ")
                if cell == "-": temp_col.append("[]") # "-" marks an empty element, so there is nothing to match
                else: temp_col.append(find_corresponding_eurovoc_concept_jaro_winkler(cell, given_concept_list, stopwords_list))

            data[col + "_concept"] = temp_col

        # writing to csv file
        file_name = os.path.join(path, "kg_of_" + re_type + "_with_concepts_" + language + extra_info + ".csv")
        data.to_csv(file_name, index=False)
    finally:
        os.chdir(old_path) # previously skipped when an exception occurred, leaving the notebook in the wrong folder

def update_df_according_to_neo4j(re_type, language, model_used = "", extra_info = ""):
    """
    Rewrites the three concept columns of "kg_of_<re_type>_with_concepts_<language>[_<extra_info>].csv" in place,
    turning the list style "['a', 'b', 'c']" into "a:b:c".

    The file is looked up in "triples_data/<re_type>[/<model_used>]" below the current working directory.
    The columns "<language>_Subject_concept", "<language>_Predicate_concept" and "<language>_Object_concept" are
    processed. A cell that is "[]" or missing (empty in the csv file, which pandas reads as NaN) becomes "-".
    In every other cell ", " is replaced by ":" and the characters [ ] " ' are removed.

    re_type: name of the relation extraction type (folder name and part of the file name)
    language: language code used as prefix of the column names and inside the file name
    model_used: optional sub folder of the model that produced the triples
    extra_info: optional suffix of the file name (added as "_<extra_info>")

    The working directory is not changed by this function.
    """
    path = os.path.join(os.getcwd(), "triples_data", re_type)
    if model_used: path = os.path.join(path, model_used)

    if extra_info: extra_info = "_" + extra_info
    file_name = os.path.join(path, "kg_of_" + re_type + "_with_concepts_" + language + extra_info + ".csv")
    data = pd.read_csv(file_name)

    col_list = [language + "_Subject_concept", language + "_Predicate_concept", language + "_Object_concept"]

    for col in col_list:
        print("Currently at column: " + col)
        temp_col = []

        for cell in data[col]:
            # the old check '("[]" or None)' only compared against "[]", so a missing cell crashed on ".replace"
            if pd.isna(cell) or str(cell) == "[]": temp_col.append("-")
            else: temp_col.append(str(cell).replace(", ", ":").replace("[", "").replace("]", "").replace("\"", "").replace("'", ""))

        data[col] = temp_col

    # writing to csv file (same file that was read)
    data.to_csv(file_name, index=False)

def fix_duplicate(re_type, language, model_used = "", extra_info = ""):
    """
    Removes fully duplicated rows from "kg_of_<re_type>_<language>[_<extra_info>].csv" and overwrites the file.

    The file is looked up in "triples_data/<re_type>[/<model_used>]" below the current working directory.
    Of several identical rows the first one is kept.

    re_type: name of the relation extraction type (folder name and part of the file name)
    language: language code that is part of the file name
    model_used: optional sub folder of the model that produced the triples
    extra_info: optional suffix of the file name (added as "_<extra_info>"), same meaning as in the other
                triple file functions; the default keeps the previous behaviour

    The working directory is not changed by this function.
    """
    path = os.path.join(os.getcwd(), "triples_data", re_type)
    if model_used: path = os.path.join(path, model_used)

    if extra_info: extra_info = "_" + extra_info
    target_path = os.path.join(path, "kg_of_" + re_type + "_" + language + extra_info + ".csv")

    data = pd.read_csv(target_path)
    data.drop_duplicates().to_csv(target_path, index = False)

def translate_text_with_google_api(given_text, target_lan, source_lan = "en"):
    """
    Translates the given text with googletrans and returns the translation passed through unidecode.

    given_text: text to translate
    target_lan: language code of the language to translate into, e.g. "fr" or "de"
    source_lan: language code of the given text; defaults to "en", which was the fixed value before

    A new Translator is created on every call.
    """
    translated = Translator().translate(given_text, src = source_lan, dest = target_lan)
    return unidecode(translated.text)

# Pipeline

In [3]:
# Pipeline configuration: put your own project path dir here
# NOTE(review): "my_path" is an absolute path of one specific machine and has to be adapted before running
start_time = time.time() # reference point for the elapsed time that is printed later in the pipeline
my_path = "C:\\Users\\dnaen\\PycharmProjects\\bachelor_thesis_23"
lan = "en" # language code of the language that is processed first

#### 1. Importing Dataset

In [25]:
# this has to be run only once, because it creates the eurovoc_en.tsv file (which should already be there)
# create_tsv_of_language("en")

In [26]:
# to make sure that we are in the original working directory
# (the file names used in the following cells are relative, so they are resolved against this folder)
data_path = my_path + "\\src\\main"
os.chdir(data_path)
print(os.getcwd()) # this should return something like "...\src\main"

C:\Users\dnaen\PycharmProjects\bachelor_thesis_23\src\main


In [27]:
# name of the EuroVoc tsv file of the current language (relative to the working directory)
tsv_file = f"eurovoc_{lan}.tsv"

# getting info of ids and concepts from the tsv file
(
    eurovoc_dic,
    eurovoc_reverse_dic,
    id_list,
    concept_list,
) = tsv_dic_processing(tsv_file)
print('Eurovoc imported')

Eurovoc imported


In [28]:
# Extracting all existing txt documents in the path
# (the folder is "<my_path>\data\<lan>\directives_txt"; "data_path" is overwritten here)
data_path = my_path + "\\data\\" + lan + "\\directives_txt"
document_name_list = find_folder_with_type(data_path, '.txt') # detection of txt files in the folder
document_dic = folder_list_to_dic(data_path, document_name_list) # storing document content in a dictionary

importing Directive_(EU)_2016_1919_en.txt ...
importing Directive_(EU)_2016_343_en.txt ...
importing Directive_(EU)_2016_800_en.txt ...
importing Directive_2010_64_EU_en.txt ...
importing Directive_2012_13_EU_en.txt ...
importing Directive_2013_48_EU_en.txt ...


#### 2. Entity Extraction (NER)

In [29]:
# tagging document
# data_path = my_path + "\\data\\" + lan + "\\directives_txt_tagged"
# "updated_concept_list" is built from the imported documents and the EuroVoc concepts of the current language
# NOTE(review): presumably a dictionary of concept -> id (it is written to a tsv file afterwards) - confirm
updated_concept_list = update_concept_list(document_name_list, document_dic, concept_list, eurovoc_reverse_dic)

Going through  Directive_(EU)_2016_1919_en.txt ...
Going through  Directive_(EU)_2016_343_en.txt ...
Going through  Directive_(EU)_2016_800_en.txt ...
Going through  Directive_2010_64_EU_en.txt ...
Going through  Directive_2012_13_EU_en.txt ...
Going through  Directive_2013_48_EU_en.txt ...


In [30]:
# this has to be run only once (if the file "updated_eurovoc_en.tsv" already exists there is no need to run it)
create_tsv_of_any_given_concept(updated_concept_list, lan)

#### 3. Relation Extraction (RE)

In [31]:
# run it once
# !pip install stanford_openie

In [32]:
# Settings handed to the Stanford OpenIE client, see https://stanfordnlp.github.io/CoreNLP/openie.html#api
# The default value of openie.affinity_probability_cap is 1/3, here it is raised to 2/3.
properties = {'openie.affinity_probability_cap': 2 / 3}

In [33]:
# Relation extraction: every sentence of every document is sent to the Stanford OpenIE server.
# "list_of_triples" collects all found triples, "triple_id" holds for each of them the name of its document.
document_dic_tokenized = split_text_into_sentence(document_name_list, document_dic)
list_of_triples = []
triple_id = []
testing = False # set to True to print every sentence together with its triples

with StanfordOpenIE(properties=properties) as client: # opening the server
    if testing: print()
    for doc_name in document_name_list:
        for sentence in document_dic_tokenized[doc_name]:
            if testing:
                print("Sentence")
                print(sentence)
            current_triple_list_of_sentence = get_triples_stanford_openie(client, sentence) # returns [subject, relation, object]
            if current_triple_list_of_sentence: # only run the loop if list not empty
                # a sentence can have multiple triples
                for index, current_triple in enumerate(current_triple_list_of_sentence):
                    if testing: print("Triple " + str(index) + ": " + str(current_triple))
                    list_of_triples.append(current_triple)

                # one document name per triple of this sentence
                triple_id.extend([doc_name] * len(current_triple_list_of_sentence))
            if testing: print()
        print("Document '" + doc_name + "' finished")


Starting server with command: java -Xmx8G -cp C:\Users\dnaen\.stanfordnlp_resources\stanford-corenlp-4.5.3/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-bded9fa9149844ab.props -preload openie
Document 'Directive_(EU)_2016_1919_en.txt' finished
Document 'Directive_(EU)_2016_343_en.txt' finished
Document 'Directive_(EU)_2016_800_en.txt' finished
Document 'Directive_2010_64_EU_en.txt' finished
Document 'Directive_2012_13_EU_en.txt' finished
Document 'Directive_2013_48_EU_en.txt' finished


In [34]:
# Splitting the triples into three parallel lists (same order as "list_of_triples")
source = [triple[0] for triple in list_of_triples] # subjects
relation = [triple[1] for triple in list_of_triples] # relations
target = [triple[2] for triple in list_of_triples] # objects

#### 4. Build Knowledge Graph

In [35]:
# has to be run only once: writes "combined_eurovoc.csv" for the three languages
create_combined_eurovoc(given_lans=["en", "fr", "de"])

In [36]:
# Handling the "main" language first
# The working directory is set back to the main source folder because the file name below is relative.
data_path = my_path + "\\src\\main"
os.chdir(data_path)

# EuroVoc table of the main language (tab separated)
path_to_eurovoc = "eurovoc_" + lan + ".tsv"
data_of_lan = pd.read_csv(path_to_eurovoc,sep='\t')

# NOTE(review): presumably writes the triples to the KG csv file of the main language - confirm in create_kg_csv
create_kg_csv(source, relation, target, "predefined_dictionary", lan, triple_id, "stanford_OpenIE")
print(time.time() - start_time) # seconds passed since "start_time" was set

253.50349164009094


In [37]:
# removing duplicated rows from the triple csv file of the main language
fix_duplicate(re_type="predefined_dictionary", language=lan, model_used="stanford_OpenIE")

In [47]:
# adding the concepts each triple element is related to
# (the working directory is reset first, because the triple folder is looked up relative to it)
data_path = my_path + "\\src\\main"
os.chdir(data_path)
update_triple_csv_file_with_concepts(
    re_type="predefined_dictionary",
    language=lan,
    given_concept_list=list(data_of_lan[lan.upper()]),
    model_used="stanford_OpenIE",
)

Currently at column: en_Subject
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, Currently at column: en_Predicate
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, Currently at column: en_Object
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, 

In [48]:
# the concept lists are stored as a:b:c:d instead of [a,b,c,d], because "," causes problems in neo4j
update_df_according_to_neo4j(re_type="predefined_dictionary", language=lan, model_used="stanford_OpenIE")

Currently at column: en_Subject_concept
Currently at column: en_Predicate_concept
Currently at column: en_Object_concept


#### 5. Translate


In [6]:
# creating the French KG
# CAUTION: since this uses an API it can time out; if that happens just run the cell again
# (the translator continues from the row where it stopped, so nothing is lost)
simple_translator("fr", "predefined_dictionary", model_used = "stanford_OpenIE") # CAUTION: since this uses API it can "timeout", so if happens just run it again (as the translator just starts from the place it has left it's fine)

Starting from row: 5689


In [9]:
# creating the German KG (uses the same translation API as the French one, so it can time out as well)
simple_translator("de", "predefined_dictionary", model_used = "stanford_OpenIE")

Starting from row: 4301


#### 6. KG finalization for other languages
(this part has to be run after the "translation" step has been completed)


In [4]:
# NOTE(review): the folder change below is commented out, so this cell relies on the working directory
# already being the main source folder - the file name and the triple folder are resolved relative to it
# my_path = "C:\\Users\\dnaen\\PycharmProjects\\bachelor_thesis_23"
# data_path = my_path + "\\src\\main"
# os.chdir(data_path)

lan = "fr" # from here on "lan" no longer refers to the main language

# EuroVoc table of the French concepts (tab separated)
path_to_eurovoc = "eurovoc_" + lan + ".tsv"
data_of_lan = pd.read_csv(path_to_eurovoc, sep='\t')
# first the concept columns are added, then they are converted to the a:b:c style
update_triple_csv_file_with_concepts("predefined_dictionary", lan, list(data_of_lan[lan.upper()]), "stanford_OpenIE")
update_df_according_to_neo4j("predefined_dictionary", lan, "stanford_OpenIE")

Currently at column: fr_Subject
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, Currently at column: fr_Predicate
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, Currently at column: fr_Object
Row0, Row1000, Row2000, Row3000, Row4000, Row5000, Row6000, Row7000, Row8000, Row9000, Row10000, Row11000, Row12000, Currently at column: fr_Subject_concept
Currently at column: fr_Predicate_concept
Currently at column: fr_Object_concept


In [None]:
# Same finalization steps for German; the file name and the triple folder are resolved relative to the working directory
lan = "de"

# EuroVoc table of the German concepts (tab separated)
path_to_eurovoc = "eurovoc_" + lan + ".tsv"
data_of_lan = pd.read_csv(path_to_eurovoc, sep='\t')
# first the concept columns are added, then they are converted to the a:b:c style
update_triple_csv_file_with_concepts("predefined_dictionary", lan, list(data_of_lan[lan.upper()]), "stanford_OpenIE")
update_df_according_to_neo4j("predefined_dictionary", lan, "stanford_OpenIE")

# Storage

In [0]:
# nltk.download('punkt') # unsupervised trainable model, which means it can be trained on unlabeled data (Data that has not been tagged with information identifying its characteristics, properties, or categories is referred to as unlabeled data.)

In [72]:
# To check what new concepts were added

# temp_data_of_en_old = pd.read_csv('eurovoc_en.tsv',sep='\t')
# temp = list(updated_concept_list.values())
# my_dict = {i: temp.count(i) for i in temp}
# multiple_elements = []
# for key, value in my_dict.items():
#     if value > 1:
#         multiple_elements.append(key)
#
# for elem in multiple_elements:
#     value = {i for i in updated_concept_list if updated_concept_list[i] == elem}
#
#     for val in value:
#         if val in list(temp_data_of_en_old["EN"]): print("Old one is: " + val)
#
#     print(str(value) + " in id: " + elem)
#     print()

In [1]:
# To try out triple extraction

# text = "How does the principle of mutual recognition in criminal matters rely on trust among Member States and the protection of suspects' rights as outlined in relevant articles and directives?"
#
# properties = {
#     'openie.affinity_probability_cap': 2 / 3,
# }
#
# list_of_triples = []
#
# with StanfordOpenIE(properties=properties) as client: # opening the server
#     current_triple_list_of_sentence = get_triples_stanford_openie(client, text) # returns [subject, relation, object]
#     if current_triple_list_of_sentence: # only run the loop if list not empty
#         index = 0
#         for current_triple in current_triple_list_of_sentence: # a sentence can have multiple triples
#             list_of_triples.append(current_triple)
#             index += 1
#
# a, b, c = get_unique_words_from_triples(list_of_triples)
# print("Subjects to look into:")
# print(a)
# print()
# print("Relations to look into:")
# print(b)
# print()
# print("Objects to look into:")
# print(c)

In [None]:
# To check whether grouping function works

# temp_text = "protection of suspect 's procedural rights regarding specific mechanisms"
# temp_text = temp_text.split()
# temp_group = ""
# group_list = []
# # groups of 2, groups of 3, groups of 4, ...
#
# for i in range(len(temp_text)):
#     print()
#     print("Groups of ", i+1)
#     for j in range(len(temp_text) - i):
#         for k in range(j, j+i+1):
#             if k == j+i: temp_group = temp_group + temp_text[k]
#             else: temp_group = temp_group + temp_text[k] + " "
#             print(temp_text[k], end=" ")
#         group_list.append(temp_group)
#         temp_group = ""
#         print("", end=" | ")

In [None]:
# for word in possible_words_list:
#         word = word.lower()
#         if word not in given_stopwords_list:
#             for concept in given_concept_list:
#                 concept_uni = unidecode(concept) # because the triples of other languages will be in unidecode
#                 if word in concept_uni and len(word.split()) == len(concept_uni.split()):
#                     print("Word: ", word)
#                     print("Found concept ", concept_uni)
#                     print()
#                     elems_concept.append(concept_uni)
#
#     return elems_concept

In [None]:
# To check whether "find_corresponding_eurovoc_concept" works

# --------------
# text = "Can the prosecution guarantee the protection of suspect's procedural rights regarding the specific mechanisms and common minimum standards that are in place, such as the Charter of Fundamental Rights and the European Convention on Human Rights?"
#
#
# properties = {
#     'openie.affinity_probability_cap': 2 / 3,
# }
#
# list_of_triples = []
#
# with StanfordOpenIE(properties=properties) as client: # opening the server
#     current_triple_list_of_sentence = get_triples_stanford_openie(client, text) # returns [subject, relation, object]
#     if current_triple_list_of_sentence: # only run the loop if list not empty
#         index = 0
#         for current_triple in current_triple_list_of_sentence: # a sentence can have multiple triples
#             list_of_triples.append(current_triple)
#             index += 1
#
# temp_triples = list_of_triples

# ----------------------------


# for triple in list_of_triples:
#     print("Triple")
#     print(triple)
#     print("Found concepts")
#     for elem in triple:
#         a = find_corresponding_eurovoc_concept(elem, concept_list, stopwords.words('english'))
#         print(a, end = ", ")
#
#     print()
#     print()

In [None]:
# for testing the speed of "get_neighbouring_groups" function

# start_time = time.time()
# text = "Can the prosecution guarantee the protection of suspect's procedural rights regarding the specific mechanisms and common minimum standards that are in place, such as the Charter of Fundamental Rights and the European Convention on Human Rights. Can the prosecution guarantee the protection of suspect's procedural rights regarding the specific mechanisms and common minimum standards that are in place, such as the Charter of Fundamental Rights and the European Convention on Human Rights."
#
# a = get_neighbouring_groups(text)
#
# end_time = time.time()
# print(end_time - start_time)
# print()
# print(len(a))
# # 0.06703352928161621 # 2628

In [None]:
# saving the already created concepts
# if it's not, remember you can use "create_tsv_of_language(given_language)" or "create_tsv_of_any_given_concept(given_concept_dict, given_language)" to create them
# data_of_en = pd.read_csv('updated_eurovoc_en.tsv',sep='\t')
# data_of_de = pd.read_csv('eurovoc_de.tsv',sep='\t')
# data_of_fr = pd.read_csv('eurovoc_fr.tsv',sep='\t')

In [None]:
# for testing

# my_path = "C:\\Users\\dnaen\\PycharmProjects\\bachelor_thesis_23\\src\\main"
# os.chdir(my_path)
#
# data_of_en = pd.read_csv('updated_eurovoc_en.tsv',sep='\t')
# lan = "en"
#
# data_path = "C:\\Users\\dnaen\\PycharmProjects\\bachelor_thesis_23\\src\\main\\triples_data\\predefined_dictionary\\stanford_OpenIE"
# os.chdir(data_path)
# temp_data = pd.read_csv("kg_of_predefined_dictionary_with_concepts_en.csv")



In [2]:
# TO CHECK THE PERFORMANCE OF THE STRING MEASURES

# To check whether "find_corresponding_eurovoc_concept" works

# text = "What efforts have the written directives undertaken to safeguard children who are suspects or accused individuals?"
#
#
# properties = {
#     'openie.affinity_probability_cap': 2 / 3,
# }
#
# list_of_triples = []
#
# with StanfordOpenIE(properties=properties) as client: # opening the server
#     current_triple_list_of_sentence = get_triples_stanford_openie(client, text) # returns [subject, relation, object]
#     if current_triple_list_of_sentence: # only run the loop if list not empty
#         index = 0
#         for current_triple in current_triple_list_of_sentence: # a sentence can have multiple triples
#             list_of_triples.append(current_triple)
#             index += 1
#
# temp_triples = list_of_triples
#
# lan = "en"
# tsv_file = "eurovoc_" + lan + ".tsv"
#
# # getting info of ids and concepts from the tsv file
# eurovoc_dic, eurovoc_reverse_dic, id_list, concept_list = tsv_dic_processing(tsv_file)
#
# for triple in list_of_triples:
#     print("Triple")
#     print(triple)
#     print("Found concepts")
#     for elem in triple:
#         a = find_corresponding_eurovoc_concept_jaro_winkler_with_english_semantic(elem, concept_list, stopwords.words('english'))
#         print(a, end = ", ")
#
#     print()
#     print()

In [4]:
# TO CHECK PERFORMANCE OF SEMANTIC MATCH
from gensim.models import Word2Vec
# import gensim.downloader as api
# # TODO train word2vec
# model = api.load("word2vec-google-news-300") # have to run it once to download to pc, in later runs it just takes the downloaded file from pc (so takes much shorter after second and future times)


In [20]:
# continuation
# import numpy as np
# from scipy import spatial
# s0 = 'place'
# s1 = 'plate'
#
#
# def preprocess(s):
#     return [i.lower() for i in s.split()]
#
# def get_vector(s):
#     return np.sum(np.array([model[i] for i in preprocess(s)]), axis=0)
#
# print('s0 vs s1 ->',1 - spatial.distance.cosine(get_vector(s0), get_vector(s1)))

s0 vs s1 -> 0.07994840294122696


In [None]:
# for observing the translation
#
# subject = ['principle', 'mutual recognition']
# relation = ['does rely as', 'does rely on', 'is in']
# objects = ['as outlined relevant articles', 'as outlined articles', 'trust', 'trust among Member States', 'criminal matters']
#
# lan = "de"
# print("Subject")
# for elem in subject:
#     print(translate_text_with_google_api(elem, lan))
#
# print()
# print("Relation")
# for elem in relation:
#     print(translate_text_with_google_api(elem, lan))
#
# print()
# print("Object")
# for elem in objects:
#     print(translate_text_with_google_api(elem, lan))