In [1]:
# imports
import pandas as pd
import spacy
from spacy.matcher import Matcher
import networkx as nx
from tqdm import tqdm
import os
import re
import csv
from googletrans import Translator # don't forget to run "!pip install googletrans==3.1.0a0" before using this
import spacy_transformers # might seem not being used but it is required to run the transformers
from unidecode import unidecode
from PyPDF2 import PdfReader

In [21]:
# functions
def find_folder_with_type(given_path, doc_type): # returns all documents found in path
    doc_list = []
    for doc in os.listdir(given_path):
        if re.search (r'.*\%s$' % doc_type, doc) is not None: # even though this shows as error in IDE it's fine
            doc_list.append(doc)
    return doc_list

def folder_to_nlp_doc(given_path, list_of_doc_names, given_nlp):
    temp_list = []
    # the input should be a list of file contained in a folder
    for file_name in list_of_doc_names:
        file_path = given_path + "\\" + file_name
        reader = PdfReader(file_path)

        all_text = ""
        for i in range(len(reader.pages)):
            text = reader.pages[i].extract_text()
            text = text.replace("\n", "")
            all_text = all_text + text

        temp_list.append(given_nlp(all_text))
    return temp_list

def get_entities_for_english(the_file):
    """
    Here we extract the elements in an unsupervised manner, i.e., we will use the grammar of the sentences. The extraction of a single word entity from a sentence is not a tough task. We can easily do this with the help of parts of speech (POS) tags. However, when an entity spans across multiple words, then POS tags alone are not sufficient. To fix this we basically save our previous text's info.

    requirements:
    - punctuation: ["punct"]
    - compound: ["compound", "acomp", "ccomp", "pcomp"]
    - modifier: ["acl", "advcl", "advmod", "amod", "appos", "meta", "neg", "nmod", "npadvmod", "nummod", "poss", "prep", "quantmod", "relcl"]
    - subject: ["csubj", "csubjpass", "nsubj", "nsubjpass"]
    - object: ["dobj", "oprd", "pobj"]
    """

    # params
    punctuation_parsers = ["punct"]
    compound_parsers = ["compound", "acomp", "ccomp", "pcomp"]
    modifier_parsers = ["acl", "advcl", "advmod", "amod", "appos", "meta", "neg", "nmod", "npadvmod", "nummod", "poss", "prep", "quantmod", "relcl"]
    subject_parsers = ["csubj", "csubjpass", "nsubj", "nsubjpass"]
    object_parsers = ["dobj", "oprd", "pobj"]

    # init
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag (the relationship between any two words is marked by a dependency tag) of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    # for holding the text that is associated with the current subject/object (can be multiple words)
    prefix = ""
    modifier = ""

    # going through each token
    for tok in nlp(the_file):
        if tok.dep_ not in punctuation_parsers: # if punctuation mark skip

            if tok.dep_ in compound_parsers:
                prefix = tok.text
                if prv_tok_dep in compound_parsers: # if the previous word was also a 'compound' then add to current text
                    prefix = prv_tok_text + " "+ tok.text

            # check if modifier (a modifier gives information about another word in the same sentence e.g., blue house)
            if tok.dep_ in modifier_parsers:
                modifier = tok.text
                if prv_tok_dep in compound_parsers: # if previous word was also a 'compound' then add to current text
                    modifier = prv_tok_text + " "+ tok.text

            # extract first entity - subject
            if tok.dep_ in subject_parsers:
                ent1 = modifier +" "+ prefix + " "+ tok.text
                # reset info
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # extract second entity - object
            if tok.dep_ in object_parsers:
                ent2 = modifier +" "+ prefix +" "+ tok.text
                # reset info
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip().lower(), ent2.strip().lower()]

def get_entities_for_french(the_file):
    """
    requirements:
    - punctuation = ["punct"]
    - compound: ["ccomp", "expl:comp", "xcomp", "fixed", "flat:foreign", "flat:name"]
    - modifier = ["acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "nmod", "nummod", "obl:mod"]
    - subject = ["expl:subj", "nsubj", "nsubj:pass"]
    - object = ["iobj", "obj"]
    """

    # params
    punctuation_parsers = ["punct"]
    compound_parsers = ["ccomp", "expl:comp", "xcomp", "fixed", "flat:foreign", "flat:name"]
    modifier_parsers = ["acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "nmod", "nummod", "obl:mod"]
    subject_parsers = ["expl:subj", "nsubj", "nsubj:pass"]
    object_parsers = ["iobj", "obj"]

    # init
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag (the relationship between any two words is marked by a dependency tag) of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    # for holding the text that is associated with the current subject/object (can be multiple words)
    prefix = ""
    modifier = ""

    # going through each token
    for tok in nlp(the_file):
        if tok.dep_ not in punctuation_parsers: # if punctuation mark skip

            if tok.dep_ in compound_parsers:
                prefix = tok.text
                if prv_tok_dep in compound_parsers: # if the previous word was also a 'compound' then add to current text
                    prefix = prv_tok_text + " "+ tok.text

            # check if modifier (a modifier gives information about another word in the same sentence e.g., blue house)
            if tok.dep_ in modifier_parsers:
                modifier = tok.text
                if prv_tok_dep in compound_parsers: # if previous word was also a 'compound' then add to current text
                    modifier = prv_tok_text + " "+ tok.text

            # extract first entity - subject
            if tok.dep_ in subject_parsers:
                ent1 = modifier +" "+ prefix + " "+ tok.text
                # reset info
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # extract second entity - object
            if tok.dep_ in object_parsers:
                ent2 = modifier +" "+ prefix +" "+ tok.text
                # reset info
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip().lower(), ent2.strip().lower()]

def get_entities_for_german(the_file):
    """
    requirements:
    - punctuation = ["punct"]
    - compound = ["adc", "avc", "nmc", "pnc", "uc", "svp", "re", "pm", "par", "dep", "cvc", "app", "ag", "ac"]
    - modifier = ["ams", "mnr", "mo"]
    - subject = ["sb", "sbp"]
    - object = ["oa", "oc", "og", "op"]
    """

    # params
    punctuation_parsers = ["punct"]
    compound_parsers = ["adc", "avc", "nmc", "pnc", "uc", "svp", "re", "pm", "par", "dep", "cvc", "app", "ag", "ac"]
    modifier_parsers = ["ams", "mnr", "mo"]
    subject_parsers = ["sb", "sbp"]
    object_parsers = ["oa", "oc", "og", "op"]

    # init
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag (the relationship between any two words is marked by a dependency tag) of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    # for holding the text that is associated with the current subject/object (can be multiple words)
    prefix = ""
    modifier = ""

    # going through each token
    for tok in nlp(the_file):
        if tok.dep_ not in punctuation_parsers: # if punctuation mark skip

            if tok.dep_ in compound_parsers:
                prefix = tok.text
                if prv_tok_dep in compound_parsers: # if the previous word was also a 'compound' then add to current text
                    prefix = prv_tok_text + " "+ tok.text

            # check if modifier (a modifier gives information about another word in the same sentence e.g., blue house)
            if tok.dep_ in modifier_parsers:
                modifier = tok.text
                if prv_tok_dep in compound_parsers: # if previous word was also a 'compound' then add to current text
                    modifier = prv_tok_text + " "+ tok.text

            # extract first entity - subject
            if tok.dep_ in subject_parsers:
                ent1 = modifier +" "+ prefix + " "+ tok.text
                # reset info
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # extract second entity - object
            if tok.dep_ in object_parsers:
                ent2 = modifier +" "+ prefix +" "+ tok.text
                # reset info
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip().lower(), ent2.strip().lower()]

def get_relation(the_sentence):
    temp_doc = nlp(the_sentence)

    # creating the rule-based Matcher object
    matcher = Matcher(nlp.vocab)

    # defining the pattern - Each pattern should be a list of dicts and each pattern should be saved in another list
    # ex: patterns = [[{"LOWER": "hello"}, {"LOWER": "world"}], [{"ORTH": "Google"}, {"ORTH": "Maps"}]]

    # This pattern tries to find the ROOT word in the sentence. Once the ROOT is identified, then the pattern checks whether it is followed by a preposition (‘prep’) or an agent word. If yes, then it is added to the ROOT word.
    pattern = [{'DEP':'ROOT'}, # check for token with dependency label root
            {'DEP':'prep','OP':"?"}, # other stuff
            {'DEP':'agent','OP':"?"},
            {'POS':'ADJ','OP':"?"}]

    # matcher.add("match_id", "patterns")
    matcher.add("matching_1", [pattern])

    matches = matcher(temp_doc)

    k = len(matches) - 1
    if k == -1: # meaning no match was found so return null
        return None

    span = temp_doc[matches[k][1]:matches[k][2]]
    return span.text

# def create_kg_csv_pandas(subjects, predicates, objects, re_type, language): # so source (subject) ----relation (predicate)----> target (object)
#     # field names
#     fields = [language + '_Subject', language + '_Predicate', language + '_Object']
#     filename = os.getcwd() + "\\triples_data\\" + re_type + "\\kg_of_" + re_type + "_" + language + ".csv"
#     # data rows of csv file
#     rows = [[subjects[i], predicates[i], objects[i]] for i in range(len(subjects))] # best to check if we have empty values
#
#     temp_df = pd.DataFrame(rows, columns = fields)
#     temp_df.to_csv(filename, header = fields)

def create_kg_csv(subjects, predicates, objects, re_type, language, given_id_list):
    """
    [source(subject) --relation(predicate)--> target(object)]
    :param subjects: source
    :param predicates: relation
    :param objects: target
    :param re_type: currently, we have only relation extraction type of "simple" and "predefined-dictionary"
    :param language: en, fr, de
    :param given_id_list: represents the id of triple, in this case it's from which directive/legislation it came from
    :return: returns nothing only creates the csv file
    """
    # field names
    fields = [language + '_Subject', language + '_Predicate', language + '_Object', "Triple_ID"]
    filename = os.getcwd() + "\\triples_data\\" + re_type + "\\kg_of_" + re_type +  "_" + language + ".csv"

    rows = [[subjects[i], predicates[i], objects[i], given_id_list[i]] for i in range(len(subjects))]

    # find out empty and None strings, replacing it with "-"
    for i in range(len(rows)):
        for j in range(len(rows[0])): # so len 3
            if rows[i][j] == "" or rows[i][j] is None: rows[i][j] = "-"

    # writing to csv file
    with open(filename, 'w', newline = '') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(fields) # first writing fields
        csv_writer.writerows(rows) # now the remaining record

# German: de, French: fr, but can work for any other languages too
def simple_translator(target_language, re_type, model_used): # assumes that en kg csv is already created
    translator = Translator()

    filename = os.getcwd() + "\\triples_data\\" + re_type + "\\" + model_used
    filename_english = filename + "\\kg_of_" + re_type + "_en.csv"
    filename_assigned_language = filename + "\\kg_of_" + re_type + "_GoogleTrans_" + target_language + ".csv"

    with open(filename_english, mode ="r") as file_original, open(filename_assigned_language, mode = "r+", newline = '') as file_2:
        csv_reader = csv.reader(file_original)
        csv_reader_file2 = csv.reader(file_2)
        csv_writer = csv.writer(file_2)

        current_size_of_new_kg = 0
        for step in csv_reader_file2:
            current_size_of_new_kg += 1
        print("Starting from row: " + str(current_size_of_new_kg))

        current_step = 0
        for lines in csv_reader: # each line is a list of 3 elements (source - relation - target)
            if current_step > current_size_of_new_kg:
                source_en, relation_en, target_en = lines[0], lines[1], lines[2]
                # src(source) = english, dest(destination) = language to translate to
                translated_source, translated_relation, translated_target = translator.translate(source_en, src = "en", dest = target_language), translator.translate(relation_en, src = "en", dest = target_language), translator.translate(target_en, src = "en", dest = target_language)

                temp_row = [translated_source.text, translated_relation.text, translated_target.text]
                index = 0
                for row in temp_row:
                    temp_row[index] = row.replace(u'\u200b', '') # this "space" character gives error, but it just adds extra space so just removing it
                    index += 1
                csv_writer.writerow(temp_row)
            else:
                current_step += 1

# PIPELINE

In [74]:
# your own project path dir here
my_path = "C:\\Users\\dnaen\\PycharmProjects\\bachelor_thesis_23"

#### 1. Loading Pipeline

In [75]:
# for downloading the pipeline
# !python -m spacy download fr_dep_news_trf
# !python -m spacy download de_dep_news_trf
# !python -m spacy download en_core_web_trf
# !pip install spacy-transformers # needed to install this to run trf

In [83]:
lan = "de" # options: en, fr, de
nlp = spacy.load("de_dep_news_trf") # English transformer pipeline (roberta-base)

#### 2. Importing Dataset

In [84]:
### Importing Dataset ###

# automatically extracting path
current_path = os.getcwd()
data_path = my_path + "\\data\\" + lan + "\\directives_pdf"

doc_list = find_folder_with_type(data_path, '.pdf') # detection of pdf files in the folder

# importing the file, here doc is like a "list" of tokens (each tok is either a word, number, ...)
file_list = folder_to_nlp_doc(data_path, doc_list, nlp)

#### 3. Entities (Nodes) and Relations (Edges) Extraction

In [85]:
list_of_entity_pairs = [] # this is a list of lists (so each index contains the info of a data file)
list_of_relations = []
triple_id = []
count = 1

for file, doc_name in zip(file_list, doc_list):
    print("Document number " + str(count))
    count += 1

    sents = list(file.sents) # extract sentences of the document (only checks for ".")
    sents_pruned = [sentence for sentence in sents if len(sentence) > 5]

    print("Entities (Nodes - subject/object) Extraction")
    entity_pairs = []

    # Extracting the entity pairs of each sentence.
    if lan == "en": entity_pairs = [get_entities_for_english(str(sent)) for sent in sents_pruned]
    elif lan == "fr": entity_pairs = [get_entities_for_french(str(sent)) for sent in sents_pruned]
    elif lan == "de": entity_pairs = [get_entities_for_german(str(sent)) for sent in sents_pruned]
    else: print("Current chosen language does not exist!")

    print("Relations (Edges - predicate) Extraction")
    relations = [get_relation(str(sent)) for sent in sents_pruned] # Here we assume predicate is the main verb in a sentence.

    # pd.Series(relations).value_counts()[:10] # to visualize
    triple_id.extend([doc_name] * len(sents_pruned))
    list_of_entity_pairs.extend(entity_pairs)
    list_of_relations.extend(relations)

Document number 1
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 2
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 3
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 4
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 5
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction
Document number 6
Entities (Nodes - subject/object) Extraction
Relations (Edges - predicate) Extraction


#### 4. Build Knowledge Graph

In [86]:
# extract subject
source = [i[0] for i in list_of_entity_pairs]

# extract object
target = [i[1] for i in list_of_entity_pairs]

In [87]:
print(source[0:10])

['', '', '', 'personen', '', '', 'gegen  verfahren', '', 'richtlinie', '']


In [88]:
temp_source = []
temp_relation = []
temp_target = []

source_uni = []
relation_uni = []
target_uni = []

# for removing the soft hyphen (but can be used for removal of other stuff too)
# this has to be done before translating to unicode because otherwise it can't detect it for some reason
for i in range(len(source)):
    temp_source.append(source[i].replace(" ­", "").replace("  ", " "))
    if list_of_relations[i] is not None: temp_relation.append(list_of_relations[i].replace(" ­", "").replace("  ", " "))
    else: temp_relation.append(list_of_relations[i])
    temp_target.append(target[i].replace(" ­", "").replace("  ", " "))

# for making it usable in english text
for i in range(len(temp_source)):
    source_uni.append(unidecode(temp_source[i]))
    if temp_relation[i] is not None: relation_uni.append(unidecode(temp_relation[i])) # replacing None with "-"
    else: relation_uni.append("-")
    target_uni.append(unidecode(temp_target[i]))


if len(source_uni) == len(relation_uni) == len(target_uni): # making sure we get same number of rows in the end
    print("Everything's fine")

Everything's fine


In [89]:
# kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})
create_kg_csv(source_uni, relation_uni, target_uni, "simple", lan, triple_id)

#### 5. Translate

In [7]:
# TODO REGEX SEARCH
# need to do some kind of regex search here
simple_translator("de", "simple") # CAUTION: since this uses API it can "timeout", so if happens just run it again

In [8]:
simple_translator("fr", "simple")

# TRASH

In [9]:
"""We will use the networkx library to create a network from this dataframe. Which is going to be a directed graph allowing us to draw a line from subject to object. However, we also need to use pyvis library because networkx's visualize method is currently not working."""

# create a directed-graph from a dataframe
# directed_graph = nx.from_pandas_edgelist(kg_df, "source", "target", edge_attr=True, create_using=nx.MultiDiGraph())
#
# from pyvis.network import Network
#
# net = Network(notebook=True, cdn_resources='remote')
#
# net.from_nx(directed_graph)
# net.show("example.html")

NameError: name 'kg_df' is not defined

In [None]:
# count = 1
#
# for file in file_list:
#     print("Document number " + str(count))
#     count += 1
#
#     sents = list(file.sents) # extract sentences of the document (only checks for ".")
#
#     for i in range(30):
#         print("Sentence number: " + str(i))
#         print("The sentence: " + str(sents[i]))
#         print("Entities: " + str(get_entities(str(sents[i]))))
#         print("Relation: " + str(get_relation(str(sents[i]))))
#         print()

In [None]:
# from PyPDF2 import PdfReader
#
# # creating a pdf reader object
# reader = PdfReader("C:\\Users\\dnaen\\PycharmProjects\\bachelor_thesis_23\\data\\fr\\directives_pdf\\Directive_(EU)_2016_800_fr.pdf")
#
# all_text = ""
# for i in range(len(reader.pages)):
#     text = reader.pages[i].extract_text()
#     text = text.replace("\n", "")
#     all_text = all_text + text
#
# print(all_text)

In [None]:
# count = 1
# for sentence in list(doc.sents):
#     print("Sentence: " + str(count))
#     print(sentence)
#     print()
#     count += 1

In [None]:
# https://spacy.io/models/fr#fr_dep_news_trf
french_list = ["ROOT", "acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "aux:pass", "aux:tense", "case", "cc", "ccomp", "conj", "cop", "dep", "det", "expl:comp", "expl:pass", "expl:subj", "fixed", "flat:foreign", "flat:name", "iobj", "mark", "nmod", "nsubj", "nsubj:pass", "nummod", "obj", "obl:agent", "obl:arg", "obl:mod", "parataxis", "punct", "vocative", "xcomp"]

# https://spacy.io/models/en#en_core_web_trf
english_list = ["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj", "predet", "prep", "prt", "punct", "quantmod", "relcl", "xcomp"]

# https://spacy.io/models/de#de_dep_news_trf
german_list = ["ROOT", "ac", "adc", "ag", "ams", "app", "avc", "cc", "cd", "cj", "cm", "cp", "cvc", "da", "dep", "dm", "ep", "ju", "mnr", "mo", "ng", "nk", "nmc", "oa", "oc", "og", "op", "par", "pd", "pg", "ph", "pm", "pnc", "punct", "rc", "re", "rs", "sb", "sbp", "svp", "uc", "vo"]

for tk in german_list:
    print(tk)
    print(spacy.explain(tk))
    print()

In [None]:
def get_entities_for_english(the_file):
    """
    Here we extract the elements in an unsupervised manner, i.e., we will use the grammar of the sentences. The extraction of a single word entity from a sentence is not a tough task. We can easily do this with the help of parts of speech (POS) tags. However, when an entity spans across multiple words, then POS tags alone are not sufficient. To fix this we basically save our previous text's info.

    requirements:
    - punctuation: "punct"
    - compound: "compound"
    - modifier: ["acl", "advcl", "advmod", "amod", "appos", "meta", "neg", "nmod", "npadvmod", "nummod", "poss", "prep", "quantmod", "relcl"]
    -
    """

    # init
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag (the relationship between any two words is marked by a dependency tag) of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    # for holding the text that is associated with the current subject/object (can be multiple words)
    prefix = ""
    modifier = ""

    # going through each token
    for tok in nlp(the_file):
        if "punct" != tok.dep_.lower(): # if punctuation mark skip

            if "compound" == tok.dep_.lower():
                prefix = tok.text
                if "compound" == prv_tok_dep.lower(): # if the previous word was also a 'compound' then add to current text
                    prefix = prv_tok_text + " "+ tok.text

            # check if modifier (a modifier gives information about another word in the same sentence e.g., blue house)
            if tok.dep_.lower().endswith("mod"):
                modifier = tok.text
                if "compound" == prv_tok_dep.lower(): # if previous word was also a 'compound' then add to current text
                    modifier = prv_tok_text + " "+ tok.text

            # extract first entity - subject
            if "subj" in tok.dep_.lower():
                ent1 = modifier +" "+ prefix + " "+ tok.text
                # reset info
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # extract second entity - object
            if "obj" in tok.dep_.lower():
                ent2 = modifier +" "+ prefix +" "+ tok.text
                # reset info
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip().lower(), ent2.strip().lower()]

In [None]:
for file, doc_name in zip(file_list, doc_list):
    print("Document number " + str(count))
    count += 1

    sents = list(file.sents) # extract sentences of the document (only checks for ".")
    sents_pruned = [sentence for sentence in sents if len(sentence) > 5]

    print("Entities (Nodes - subject/object) Extraction")
    entity_pairs = [] # here "tqdm" is just used for creating a progress bar
    # Extracting the entity pairs of each sentence.
    if lan == "en": entity_pairs = [get_entities_for_english(str(i)) for i in tqdm(sents_pruned, position=0, leave=True)]
    elif lan == "fr": entity_pairs = [get_entities_for_french(str(i)) for i in tqdm(sents_pruned, position=0, leave=True)]
    elif lan == "de": entity_pairs = [get_entities_for_german(str(i)) for i in tqdm(sents_pruned, position=0, leave=True)]
    else: print("Current chosen language does not exist!")

    print("Relations (Edges - predicate) Extraction")
    relations = []
    relations = [get_relation(str(i)) for i in tqdm(sents_pruned, position=0, leave=True)] # Here we assume predicate is the main verb in a sentence.

    triple_id.extend([doc_name for i in range(len(sents_pruned))])

    # pd.Series(relations).value_counts()[:10] # to visualize
    list_of_entity_pairs.append(entity_pairs)
    list_of_relations.append(relations)