In [1]:
# imports
import pandas as pd
import spacy
from spacy.matcher import Matcher
import networkx as nx
from tqdm import tqdm
from spacypdfreader import pdf_reader
import os
import re
import csv
from googletrans import Translator # don't forget to run "!pip install googletrans==3.1.0a0" before using this

In [2]:
# functions
def find_folder_with_type(given_path, doc_type):
    """Return the names of the entries in `given_path` whose name ends with `doc_type`.

    Args:
        given_path: Directory to scan (handed to ``os.listdir``).
        doc_type: File-name suffix to look for, e.g. ``'.pdf'``.

    Returns:
        A list of the matching entry names (names only, not full paths), in the
        order ``os.listdir`` produced them.
    """
    # A plain suffix test replaces the former regex built with `'\%s' % doc_type`:
    # that pattern escaped only the first character of the suffix and raised
    # `re.error` for suffixes such as 'pdf' ('\p' is not a valid escape).
    return [doc for doc in os.listdir(given_path) if doc.endswith(doc_type)]

def folder_to_nlp_doc(given_path, list_of_doc_names, nlp):
    """Read every listed file of a folder with `pdf_reader`.

    Args:
        given_path: Directory that contains the files.
        list_of_doc_names: Names of the files inside `given_path` to read.
        nlp: Pipeline object forwarded unchanged as the second argument of `pdf_reader`.

    Returns:
        A list holding the `pdf_reader` result for each file, in input order.
    """
    # os.path.join uses the platform's separator; the previous hard-coded "\\"
    # only produced valid paths on Windows.
    return [
        pdf_reader(os.path.join(given_path, file_name), nlp)
        for file_name in list_of_doc_names
    ]

def get_entities(the_file):
  """
  Extract one [subject, object] pair from a text using dependency labels only.

  The text is parsed with the module-level `nlp` pipeline and its tokens are
  scanned in order. Single-word entities could be found from the labels alone;
  to support entities that span several words, the text of preceding "compound"
  and "...mod" tokens is remembered and prepended to the subject/object token.

  Args:
      the_file: Text to analyse (the caller passes one sentence at a time).

  Returns:
      A two-element list ``[subject, object]``. Each element is the text of the
      last token whose dependency label contains "subj" / "obj", preceded by the
      remembered modifier and compound prefix. An element is "" when no such
      token was seen.
  """

  # init
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency label of the previous non-punctuation token
  prv_tok_text = ""   # text of the previous non-punctuation token

  # text associated with the upcoming subject/object (can be multiple words)
  prefix = ""
  modifier = ""

  for tok in nlp(the_file):
    if tok.dep_ == "punct": # punctuation carries no entity information
      continue

    if tok.dep_ == "compound":
      prefix = tok.text
      if prv_tok_dep == "compound": # two compounds in a row: keep both words
        prefix = prv_tok_text + " " + tok.text

    # a modifier gives information about another word in the same sentence (e.g. "blue" in "blue house")
    if tok.dep_.endswith("mod"):
      modifier = tok.text
      if prv_tok_dep == "compound": # keep the compound word that precedes the modifier
        modifier = prv_tok_text + " " + tok.text

    # first entity - subject.
    # A substring test is used on purpose: `str.find(...) == True` was only true
    # when the match started at index 1, so a label such as "obj" was missed.
    if "subj" in tok.dep_:
      # joining only the non-empty parts avoids a double space when there is a
      # modifier but no compound prefix
      ent1 = " ".join(part for part in (modifier, prefix, tok.text) if part)
      prefix = ""
      modifier = ""

    # second entity - object
    if "obj" in tok.dep_:
      ent2 = " ".join(part for part in (modifier, prefix, tok.text) if part)
      prefix = ""
      modifier = ""

    # remember this token for the next iteration
    prv_tok_dep = tok.dep_
    prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

def get_relation(the_sentence):
  """
  Return the text of the last rule-based match found in `the_sentence`.

  The sentence is parsed with the module-level `nlp` pipeline and searched with
  a spaCy `Matcher` holding a single pattern: the ROOT token, optionally
  followed by a preposition ('prep'), an agent word ('agent') and an adjective.

  Args:
      the_sentence: Text of one sentence.

  Returns:
      The text of the span covered by the last match reported by the matcher,
      or None when the matcher reports no match.
  """
  # Each pattern is a list of token descriptions (dicts); `Matcher.add` expects a list of such patterns,
  # e.g. [[{"LOWER": "hello"}, {"LOWER": "world"}], [{"ORTH": "Google"}, {"ORTH": "Maps"}]]
  root_pattern = [
      {'DEP':'ROOT'},               # token with dependency label ROOT
      {'DEP':'prep','OP':"?"},      # optional preposition
      {'DEP':'agent','OP':"?"},     # optional agent word
      {'POS':'ADJ','OP':"?"},       # optional adjective
  ]

  parsed = nlp(the_sentence)

  rule_matcher = Matcher(nlp.vocab)
  rule_matcher.add("matching_1", [root_pattern])
  found = rule_matcher(parsed)

  if not found: # no match at all, so there is no relation to report
    return None

  # every match is (match_id, start, end); only the last one is used
  last_match = found[-1]
  return parsed[last_match[1]:last_match[2]].text

def create_kg_csv_pandas(subjects, predicates, objects):
    """Write subject/predicate/object triples to the English knowledge-graph CSV via pandas.

    Each row reads: source (subject) ----relation (predicate)----> target (object).
    The file is ``head_relation_tail_data/simple/kg_of_simple_en.csv`` below the
    current working directory; that directory must already exist.

    Args:
        subjects: Sequence of subjects; its length decides the number of rows.
        predicates: Sequence of predicates, indexed in parallel with `subjects`.
        objects: Sequence of objects, indexed in parallel with `subjects`.

    Returns:
        None. The CSV file is written as a side effect.
    """
    fields = ['Subject', 'Predicate', 'Object']
    # os.path.join keeps the path valid on every platform (the former "\\" literals were Windows-only)
    filename = os.path.join(os.getcwd(), "head_relation_tail_data", "simple", "kg_of_simple_en.csv")

    # data rows of the csv file (empty values are not replaced here)
    rows = [[subjects[i], predicates[i], objects[i]] for i in range(len(subjects))]

    temp_df = pd.DataFrame(rows, columns = fields)
    # index=False keeps the file at exactly three columns, the layout that
    # create_kg_csv writes and simple_translator reads (columns 0, 1, 2)
    temp_df.to_csv(filename, header = fields, index = False)

def create_kg_csv(subjects, predicates, objects):
    """Write subject/predicate/object triples to the English knowledge-graph CSV.

    Each row reads: source(subject) -relation(predicate)-> target(object).
    The file is ``head_relation_tail_data/simple/kg_of_simple_en.csv`` below the
    current working directory; that directory must already exist. The first row
    written is the header ``Subject,Predicate,Object``.

    Args:
        subjects: Sequence of subjects; its length decides the number of rows.
        predicates: Sequence of predicates, indexed in parallel with `subjects`.
        objects: Sequence of objects, indexed in parallel with `subjects`.

    Returns:
        None. The CSV file is written as a side effect.
    """
    fields = ['Subject', 'Predicate', 'Object']
    # os.path.join keeps the path valid on every platform (the former "\\" literals were Windows-only)
    filename = os.path.join(os.getcwd(), "head_relation_tail_data", "simple", "kg_of_simple_en.csv")

    # empty strings and None are replaced with "-" while the rows are built
    rows = [
        ["-" if value == "" or value is None else value
         for value in (subjects[i], predicates[i], objects[i])]
        for i in range(len(subjects))
    ]

    # NOTE(review): the file is opened with the platform's default text encoding.
    with open(filename, 'w', newline = '') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(fields) # header first
        csv_writer.writerows(rows) # then the records

# Language codes: German is "de", French is "fr"; other language codes can be passed in the same way
def simple_translator(language):
    """Translate the English knowledge-graph CSV into `language` and save it next to the original.

    Reads ``head_relation_tail_data/simple/kg_of_simple_en.csv`` below the current
    working directory (it must already exist) and writes
    ``kg_of_simple_<language>.csv`` into the same folder. Every row of the English
    file, its first row included, is sent through the translator cell by cell.

    Args:
        language: Destination language code handed to ``Translator.translate``
            as `dest`, e.g. "de" or "fr".

    Returns:
        None. The translated CSV file is written as a side effect.
    """
    translator = Translator()

    # os.path.join keeps the paths valid on every platform (the former "\\" literals were Windows-only)
    base_dir = os.path.join(os.getcwd(), "head_relation_tail_data", "simple")
    filename_english = os.path.join(base_dir, "kg_of_simple_en.csv")
    filename_assigned_language = os.path.join(base_dir, "kg_of_simple_" + language + ".csv")

    # newline='' on both handles, as the csv module asks for, so that newlines
    # inside quoted fields are read back unchanged
    with open(filename_english, mode = "r", newline = '') as file_original, \
         open(filename_assigned_language, mode = "w", newline = '') as file_2:
      csv_reader = csv.reader(file_original)
      csv_writer = csv.writer(file_2)

      for lines in csv_reader: # each line is a list of 3 elements (source - relation - target)
        source_en, relation_en, target_en = lines[0], lines[1], lines[2]

        # src (source language) is English, dest (destination) is the requested language
        temp_row = [
            translator.translate(cell, src = "en", dest = language).text
            for cell in (source_en, relation_en, target_en)
        ]
        csv_writer.writerow(temp_row)

# PIPELINE

#### 1. Loading Pipeline

In [3]:
### Loading Pipeline ###

# Load the pipeline once and keep it in the module-level name `nlp`.
# get_entities() and get_relation() read this global, so this cell has to run before either is called.
nlp = spacy.load("en_core_web_sm") # A small English pipeline trained on written web text (blogs, news, comments) that includes vocabulary, syntax and entities.

#### 2. Importing Dataset

In [4]:
### Importing Dataset ###

# Derive the data folder from the working directory: every "src/main" part of the
# path is swapped for "data/en/directives_pdf". The parts are built with os.path.join
# so they use the platform's separator (the former "src\\main" literal only ever
# matched Windows-style paths, leaving data_path equal to the working directory elsewhere).
current_path = os.getcwd()
data_path = current_path.replace(os.path.join("src", "main"), os.path.join("data", "en", "directives_pdf"))

doc_list = find_folder_with_type(data_path, '.pdf') # names of the pdf files found in the folder

# read every pdf through the pipeline; each result behaves like a "list" of tokens (each tok is a word, number, ...)
file_list = folder_to_nlp_doc(data_path, doc_list, nlp)

#### 3. Entities (Nodes) and Relations (Edges) Extraction

In [5]:
# One entry per document: list_of_entity_pairs[k] / list_of_relations[k] hold the results for file_list[k].
list_of_entity_pairs = []
list_of_relations = []
count = 1

for file in file_list:
    print(f"Document number {count}")
    count += 1

    # sentences of the document, as split by the pipeline
    sents = list(file.sents)

    # one [subject, object] pair per sentence; tqdm only draws the progress bar
    print("Entities (Nodes - subject/object) Extraction")
    entity_pairs = [get_entities(str(sentence)) for sentence in tqdm(sents)]

    # one predicate per sentence; the predicate is assumed to be the main verb of the sentence
    print("Relations (Edges - predicate) Extraction")
    relations = [get_relation(str(sentence)) for sentence in tqdm(sents)]

    # pd.Series(relations).value_counts()[:10] # to visualize
    list_of_entity_pairs.append(entity_pairs)
    list_of_relations.append(relations)

Document number 1
Entities (Nodes - subject/object) Extraction


100%|██████████| 148/148 [00:02<00:00, 66.17it/s]


Relations (Edges - predicate) Extraction


100%|██████████| 148/148 [00:02<00:00, 70.53it/s]


Document number 2
Entities (Nodes - subject/object) Extraction


100%|██████████| 181/181 [00:02<00:00, 70.31it/s]


Relations (Edges - predicate) Extraction


100%|██████████| 181/181 [00:02<00:00, 64.28it/s]


Document number 3
Entities (Nodes - subject/object) Extraction


100%|██████████| 373/373 [00:05<00:00, 71.61it/s]


Relations (Edges - predicate) Extraction


100%|██████████| 373/373 [00:05<00:00, 68.93it/s]


Document number 4
Entities (Nodes - subject/object) Extraction


100%|██████████| 131/131 [00:01<00:00, 73.77it/s]


Relations (Edges - predicate) Extraction


100%|██████████| 131/131 [00:02<00:00, 64.08it/s]


Document number 5
Entities (Nodes - subject/object) Extraction


100%|██████████| 208/208 [00:02<00:00, 73.45it/s]


Relations (Edges - predicate) Extraction


100%|██████████| 208/208 [00:02<00:00, 75.90it/s]


Document number 6
Entities (Nodes - subject/object) Extraction


100%|██████████| 270/270 [00:03<00:00, 67.67it/s]


Relations (Edges - predicate) Extraction


100%|██████████| 270/270 [00:03<00:00, 68.80it/s]


#### 4. Build Knowledge Graph

In [6]:
# Build the graph from ALL processed documents. `entity_pairs` and `relations` are
# loop variables of the extraction cell and only hold the last document, so the
# per-document lists are flattened here instead.

# extract subject
source = [pair[0] for doc_pairs in list_of_entity_pairs for pair in doc_pairs]

# extract object
target = [pair[1] for doc_pairs in list_of_entity_pairs for pair in doc_pairs]

# predicates, in the same document/sentence order as source and target
all_relations = [relation for doc_relations in list_of_relations for relation in doc_relations]

# kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':all_relations})
create_kg_csv(source, all_relations, target)


#### 5. Translate

In [7]:
# Translate the English knowledge-graph CSV into German ("de").
simple_translator("de") # CAUTION: this goes through a translation API and can time out; if that happens, just run the cell again

In [8]:
# Translate the English knowledge-graph CSV into French ("fr").
simple_translator("fr")

# TRASH

In [9]:
"""We will use the networkx library to create a network from this dataframe. Which is going to be a directed graph allowing us to draw a line from subject to object. However, we also need to use pyvis library because networkx's visualize method is currently not working."""

# NOTE: everything below is commented out on purpose. It reads `kg_df`, and the only
# line in the visible notebook that would build `kg_df` (in the "Build Knowledge Graph"
# cell) is commented out as well, so re-enabling this code as-is fails with
# "NameError: name 'kg_df' is not defined".

# create a directed-graph from a dataframe
# directed_graph = nx.from_pandas_edgelist(kg_df, "source", "target", edge_attr=True, create_using=nx.MultiDiGraph())
#
# from pyvis.network import Network
#
# net = Network(notebook=True, cdn_resources='remote')
#
# net.from_nx(directed_graph)
# net.show("example.html")

NameError: name 'kg_df' is not defined