# Importing Dependencies & Loading dataset

In [136]:
# imports
import pandas as pd
import spacy
from spacy.matcher import Matcher
import networkx as nx
from tqdm import tqdm
from spacypdfreader import pdf_reader
import os

#### Loading Pipeline

In [137]:
nlp = spacy.load("en_core_web_sm") # A small English pipeline trained on written web text (blogs, news, comments), that includes vocabulary, syntax and entities.

#### Importing Dataset

In [138]:
# automatically extracting path
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "data\\en\\Directive_(EU)_2016_343_en.pdf")

# importing the file, here doc is like a "list" of tokens (each tok is either a word, number, ...)
doc = pdf_reader(temp_path, nlp)

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

# Sentence Segmentation

In [139]:
# an example
for tok in doc[0:20]:
  print(tok.text, "...", tok.dep_)

11.3.2016 ... compound


 ... dep
EN ... compound
    

 ... dep
Official ... nmod
  ... dep
Journal ... conj
of ... prep
  ... dep
the ... det
European ... compound
Union ... compound


 ... dep
L ... appos
65/1 ... nummod


 ... dep
I ... compound


 ... dep
( ... punct
Legislative ... compound


# Entities (Nodes) Extraction
Here we will extract these elements in an unsupervised manner, i.e., we will use the grammar of the sentences. The extraction of a single word entity from a sentence is not a tough task. We can easily do this with the help of parts of speech (POS) tags. However, when an entity spans across multiple words, then POS tags alone are not sufficient. To fix this we basically save our previous text's info.

In [140]:
def get_entities(the_file):
  # init
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag (the relationship between any two words is marked by a dependency tag) of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  # for holding the text that is associated with the current subject/object (can be multiple words)
  prefix = ""
  modifier = ""

  # going through each token
  for tok in nlp(the_file):
    if tok.dep_ != "punct": # if punctuation mark skip

      if tok.dep_ == "compound":
        prefix = tok.text
        if prv_tok_dep == "compound": # if the previous word was also a 'compound' then add to current text
          prefix = prv_tok_text + " "+ tok.text

      # check if modifier (a modifier gives information about another word in the same sentence e.g., blue house)
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        if prv_tok_dep == "compound": # if previous word was also a 'compound' then add to current text
          modifier = prv_tok_text + " "+ tok.text

      # extract first entity - subject
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        # reset info
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""

      # extract second entity - object
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        # reset info
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""

      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

In [141]:
# saving each sentence of the document as a list to be used later
sents = list(doc.sents) # extract sentences of the document (only checks for ".")

In [142]:
# this cell is for testing "get_entities" function
temp = sents[11]

print("The sentence: " + str(temp) + "\n")
print("The subject and object of the sentence: \n" + str(get_entities(str(temp))))

The sentence: 

(5)  

Although  the  Member  States  are  party  to  the  ECHR  and  to  the  ICCPR,  experience  has  shown  that  this  in  itself 
does not always  provide  a sufficient degree  of  trust in  the criminal  justice  systems of other  Member States.

The subject and object of the sentence: 
['ICCPR  that', 'other Member States']


Extracting the entity pairs of all the sentences in our data.

In [143]:
entity_pairs = []

for i in tqdm(sents): # here "tqdm" is just used for creating a progress bar
  entity_pairs.append(get_entities(str(i)))

100%|██████████| 181/181 [00:02<00:00, 77.74it/s]


In [144]:
# checking the entity pairs that are extracted
entity_pairs[10:20]

[['which', 'principle'],
 ['ICCPR  that', 'other Member States'],
 ['', 'criminal  proceedings'],
 ['who', 'vulnerable  measure'],
 ['European Roadmap it', 'citizens'],
 ['exhaustive  character', 'better  cooperation'],
 ['', 'Council'],
 ['purpose', 'trial'],
 ['', 'criminal  matters'],
 ['', 'Member States']]

# Relations (Edges) Extraction
Here we assume that the predicate is actually the main verb in a sentence.

In [145]:
def get_relation(the_sentence):
  temp_doc = nlp(the_sentence)

  # creating the rule-based Matcher object
  matcher = Matcher(nlp.vocab)

  # defining the pattern - Each pattern should be a list of dicts and each pattern should be saved in another list
  # ex: patterns = [[{"LOWER": "hello"}, {"LOWER": "world"}], [{"ORTH": "Google"}, {"ORTH": "Maps"}]]

  # This pattern tries to find the ROOT word in the sentence. Once the ROOT is identified, then the pattern checks whether it is followed by a preposition (‘prep’) or an agent word. If yes, then it is added to the ROOT word.
  pattern = [{'DEP':'ROOT'}, # check for token with dependency label root
            {'DEP':'prep','OP':"?"}, # other stuff
            {'DEP':'agent','OP':"?"},
            {'POS':'ADJ','OP':"?"}]

  # matcher.add("match_id", "patterns")
  matcher.add("matching_1", [pattern])

  matches = matcher(temp_doc)

  k = len(matches) - 1
  if k == -1: # meaning no match was found so return null
    return None

  span = temp_doc[matches[k][1]:matches[k][2]]
  return span.text

In [146]:
# testing
get_relation("John completed the task")

'completed'

In [147]:
relations = [get_relation(str(i)) for i in tqdm(sents)]

100%|██████████| 181/181 [00:02<00:00, 75.41it/s]


In [148]:
# visualize
pd.Series(relations).value_counts()[:10]

accused       12
remain         5
ensure         4
are            4
is             4
understood     4
apply          3
provided       3
take           3
violated       2
dtype: int64

# Build Knowledge Graph

In [149]:
# creating a dataframe of entities and predicates:

# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

We will use the networkx library to create a network from this dataframe. Which is going to be a directed graph allowing us to draw a line from subject to object. However, we also need to use pyvis library because networkx's visualize method is currently not working.

In [150]:
# create a directed-graph from a dataframe
directed_graph = nx.from_pandas_edgelist(kg_df, "source", "target", edge_attr=True, create_using=nx.MultiDiGraph())

In [151]:
from pyvis.network import Network

net = Network(notebook=True, cdn_resources='remote')

net.from_nx(directed_graph)
net.show("example.html")

example.html


# Knowledge Graph Database and Query