# Prerequisites Installation

In [2]:
# Install the Spacy Dependencies.
import sys
!{sys.executable} -m pip install -U pip setuptools wheel
!{sys.executable} -m pip install -U spacy
!{sys.executable} -m pip install -U nltk
!{sys.executable} -m pip install -U pandas
from spacy import displacy
!{sys.executable} -m pip install -U numpy



In [18]:
# Download the Spacy model.
# Two pipelines are fetched because this notebook loads both of them:
# "en_core_web_trf" in the rule-based matching section and
# "en_core_web_sm" in the knowledge-graph section.
# `sys` is the module imported in the installation cell; going through
# sys.executable runs the download with the kernel's own interpreter.
!{sys.executable} -m spacy download en_core_web_trf
!{sys.executable} -m spacy download en_core_web_sm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting en-core-web-trf==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.1/en_core_web_trf-3.4.1-py3-none-any.whl (460.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.3/460.3 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Rule-Based Matching Using Spacy

![hearst_patterns-768x435.png](attachment:hearst_patterns-768x435.png)

Simple rule-based methods, such as the Hearst patterns shown above, work well for information extraction tasks.

However, we have to be extremely creative to come up with new rules that capture different patterns, and it is difficult to build patterns that generalize well across different sentences.

In [58]:
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

# Let pandas show up to 200 characters per column before truncating.
pd.set_option('display.max_colwidth', 200)

# Load the transformer-based English spaCy pipeline.
nlp = spacy.load("en_core_web_trf")

# Sample text. Swap the active `text` line for one of the commented
# alternatives to try a sentence aimed at a different pattern below.
# text =  "GDP in developing countries such as Vietnam will continue growing at a high rate."
# text =  "Here is how you can keep your car and other vehicles clean."
# text = "Eight people, including two children, were injured in the explosion"
# text = "A healthy eating pattern includes fruits, especially whole fruits."
text = "Tableau was recently acquired by Salesforce."

# Run the pipeline to obtain a spaCy Doc.
doc = nlp(text)

# Print each token with its dependency label and coarse POS tag.
for tok in doc:
    print(tok.text, "-->", tok.dep_, "-->", tok.pos_)


# Mine information from the text with Hearst patterns.
# Each pattern is a list of per-token constraints; 'OP': "?" marks a token
# as optional (it may appear zero or one time).

# Pattern: X such as Y
pattern = [
    {'DEP': 'amod', 'OP': "?"},  # optional adjectival modifier
    {'POS': 'NOUN'},
    {'LOWER': 'such'},
    {'LOWER': 'as'},
    {'POS': 'PROPN'},            # proper noun
]

# Pattern: X and/or other Y
# Exactly one conjunction is required. Making "and" and "or" two separate
# optional tokens would also accept "X other Y" and "X and or other Y".
pattern2 = [
    {'DEP': 'amod', 'OP': "?"},
    {'POS': 'NOUN'},
    {'LOWER': {'IN': ['and', 'or']}},
    {'LOWER': 'other'},
    {'POS': 'NOUN'},
]

# Pattern: X, including Y
pattern3 = [
    {'DEP': 'nummod', 'OP': "?"},  # optional numeric modifier
    {'DEP': 'amod', 'OP': "?"},    # optional adjectival modifier
    {'POS': 'NOUN'},
    {'IS_PUNCT': True},
    {'LOWER': 'including'},
    {'DEP': 'nummod', 'OP': "?"},
    {'DEP': 'amod', 'OP': "?"},
    {'POS': 'NOUN'},
]

# Pattern: X, especially Y
pattern4 = [
    {'DEP': 'nummod', 'OP': "?"},
    {'DEP': 'amod', 'OP': "?"},
    {'POS': 'NOUN'},
    {'IS_PUNCT': True},
    {'LOWER': 'especially'},
    {'DEP': 'nummod', 'OP': "?"},
    {'DEP': 'amod', 'OP': "?"},
    {'POS': 'NOUN'},
]

# Register all four patterns under a single rule name.
matcher = Matcher(nlp.vocab)
matcher.add("matching_1", [pattern, pattern2, pattern3, pattern4])

# Each match is a (match_id, start, end) tuple of token offsets into `doc`.
matches = matcher(doc)
if matches:
    # Only the first reported match is shown.
    match_id, start, end = matches[0]
    span = doc[start:end]
    print('The Pattern found in the statement is......')
    print(span.text)
else:
    print('no patterns matched the statement.....')

Tableau --> nsubjpass --> PROPN
was --> auxpass --> AUX
recently --> advmod --> ADV
acquired --> ROOT --> VERB
by --> agent --> ADP
Salesforce --> pobj --> PROPN
. --> punct --> PUNCT
no patterns matched the statement.....



# Subtree Matching for Relation Extraction


In [49]:
def subtree_matcher(doc):
  """Extract an (x, y) pair of token texts from dependency labels.

  The sentence is treated as passive when any token's dependency label
  contains "subjpass".

  * Passive: ``y`` is the text of the token whose label contains
    "subjpass" and ``x`` is the text of the token whose label ends with
    "obj".
  * Otherwise: ``x`` is the text of the token whose label ends with "subj"
    and ``y`` is the text of the token whose label ends with "obj".

  When several tokens qualify for the same slot the last one in the
  sequence wins; a slot with no qualifying token is the empty string.

  Args:
    doc: a re-iterable sequence of tokens exposing ``dep_`` and ``text``
      (for example a spaCy Doc).

  Returns:
    A tuple ``(x, y)`` of strings.
  """
  # A substring test replaces `dep.find("subjpass") == True`, which only held
  # when the substring started at index 1 (because 1 == True).
  is_passive = any("subjpass" in tok.dep_ for tok in doc)

  x = ''
  y = ''

  for tok in doc:
    dep = tok.dep_
    if is_passive:
      if "subjpass" in dep:
        y = tok.text
      if dep.endswith("obj"):
        x = tok.text
    else:
      if dep.endswith("subj"):
        x = tok.text
      if dep.endswith("obj"):
        y = tok.text

  return x, y

In [50]:
# Passive-voice example with an appositive phrase between the subject and
# the verb. `nlp` is the pipeline loaded in an earlier cell.
text_2 = "Careem, a ride hailing major in middle east, was acquired by Uber." 

doc_2 = nlp(text_2) 
# The last expression is displayed as the cell output: the (x, y) pair.
subtree_matcher(doc_2)

('Uber', 'Careem')

In [51]:
# Active-voice example: no token carries a "subjpass" dependency, so
# subtree_matcher takes its non-passive branch.
text_3 = "Salesforce recently acquired Tableau." 
doc_3 = nlp(text_3) 
subtree_matcher(doc_3)

('Salesforce', 'Tableau')

In [52]:
# Passive-voice version of the previous sentence.
# NOTE(review): this rebinds text_3 / doc_3 from the previous cell, so the
# active-voice Doc is no longer reachable by name once this cell has run.
text_3 = "Tableau was recently acquired by Salesforce." 
doc_3 = nlp(text_3) 
subtree_matcher(doc_3)

('Salesforce', 'Tableau')

# Knowledge Graph

In [None]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
# NOTE(review): this rebinds the notebook-wide `nlp`, which an earlier cell
# set to the "en_core_web_trf" pipeline, to the small "en_core_web_sm"
# pipeline; any cell executed after this one uses the small model.
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [None]:
# Read the sentence data from a CSV file; the relative path is resolved
# against the kernel's current working directory.
candidate_sentences = pd.read_csv("wiki_sentences_v2.csv")
# Display the (rows, columns) tuple as the cell output.
candidate_sentences.shape