# **RULE BASED APPROACH**

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re 
import string 
import nltk 
import spacy 
import math
import os
from spacy.matcher import Matcher
from spacy.tokens import Span 
from spacy import displacy 

# Let long sentence strings show up to 200 characters per cell when frames are displayed
pd.options.display.max_colwidth = 200

In [5]:
# Download the required spaCy model package (only needed once per environment)
# os.system('python -m spacy download en_core_web_sm')

In [6]:
# Read the raw training file; lines=True parses it as one JSON record per line
train_df = pd.read_json(
    path_or_buf='../../data/raw/raw_nyt/raw_train.json',
    lines=True,
)

In [7]:
# Preview the first five sentences (head() defaults to n=5)
train_df['sentText'].head()

0                                                                                              Massachusetts ASTON MAGNA Great Barrington ; also at Bard College , Annandale-on-Hudson , N.Y. , July 1-Aug .
1                                                                                                                                       North Carolina EASTERN MUSIC FESTIVAL Greensboro , June 25-July 30 .
2                               It will be the final movie credited to Debra Hill , a film producer and native of Haddonfield , who produced '' Halloween '' and was considered a pioneering woman in film .
3    In a 3-0 victory over the Crew on Saturday in Columbus , Ohio , goalkeeper Zach Wells stopped Kyle Martino 's penalty kick , only the third unsuccessful penalty in 20 attempts in M.L.S. this season .
4    The United States ambassador to Mexico , Tony Garza , said in a statement that he had directed the American Consulate in Nuevo Laredo to reopen on Monday , a week after he ord

In [8]:
# Load the spaCy 'en_core_web_sm' pipeline; it is used below both to parse text
# and as the source of the vocab handed to the Matcher
loader = spacy.load("en_core_web_sm")

In [9]:
# Matcher bound to the pipeline's vocabulary
matcher = Matcher(loader.vocab)

# Token pattern, in order:
#   optional token with dependency label 'compound',
#   a proper noun, a verb,
#   optional token with dependency label 'agent',
#   a proper noun
pattern = [{'DEP': 'compound', 'OP': "?"},
           {'POS': 'PROPN'},
           {'POS': 'VERB'},
           {'DEP': 'agent', 'OP': "?"},
           {'POS': 'PROPN'}]

# Register the pattern as a list of patterns. The older positional form
# matcher.add(key, None, pattern) is rejected by spaCy v3, while this form is
# accepted by both late v2 releases and v3.
matcher.add("matching_1", [pattern])

# Collect [sentence, matched phrase] for every sentence containing the pattern.
# Only the first match of each sentence is kept.
matched_phrases = []
sentences = train_df['sentText'].tolist()
# loader.pipe streams the texts through the pipeline in batches, which is much
# cheaper than calling loader(text) once per sentence; zip keeps each doc
# paired with its source sentence regardless of the DataFrame's index labels.
for text, doc in tqdm(zip(sentences, loader.pipe(sentences)), total=len(sentences)):
  matches = matcher(doc)
  if matches:
    _, start, end = matches[0]
    matched_phrases.append([text, doc[start:end].text])

100%|██████████| 56196/56196 [18:01<00:00, 51.98it/s]


**SAVE IN NUMPY ARRAY**

In [10]:
from numpy import asarray, save

# Turn the list of [sentence, matched phrase] pairs into an array and write it to disk
data = asarray(matched_phrases)
save(file='../../data/processed/Rule_Based_Data.npy', arr=data)

**LOAD NUMPY ARRAY**

In [11]:
from numpy import load

# Read the saved [sentence, matched phrase] array back from disk
data = load(file='../../data/processed/Rule_Based_Data.npy')

In [12]:
def get_entities_relations(matcher, sentence):
  """Extract an (entity1, entity2, relation) triple from the first pattern match.

  The sentence is parsed with the module-level spaCy pipeline ``loader`` and
  ``matcher`` is applied to the parsed doc. Only the first match is used. The
  triple is read from the matched tokens of that doc directly, so the
  part-of-speech tags are the ones assigned in full-sentence context (the same
  tags the matcher saw) and the text is parsed only once.

  Args:
    matcher: callable applied to the parsed doc that returns a sequence of
      ``(match_id, start, end)`` tuples (e.g. a spaCy ``Matcher``).
    sentence: raw sentence text.

  Returns:
    A tuple ``(entity1, entity2, relation)``:
      entity1: the first two matched tokens joined by a space (``str``) when
        the second matched token is tagged ``PROPN``; otherwise the first
        matched token.
      entity2: the last matched token.
      relation: text of the first matched token tagged ``VERB`` or ``ADJ``.
    Any element that could not be determined is ``-1``.
  """
  entity1 = -1
  entity2 = -1
  relation = -1

  doc = loader(sentence)
  matches = matcher(doc)
  if not matches:
    return entity1, entity2, relation

  _, start, end = matches[0]
  span = doc[start:end]
  if len(span) == 0:
    return entity1, entity2, relation

  entity2 = span[-1]
  # Guard the second-token lookup so a one-token match cannot raise IndexError.
  if len(span) > 1 and span[1].pos_ == 'PROPN':
    entity1 = " ".join(tok.text for tok in span[:2])
  else:
    entity1 = span[0]

  # First verb/adjective inside the match is taken as the relation.
  relation = next((tok.text for tok in span if tok.pos_ in ('VERB', 'ADJ')), -1)
  return entity1, entity2, relation

In [13]:
## Extracting entities and relations for particular pattern
#
# Each row of `data` is [sentence, matched phrase]. The phrase is re-parsed and
# split into:
#   entity1  - first two tokens if the second is a PROPN, else the first token
#   entity2  - last token
#   relation - first token tagged VERB or ADJ
# Rows whose phrase yields no relation are skipped, so the four lists always
# stay the same length.

entity1 = []
entity2 = []
relations = []
sentText = []
for sent_text, phrase_text in tqdm(data):
  phrase = loader(str(phrase_text))

  # Decide whether the row is usable before appending anything, rather than
  # appending and popping afterwards.
  relation = next((tok.text for tok in phrase if tok.pos_ in ('VERB', 'ADJ')), None)
  if relation is None:
    continue

  if len(phrase) > 1 and phrase[1].pos_ == 'PROPN':
    first_entity = " ".join(tok.text for tok in phrase[:2])
  else:
    first_entity = phrase[0].text

  # Store plain strings instead of spaCy Token objects: the columns get one
  # consistent type and the lists do not keep every parsed Doc referenced.
  entity1.append(first_entity)
  entity2.append(phrase[-1].text)
  relations.append(relation)
  sentText.append(str(sent_text))

100%|██████████| 1510/1510 [00:16<00:00, 91.46it/s]


In [14]:
# Report how many entities, relations and sentences were extracted
labelled_lists = (
    ("Total Entity1 = ", entity1),
    ("\n Total Entity2 = ", entity2),
    ("\n Total Relation = ", relations),
    ("\n Total Sentence = ", sentText),
)
# Flatten to label, count, label, count, ... so print() spaces them as separate arguments
print(*(part for label, items in labelled_lists for part in (label, len(items))))

Total Entity1 =  1463 
 Total Entity2 =  1463 
 Total Relation =  1463 
 Total Sentence =  1463


In [15]:
# Column name -> extracted values, in the column order used for the results table
dictionary = dict(
    Sentence=sentText,
    Entity1=entity1,
    Entity2=entity2,
    relation=relations,
)

In [16]:
# One row per extracted (sentence, entity1, entity2, relation) record
df = pd.DataFrame(data=dictionary)

In [17]:
# Show the first five extracted records
df.head(n=5)

Unnamed: 0,Sentence,Entity1,Entity2,relation
0,"In a 3-0 victory over the Crew on Saturday in Columbus , Ohio , goalkeeper Zach Wells stopped Kyle Martino 's penalty kick , only the third unsuccessful penalty in 20 attempts in M.L.S. this season .",Zach Wells,Kyle,stopped
1,"NEARLY two years after Charles Taylor fled Monrovia under pressure from advancing rebels and a force of Marines on ships off Liberia , he sits exiled in Nigeria , plotting to undermine an internat...",Charles Taylor,Monrovia,fled
2,"Defensively , the Giants rarely touched Eagles quarterback Donovan McNabb , and the secondary kept away from Philadelphia receivers as if they were toxic .",Eagles,Donovan,quarterback
3,"On Dec. 25 , in a driving rain , the news that Charles Taylor had attacked Liberia reached Monrovia .",Liberia,Monrovia,reached
4,"Last week , Mr. O'Connell sipped Sprite in the tchotchke-filled living room of his two-family brick house in Bay Ridge , Brooklyn , which he shares with a grown son and daughter , and recalled his...",Mr. O'Connell,Sprite,sipped


In [18]:
# Write the extracted records to the results folder (the row index is written as the first column)
df.to_csv(path_or_buf='../../reports/results/Rule_Based/all_matched_pattern.csv')

### **TEST ON SENTENCE**

In [19]:
# Getting Entities and Relations for the following sentence ----
sentence = "Steve Jobs Founded Apple"

# Fresh matcher so that only the pattern defined in this cell is registered
matcher = Matcher(loader.vocab)

pattern = [{'DEP': 'compound', 'OP': "?"},  # optional token with the 'compound' dependency label
           {'POS': 'PROPN'},
           {'POS': 'NOUN', 'OP': "?"},      # optional noun between the proper noun and the verb
           {'POS': 'VERB'},
           {'DEP': 'agent', 'OP': "?"},     # optional token with the 'agent' dependency label
           {'POS': 'PROPN'}]

# Patterns are passed as a list; the older matcher.add(key, None, pattern)
# form is rejected by spaCy v3.
matcher.add("matching_1", [pattern])
e1, e2, rel = get_entities_relations(matcher, sentence)

In [20]:
# Print the extracted triple framed by separator lines
separator = "\n==============================\n"
report = (
    separator,
    "Entity 1 = ", e1,
    "\nRelation = ", rel,
    "\nEntity 2 = ", e2,
    separator,
)
print(*report)

# Legend for the -1 sentinel returned when a part could not be extracted
print("\n\n\n If (-1) => following content is not found")


 Entity 1 =  Steve Jobs 
Relation =  Founded 
Entity 2 =  Apple 




 If (-1) => following content is not found
