# Rule-Based Matching for NER

## #1. Setup development environment

### Update & import Python modules

In [None]:
# install and download spaCy related modules
!pip install --upgrade spacy
!python -m spacy download en_core_web_lg

# spaCy
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# general Python modules
import json
from datetime import datetime
from pprint import pprint

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 17 kB/s 
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


### Get access to Firebase and Drive

In [None]:
# (re)mount Google Drive; forcing the remount keeps this cell re-runnable
drive.mount("/content/gdrive/", force_remount = True)
print("Stablished access to Google Drive")

# initialize Drive path
DRIVE_PATH = "/content/gdrive/My Drive"

# load the Firebase service-account key stored on Drive
with open(DRIVE_PATH + "/ie_course/credentials/firebase_credentials.json", encoding="utf-8") as f:
  credential = json.load(f)
credential = credentials.Certificate(credential)

# create Firestore database instance
# initialize_app() raises ValueError when the default app already exists, so
# reuse the existing app when this cell is executed again in the same session
try:
  firebase_admin.get_app()
except ValueError:
  firebase_admin.initialize_app(credential)
db = firestore.client()
print("Stablished access to Firestore")

Mounted at /content/gdrive/
Stablished access to Google Drive
Stablished access to Firestore


### Retrieve main data structures

In [None]:
def _load_json_asset(relative_path):
  """Return the parsed content of the JSON file at DRIVE_PATH + relative_path.

  The file is read as UTF-8, the same encoding this notebook uses when it
  writes its own JSON files.
  """
  with open(DRIVE_PATH + relative_path, encoding="utf-8") as f:
    return json.load(f)

# retrieve Text Record from JSON file
text_rec = _load_json_asset("/ie_course/retrieved_data/text_record.json")
print("Retrieved text record")

# retrieve list of entities gazetteers
entities_gazetteers_list = _load_json_asset("/ie_course/assets/entities_gazetteers.json")
print("Retrieved entities gazetteers list")

# retrieve ignore list of entities gazetteers
entities_gazetteers_ignore_list = _load_json_asset("/ie_course/assets/entities_gazetteers_ignore_list.json")
print("Retrieved entities gazetteers' ignore list")

# retrieve entities
entities = _load_json_asset("/ie_course/assets/entities.json")
print("Retrieved entities")

Retrieved text record
Retrieved entities gazetteers list
Retrieved entities gazetteers' ignore list
Retrieved entities


## #2. Create custom pipeline


### Create custom pipeline component



In [None]:
""" Custom pipeline Component: entities gazetteer function """

# Span.set_extension registers the attribute on the Span class, so it is done
# once here instead of on every processed document and every match
Span.set_extension("qid", default=None, force=True)
Span.set_extension("label", default=None, force=True)
Span.set_extension("wd_name", default=None, force=True)


def _one_or_all(values):
  """Return the only item of a one-element list, otherwise the list itself."""
  return values[0] if len(values) == 1 else values


@Language.component("entities_gazetteer")
def entities_gazetteer(doc):
  """Mark gazetteer matches in `doc` as entities carrying Wikidata attributes.

  Reads the module-level `matcher`, `entities_gazetteers_ignore_list` and
  `entities` objects at call time.

  For every non-overlapping match whose text equals an entity "name" or is
  found in its "aliases", a Span labelled with the first matching entity's
  "label" is written into doc.ents (other entities are left unmodified) and
  its custom attributes `_.label`, `_.qid` and `_.wd_name` are filled in.
  Each attribute holds a single value when exactly one entity matched and a
  list of values when several matched.

  Args:
    doc: the spaCy Doc being processed by the pipeline.

  Returns:
    The same Doc, with doc.ents updated.
  """
  # identify matches of the gazetteers contained in Doc object (text)
  spans = [doc[start:end] for _, start, end in matcher(doc)]

  # filter overlapping matches (Span objs) to keep gazetteers uniqueness
  for match in spacy.util.filter_spans(spans):
    # skip if matched gazetteer is in ignore list
    if match.text in entities_gazetteers_ignore_list:
      print(f"-- Skipped '{match}' due it's in ignore list!")
      continue

    # usually only one entity is found, but some gazetteers map to several
    matched_entities = [i for i in entities if match.text == i["name"] or match.text in i["aliases"]]
    if not matched_entities:
      continue

    # the span label always comes from the first matching entity
    entity = Span(doc, match.start, match.end, label=matched_entities[0]["label"])
    entity._.label = _one_or_all([e["label"] for e in matched_entities])
    entity._.qid = _one_or_all([e["qid"] for e in matched_entities])
    entity._.wd_name = _one_or_all([e["name"] for e in matched_entities])

    # modify the provided entity spans, leaving the rest unmodified
    doc.set_ents([entity], default="unmodified")

  return doc

# pretrained statistical model (English/lg), loaded without the components
# listed below
excluded_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
nlp = spacy.load("en_core_web_lg", exclude=excluded_components)

# rule-based sentence splitting
nlp.add_pipe("sentencizer")

# custom gazetteer component goes at the end of the pipeline
nlp.add_pipe("entities_gazetteer", last=True)

# spaCy phrase matcher (rule-based) sharing the pipeline vocabulary;
# the component looks up this module-level name when it runs
matcher = PhraseMatcher(nlp.vocab, None)

# every gazetteer becomes one tokenized pattern under a single match key
patterns = list(map(nlp.make_doc, entities_gazetteers_list))
matcher.add("gazetteers", patterns)

# show the names of the pipeline components
print(nlp.pipe_names)

# analyze pipeline
pprint(nlp.analyze_pipes(pretty=True))

['ner', 'sentencizer', 'entities_gazetteer']
[1m

#   Component            Assigns               Requires   Scores          Retokenizes
-   ------------------   -------------------   --------   -------------   -----------
0   ner                  doc.ents                         ents_f          False      
                         token.ent_iob                    ents_p                     
                         token.ent_type                   ents_r                     
                                                          ents_per_type              
                                                                                     
1   sentencizer          token.is_sent_start              sents_f         False      
                         doc.sents                        sents_p                    
                                                          sents_r                    
                                                                                     
2  

## #3. Extract Named-Entities from text

### Process text for NER

In [None]:
# initialize main container of text
main_text_container = []
text = text_rec["texts"]

# split text in paragraphs
for paragraph in text:
  # split paragraph in sentences; only the sentencizer is needed for this, and
  # running the whole pipeline here would repeat the NER and gazetteer work
  # (and its "Skipped" messages) that the per-sentence pass below does anyway
  with nlp.select_pipes(enable=["sentencizer"]):
    sentences = [sent.text for sent in nlp(paragraph).sents]

  # process sentences individually as a batched stream
  for doc in nlp.pipe(sentences, batch_size=50):
    sent = doc.text  # sentence

    # keep only the entities that received a Wikidata QID
    linked_ents = [ent for ent in doc.ents if ent._.qid]
    if not linked_ents:
      continue

    chosen_sentence = [sent, {"entities": [
      [ent.start_char, ent.end_char, ent._.qid, ent.text, ent.label_, ent._.label, ent._.wd_name]
      for ent in linked_ents
    ]}]
    main_text_container.append(chosen_sentence)

    print()
    print(f"++ {sent}")
    for ent in linked_ents:
      print(f"    {ent.text, ent.start_char, ent.end_char, ent.label_, ent._.label, ent._.qid, ent._.wd_name}")
    print()

# save record in JSON file (skipped when nothing was extracted)
if main_text_container:
  with open(DRIVE_PATH + "/ie_course/output/extracted_sentences.json", "w", encoding = "utf-8") as f:
    json.dump(main_text_container, f, ensure_ascii = False, indent = 2)
    print()
    print(f"Saved {len(main_text_container)} extracted_sentences")


++ Florida became more flame-like, but it’s going to be under control.
    ('Florida', 0, 7, 'GPE', 'GPE', 'Q812', 'Florida')


++ They don’t talk about Mexico, Mexico and Brazil and still parts of Europe, which actually got hit sooner than us, so it’s a little ahead of us in that sense.
    ('Mexico', 22, 28, 'GPE', 'GPE', 'Q96', 'Mexico')
    ('Mexico', 30, 36, 'GPE', 'GPE', 'Q96', 'Mexico')
    ('Brazil', 41, 47, 'GPE', 'GPE', 'Q155', 'Brazil')


++ But you take a look, why don’t they talk about Mexico, which is not helping us?
    ('Mexico', 47, 53, 'GPE', 'GPE', 'Q96', 'Mexico')


++ And all I can say is thank God I built most of the wall because if I didn’t have the wall up we would have a much bigger problem with Mexico.
    ('Mexico', 134, 140, 'GPE', 'GPE', 'Q96', 'Mexico')

-- Skipped 'right' due it's in ignore list!
-- Skipped 'right' due it's in ignore list!
-- Skipped 'right' due it's in ignore list!

++ Look, I take responsibility always for everything because it’s ultim

### Test pipeline with one sentence

In [None]:
# run the full pipeline on one hand-written sentence and list every entity,
# whether it came from the statistical model or from the gazetteer
sentence = "POTUS 46 was born and raised in Scranton, Pennsylvania, and moved with his family to Delaware in 1953 when he was ten years old."
doc = nlp(sentence)
for ent in doc.ents:
  fields = (ent.text, ent.start_char, ent.end_char, ent.label_, ent._.qid, ent._.wd_name)
  print(*fields)

POTUS 46 0 8 PERSON Q6279 Joe Biden
Scranton 32 40 GPE None None
Pennsylvania 42 54 GPE None None
Delaware 85 93 GPE None None
1953 97 101 DATE None None
ten years old 114 127 DATE None None


## #4. Utils (optional)

### Retrieve text record from Firestore

In [None]:
# retrieve text record from Firestore
texts_ref = db.collection("texts")
text_doc = texts_ref.document("revcomblogtranscriptsdonaldtrumpchriswallaceinterviewtranscriptjuly19").get()
text_rec = text_doc.to_dict()

# convert timestamp to datetime string
# `datetime` is the class imported with `from datetime import datetime`, so
# fromtimestamp is called on it directly (datetime.datetime would raise
# AttributeError)
retrieval_date = datetime.fromtimestamp(text_rec["retrieval_date"].timestamp())
text_rec["retrieval_date"] = retrieval_date.strftime("%m/%d/%Y, %H:%M:%S")

# save record in JSON file
with open(DRIVE_PATH + "/ie_course/output/text_record2.json", "w", encoding = "utf-8") as f:
  json.dump([text_rec], f, ensure_ascii = False, indent = 2)
  print("Saved text record")

Saved text record


---

# Querying the Wikidata API

In [None]:
import requests

## Query by keyword

In [None]:
# English label to look up
label_value = "job"

# SPARQL query returning every item whose English label equals label_value
# NOTE(review): label_value is interpolated verbatim; escape quotes before
# using this with values that are not hard-coded
query = (
  "SELECT ?item WHERE { "
  f"  ?item rdfs:label '{label_value}'@en . "
  "  SERVICE wikibase:label { bd:serviceParam wikibase:language 'en'. }}"
)

# a timeout keeps the cell from hanging forever on an unresponsive endpoint
response = requests.get("https://query.wikidata.org/sparql",
                        params={"format": "json", "query": query},
                        timeout=60)
print(response)
# surface HTTP errors explicitly instead of failing later while parsing JSON
response.raise_for_status()
results = response.json()
pprint(results)

<Response [200]>
{'head': {'vars': ['item']},
 'results': {'bindings': [{'item': {'type': 'uri',
                                    'value': 'http://www.wikidata.org/entity/Q192581'}},
                          {'item': {'type': 'uri',
                                    'value': 'http://www.wikidata.org/entity/Q25211948'}}]}}


## Query by QID/PID

In [None]:
# query the Wikidata API to retrieve an entity's information
# fetch entity info from the Wikidata namespace URL

# first an entity (Q...), then a property (P...); the id is interpolated once
# so the URL and the key used to read the response cannot drift apart
for wikidata_id in ("Q7322", "P61"):
  namespace_url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
  # a timeout keeps the cell from hanging forever on an unresponsive endpoint
  r = requests.get(namespace_url, params={"format": "json"}, timeout=60)
  # surface HTTP errors explicitly instead of failing later while parsing JSON
  r.raise_for_status()
  # simplify access to root elements of JSON object
  pprint(r.json()["entities"][wikidata_id])

# Tip: save the result into a JSON file

Streaming output truncated to the last 5000 lines.
                                                  'value': '38985'},
                                    'hash': 'b04c50f436b9df232bfd2ca6f6e2b2b0999df5d2',
                                    'property': 'P3762',
                                    'snaktype': 'value'},
                       'rank': 'normal',
                       'type': 'statement'}],
            'P3788': [{'id': 'Q7322$e666f271-4ac9-05bc-3cdd-8f1d18700878',
                       'mainsnak': {'datatype': 'external-id',
                                    'datavalue': {'type': 'string',
                                                  'value': '000026108'},
                                    'hash': 'c08295985554ae4f1f3e0f7de1290766df01e092',
                                    'property': 'P3788',
                                    'snaktype': 'value'},
                       'rank': 'normal',
                       'type': 'statement'}],
       