# Entity linking with Wikidata (NERD/EL)

## #1. Set up development environment

### Update & import Python modules

In [None]:
# install and download spaCy related modules
!pip install --upgrade spacy
!python -m spacy download en_core_web_lg

# spaCy
import spacy
from spacy.language import Language
from spacy.tokens import DocBin, Span
from spacy.matcher import PhraseMatcher
from spacy.kb import KnowledgeBase
from spacy.training import Example
from spacy.ml.models import load_kb
from spacy.util import minibatch, compounding

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# general Python modules
import json
import datetime
import requests
import csv
import random
from collections import Counter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2023-01-10 09:28:24.138808: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')




### Get access to Firebase and Drive

In [None]:
# (Re)mount Google Drive; force_remount replaces any mount left over from
# a previous run of this cell
drive.mount("/content/gdrive/", force_remount=True)
print("Stablished access to Google Drive")

# root folder of the mounted Drive; every input/output path below is built on it
DRIVE_PATH = "/content/gdrive/My Drive"

# Firebase/Firestore access is currently disabled; the steps are kept for reference.
# open Firebase credentials
# with open(DRIVE_PATH + "/ie_course/credentials/firebase_credentials.json") as f:
#   credential = json.load(f)
# credential = credentials.Certificate(credential)

# create Firestore database instance
# firebase_admin.initialize_app(credential)
# db = firestore.client()
# print("Stablished access to Firestore")

Mounted at /content/gdrive/
Stablished access to Google Drive


## #2. Disambiguate and link NEs using a KG

### Define input and output paths/files

In [None]:
# inputs: entity list (CSV) and the manually annotated sentences (JSONL)
ents_file = f"{DRIVE_PATH}/ie_course/assets/entities.csv"
annot_text_file = f"{DRIVE_PATH}/ie_course/assets/emerson_annotated_text.jsonl"

# outputs: knowledge base, base pipeline, train/test corpora and the
# pipeline that includes the trained entity linker
kb_dir = f"{DRIVE_PATH}/ie_course/output/ml_el/kb"
nlp_dir = f"{DRIVE_PATH}/ie_course/output/ml_el/my_nlp"
train_corpus = f"{DRIVE_PATH}/ie_course/output/ml_el/train_corpus"
test_corpus = f"{DRIVE_PATH}/ie_course/output/ml_el/test_corpus"
nlp_el_dir = f"{DRIVE_PATH}/ie_course/output/ml_el/my_el_nlp"

### Create NLP pipeline and Knowledge Base

In [None]:
""" Step 1: create the Knowledge Base in NLP pipeline and write it to file """

# Helper function to read in the pre-defined entities we want to disambiguate to
def load_entities(path=None):
  """Read the entity CSV and split it into two dicts keyed by entity ID.

  Each non-empty row must provide at least three columns, read as
  ``id, name, description``; any extra columns are ignored.

  Args:
    path: CSV file to read. When omitted, the notebook-level ``ents_file``
      is used, so existing ``load_entities()`` calls keep working.

  Returns:
    A tuple ``(names, descriptions)``: ``names`` maps id -> name and
    ``descriptions`` maps id -> description.

  Raises:
    ValueError: if a non-empty row has fewer than three columns.
  """
  if path is None:
    path = ents_file
  names = dict()
  descriptions = dict()
  # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which would
  # otherwise end up glued to the first entity ID
  with open(path, newline="", encoding="utf-8-sig") as f:
    for row_no, row in enumerate(csv.reader(f, delimiter=","), start=1):
      if not row:
        # csv.reader yields [] for blank lines; there is nothing to load
        continue
      if len(row) < 3:
        raise ValueError(
          f"{path}: row {row_no} has {len(row)} column(s), expected id,name,description"
        )
      qid, name, desc = row[:3]
      names[qid] = name
      descriptions[qid] = desc
  # return "names" {id,names} and "descriptions" {id,descriptions}
  return names, descriptions


# First: create a simple model with an NER component
# To ensure we get the correct entities for this demo, add a simple entity_ruler as well.
# NOTE: `exclude` gets a list of component names; spaCy does not split a
# single comma-separated string into separate names.
nlp = spacy.load("en_core_web_lg", exclude=["parser", "tagger", "lemmatizer"])
ruler = nlp.add_pipe("entity_ruler", after="ner")
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "emerson"}]}]
ruler.add_patterns(patterns)
# the parser is excluded, so sentence boundaries come from the sentencizer
nlp.add_pipe("sentencizer", first=True)

name_dict, desc_dict = load_entities()

# entity_vector_length has to match the length of the description vectors
# added below (300 here - confirm if the base model is changed)
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

# one KB entity per QID, encoded as the document vector of its description
for qid, desc in desc_dict.items():
  desc_doc = nlp(desc)
  desc_enc = desc_doc.vector
  # Set arbitrary value for frequency
  kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)

for qid, name in name_dict.items():
  # set 100% prior probability P(entity|alias) for each unique name
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])

# the ambiguous alias "Emerson" may refer to any of the entities
qids = list(name_dict.keys())
if qids:
  # ensure that sum(probs) <= 1 when setting aliases: keep 0.3 per candidate
  # while that fits (up to three entities), otherwise share the mass equally
  prior = min(0.3, 1.0 / len(qids))
  probs = [prior] * len(qids)
  kb.add_alias(alias="Emerson", entities=qids, probabilities=probs)

print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")
print()

# store knowledgebase and NLP pipeline
kb.to_disk(kb_dir)
print(f"Saved KB in: {kb_dir}")
nlp.to_disk(nlp_dir)
print(f"Saved NLP pipeline in: {nlp_dir}")

Entities in the KB: ['Q215952', 'Q312545', 'Q48226']
Aliases in the KB: ['Roy Stanley Emerson', 'Emerson Ferreira da Rosa', 'Ralph Waldo Emerson', 'Emerson']

Saved KB in: /content/gdrive/My Drive/ie_course/output/ml_el/kb
Saved NLP pipeline in: /content/gdrive/My Drive/ie_course/output/ml_el/my_nlp


### Create Corpora (training and test datasets)

In [None]:
""" Step 2: Once we have done the manual annotations, create corpora in spaCy format. """

##############################################################
# TODO: create annotated dataset for training before this step
##############################################################

# `exclude` gets a list of component names; spaCy does not split a single
# comma-separated string into separate names
nlp = spacy.load(nlp_dir, exclude=["parser", "tagger"])
docs = []
gold_ids = []

# each JSONL line is one annotated sentence; only lines whose "answer" is
# "accept" are used
with open(annot_text_file, "r", encoding="utf8") as f:
  for line_no, line in enumerate(f, start=1):
    if not line.strip():
      # tolerate blank lines (e.g. a trailing newline at the end of the file)
      continue
    example = json.loads(line)
    if example["answer"] != "accept":
      continue
    sentence = example["text"]
    QID = example["accept"][0]
    doc = nlp.make_doc(sentence)
    # we assume only 1 annotated span per sentence, and only 1 KB ID per span
    annotation = example["spans"][0]
    entity = doc.char_span(
      annotation["start"],
      annotation["end"],
      label=annotation["label"],
      kb_id=QID,
    )
    if entity is None:
      # char_span returns None when the offsets do not fall on token
      # boundaries; assigning [None] to doc.ents would raise
      print(f"Skipping line {line_no}: annotated span does not align with token boundaries")
      continue
    doc.ents = [entity]
    # mark the whole text as a single sentence: only the first token starts one
    for i, token in enumerate(doc):
      token.is_sent_start = i == 0
    # append to both lists together so that gold_ids[i] always describes docs[i]
    docs.append(doc)
    gold_ids.append(QID)

print("Statistics of manually annotated data:")
print(Counter(gold_ids))
print()

# split per entity so that each QID is represented in both corpora
train_docs = DocBin()
test_docs = DocBin()
for QID in ["Q312545", "Q48226", "Q215952"]:
  indices = [i for i, j in enumerate(gold_ids) if j == QID]
  # first 8 in training
  for index in indices[0:8]:
    train_docs.add(docs[index])
  # last 2 in test
  for index in indices[8:10]:
    test_docs.add(docs[index])

train_docs.to_disk(train_corpus)
print(f"Saved train corpus in: {train_corpus}")
test_docs.to_disk(test_corpus)
print(f"Saved test corpus in: {test_corpus}")

Statistics of manually annotated data:
Counter({'Q312545': 10, 'Q48226': 10, 'Q215952': 10})

Saved train corpus in: /content/gdrive/My Drive/ie_course/output/ml_el/train_corpus
Saved test corpus in: /content/gdrive/My Drive/ie_course/output/ml_el/test_corpus


### Train entity linking component (ML model)

In [None]:
""" Step 3: Train entity linking model. """

nlp = spacy.load(nlp_dir)

TRAIN_EXAMPLES = []

# Load the TRAINING corpus here. The test corpus must stay unseen until the
# evaluation step, otherwise the evaluation results are not meaningful.
doc_bin = DocBin().from_disk(train_corpus)
docs = doc_bin.get_docs(nlp.vocab)
for doc in docs:
  # predicted side: the pipeline's own output on the raw text (provides the
  # entity spans); reference side: the gold doc carrying the KB IDs
  TRAIN_EXAMPLES.append(Example(nlp(doc.text), doc))

# incl_prior=False: the linker is configured not to include the KB prior
# probabilities in its model
entity_linker = nlp.add_pipe("entity_linker", config={"incl_prior": False}, last=True)
entity_linker.initialize(lambda: TRAIN_EXAMPLES, nlp=nlp, kb_loader=load_kb(kb_dir))

with nlp.select_pipes(enable=["entity_linker"]):  # train only the entity_linker
  optimizer = nlp.resume_training()
  for itn in range(500):  # 500 iterations takes about a minute to train
    random.shuffle(TRAIN_EXAMPLES)
    batches = minibatch(TRAIN_EXAMPLES, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
    losses = {}
    for batch in batches:
      nlp.update(
        batch,
        drop=0.2,  # prevent overfitting
        losses=losses,
        sgd=optimizer,
      )
    if itn % 50 == 0:
      print(itn, "Losses", losses)  # print the training loss
# report the loss of the final iteration as well
print(itn, "Losses", losses)

nlp.to_disk(nlp_el_dir)
print()
print(f"Saved NLP pipeline in: {nlp_el_dir}")

0 Losses {'entity_linker': 1.9307233691215515}
50 Losses {'entity_linker': 0.045570552349090576}
100 Losses {'entity_linker': 0.022603243589401245}
150 Losses {'entity_linker': 0.016279876232147217}
200 Losses {'entity_linker': 0.007489040493965149}
250 Losses {'entity_linker': 0.011767923831939697}
300 Losses {'entity_linker': 0.005197629332542419}
350 Losses {'entity_linker': 0.005071923136711121}
400 Losses {'entity_linker': 0.005336344242095947}
450 Losses {'entity_linker': 0.004006430506706238}
499 Losses {'entity_linker': 0.0030239075422286987}

Saved NLP pipeline in: /content/gdrive/My Drive/ie_course/output/ml_el/my_el_nlp


### Evaluate Entity Linking component

In [None]:
""" Step 4: Evaluate the new Entity Linking component by applying it to unseen text. """

nlp = spacy.load(nlp_el_dir)

examples = []

# DocBin.from_disk reads the file itself, no separate open() is needed
doc_bin = DocBin().from_disk(test_corpus)
docs = doc_bin.get_docs(nlp.vocab)
for doc in docs:
  # predicted: pipeline output on the raw text; reference: gold annotation
  examples.append(Example(nlp(doc.text), doc))


print("RESULTS ON THE DEV SET:")
print()

for example in examples:
  gold_ent = example.reference.ents[0]
  # Compare against the predicted entity that covers the gold mention.
  # Taking predicted.ents[0] instead would report whatever entity comes first
  # in the sentence (another person, a date, ...) and would raise IndexError
  # when nothing was predicted at all.
  predicted_id = "NIL"
  for ent in example.predicted.ents:
    overlaps_gold = ent.start_char < gold_ent.end_char and gold_ent.start_char < ent.end_char
    if overlaps_gold:
      predicted_id = ent.kb_id_ or "NIL"
      break
  print(example.text)
  print(f"Gold annotation: {gold_ent.kb_id_}")
  print(f"Predicted annotation: {predicted_id}")
  print()

print()
print("RUNNING THE PIPELINE ON UNSEEN TEXT:")
text = "Tennis champion Emerson was expected to win Wimbledon."
doc = nlp(text)
print(text)
for ent in doc.ents:
  print(ent.text, ent.label_, ent.kb_id_)
print()

RESULTS ON THE DEV SET:

Emerson's first Wimbledon singles title came in 1964, with a final victory over Fred Stolle.
Gold annotation: Q312545
Predicted annotation: Q312545

Emerson was inducted into the International Tennis Hall of Fame in 1982 and the Sport Australia Hall of Fame in 1986.
Gold annotation: Q312545
Predicted annotation: Q312545

Carlyle in particular was a strong influence on him; Emerson would later serve as an unofficial literary agent in the United States for Carlyle, and in March 1835, he tried to persuade Carlyle to come to America to lecture.
Gold annotation: Q48226
Predicted annotation: NIL

In 1841 Emerson published Essays, his second book, which included the famous essay "Self-Reliance".
Gold annotation: Q48226
Predicted annotation: NIL

Emerson scored his second international goal on 31 March 1999, in a friendly match against Japan in Tokyo, which Brazil won 2-0.
Gold annotation: Q215952
Predicted annotation: Q215952

Emerson made his Brazil debut on 10 Septe