# Automatic corpus building using Wikipedia

## #1. Set up development environment

### Update & import Python modules

In [None]:
# install and download spaCy related modules
!pip install --upgrade spacy
!python -m spacy download en_core_web_lg # using large model (lg)
!pip install wikipedia
!pip install bs4

# spaCy
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# Beautiful Soup
from bs4 import BeautifulSoup

# Wikipedia API
import wikipedia

# general Python modules
import json
import datetime
import requests
from pprint import pprint
import re

### Get access to Firebase and Drive

In [None]:
# Mount Google Drive and open a Firestore client using the service-account key
# stored in Drive. Defines DRIVE_PATH, credential and db for the cells below.

# remount drive, forced if needed
drive.mount("/content/gdrive/", force_remount = True)
print("Stablished access to Google Drive")

# initialize Drive path
DRIVE_PATH = "/content/gdrive/My Drive"

# open Firebase credentials (service-account key saved as JSON in Drive)
with open(DRIVE_PATH + "/ie_course/credentials/firebase_credentials.json", encoding = "utf-8") as f:
  credential = json.load(f)
credential = credentials.Certificate(credential)

# create the default Firebase app only once: initialize_app() raises ValueError
# when the default app already exists, which made re-running this cell fail
try:
  firebase_admin.get_app()
except ValueError:
  firebase_admin.initialize_app(credential)

# create Firestore database instance
db = firestore.client()
print("Stablished access to Firestore")

Mounted at /content/gdrive/
Stablished access to Google Drive
Stablished access to Firestore


## #2. Build corpus from Wikipedia

### Method 1: Scrape text from Wikipedia article

In [None]:
# Scrape the paragraphs of a Wikipedia article, clean them, split them into
# sentences with spaCy and save the sentences as a JSON list in Drive.

# Wikipedia slug of entity
#entity_slug = "Kamala_Harris"
entity_slug = "Joe_Biden"

# download the article; fail loudly on an HTTP error instead of parsing an error page
r = requests.get(f"https://en.wikipedia.org/wiki/{entity_slug}", timeout = 30)
r.raise_for_status()

# parse text from the Wikipedia page, from p elements
soup = BeautifulSoup(r.text, "html.parser")
p_els = soup.find_all("p")
text = [p.text for p in p_els]

# citation markers such as [1], [12, 13] or [3-5]
# (raw string: the backslashes belong to the regex, not to Python escapes)
regex_wikipedia_citation = r"(\[\d+(,\s?\d+|\d*-\d+)*\])"

# basic text preprocessing
processed_text = []
for p in text:
  # turn line breaks into a single space so adjacent words are not glued together
  p = re.sub(r"\s*\n\s*", " ", p)
  # remove every citation marker (no cap on how many a paragraph may contain)
  p = re.sub(regex_wikipedia_citation, "", p)
  # remove leading/trailing blank spaces
  p = p.strip()
  if p == "": # ignore paragraphs that are empty after cleaning
    continue
  processed_text.append(p)
text = processed_text

# initialize spaCy pipeline and container of sentences
nlp = spacy.load("en_core_web_lg")
sentences_container = []

# split every paragraph into sentences
for doc in nlp.pipe(text):
  sentences_container.extend(sent.text for sent in doc.sents)

# save record in JSON file
with open(DRIVE_PATH + f"/ie_course/output/{entity_slug.lower()}_context_texts_1.json", "w", encoding = "utf-8") as f:
  json.dump(sentences_container, f, ensure_ascii = False, indent = 2)
  print(f"Saved {len(sentences_container)} context sentences")

Saved 459 context sentences


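### Method 2: Retrieve the article text through the Wikipedia API (Python module)
See the documentation of the `wikipedia` Python package, a third-party wrapper around the MediaWiki API: https://pypi.org/project/wikipedia/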

In [None]:
# Fetch the plain-text content of a Wikipedia article through the `wikipedia`
# module, split it into sentences with spaCy, clean each sentence and save the
# result as a JSON list in Drive.

entity_name = "Joe Biden"
# slug used in the output file name; derived here so this cell does not depend
# on a variable left behind by another cell
entity_slug = entity_name.replace(" ", "_")

# parse text content from Wikipedia article
wikipedia_page = wikipedia.page(entity_name, auto_suggest=False)
text = wikipedia_page.content

# initialize spaCy pipeline
nlp = spacy.load("en_core_web_lg")

# split text into sentences
sentences = [sent.text for sent in nlp(text).sents]

# citation markers such as [1], [12, 13] or [3-5]
# (raw string: the backslashes belong to the regex, not to Python escapes)
regex_wikipedia_citation = r"(\[\d+(,\s?\d+|\d*-\d+)*\])"

# basic text preprocessing
processed_text = []
for sent in sentences:
  # turn line breaks into a single space so adjacent words are not glued together
  sent = re.sub(r"\s*\n\s*", " ", sent)
  # remove every citation marker (no cap on how many a sentence may contain)
  sent = re.sub(regex_wikipedia_citation, "", sent)
  # remove leading/trailing blank spaces
  sent = sent.strip()
  if sent == "": # ignore sentences that are empty after cleaning
    continue
  processed_text.append(sent)
text = processed_text

# save record in JSON file
with open(DRIVE_PATH + f"/ie_course/output/{entity_slug.lower()}_context_texts_2.json", "w", encoding = "utf-8") as f:
  json.dump(text, f, ensure_ascii = False, indent = 2)
  print(f"Saved {len(text)} context sentences")

Saved 445 context sentences


### Create lexicon of entity name/aliases from Wikidata

In [None]:
""" Retrieve entity info from Wikidata and make a list of aliases, by combining label + aliases """

# Wikidata id of entity
#qid = "Q10853588" # Kamala Harris
qid = "Q6279" # Joe Biden

# fetch entity info from the Wikidata API (entity endpoint)
api_url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
r = requests.get(api_url, params={"format": "json"}, timeout = 30)
r.raise_for_status() # stop here on an HTTP error instead of failing on the JSON lookup
# simplify access to root elements of JSON object
entity_info = r.json()["entities"][qid]

# get entity aliases (empty list when the entity has no English aliases,
# so the name is always defined for the lines below)
aliases = [a["value"] for a in entity_info.get("aliases", {}).get("en", [])]

# create container of gazetteers
gazetteers = aliases

# get entity name and, from it, the last name; both are skipped when the
# entity has no English label
english_label = entity_info.get("labels", {}).get("en", {}).get("value")
if english_label:
  gazetteers.append(english_label)
  name_parts = english_label.split()
  if name_parts:
    last_name = name_parts[-1]
    gazetteers.append(last_name)


pprint(gazetteers)

['Joseph Biden',
 'Joseph R. Biden',
 'Joseph R. Biden Jr.',
 'Joseph R. Biden, Jr.',
 'Biden',
 'Joey Biden',
 'JRB',
 'POTUS 46',
 'Joe R. Biden Jr.',
 'Joseph Robinette Biden',
 'President Biden',
 'President Joe Biden',
 'President Joseph Biden',
 'President Joseph R. Biden',
 'Joseph Robinette Biden Jr.',
 'President Joseph Biden Jr.',
 'President Joseph Robinette Biden',
 'President Joseph R. Biden Jr.',
 'Joe R. Biden',
 'President Joseph Robinette Biden Jr.',
 'Joe Biden Jr.',
 'Dark Brandon',
 'Joe Biden',
 'Biden']


### Create NLP pipeline and add PhraseMatcher

In [None]:
# rule-based spaCy phrase matcher built on the vocabulary of the loaded pipeline
matcher = PhraseMatcher(nlp.vocab, None)
# tokenize every entry of the lexicon into a pattern Doc
patterns = list(map(nlp.make_doc, gazetteers))
# register all patterns under a single "gazetteers" key
matcher.add("gazetteers", patterns)

### Use gazetteers to filter contextual sentences

In [None]:
# Keep only the sentences that mention the entity and record the character
# offsets of every mention. Each saved item has the shape
# [sentence, {"entities": [(start_char, end_char, "PERSON"), ...]}].

# load the sentences saved by one of the corpus-building cells
# (change the suffix to "_context_texts_1.json" to use the scraped version);
# the file was written as UTF-8, so it is read back with the same encoding
with open(DRIVE_PATH + f"/ie_course/output/{entity_slug.lower()}_context_texts_2.json", encoding = "utf-8") as f:
  text = json.load(f)

main_text_container = []

# pipeline components switched off while matching; the matcher runs on the tokenized Doc
disabled_pipelines = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"]

for paragraph in text:
  # split paragraph in sentences
  sentences = [sent.text for sent in nlp(paragraph).sents]

  # process sentences individually
  for doc in nlp.pipe(sentences, batch_size=50, disable=disabled_pipelines):
    # spans of the lexicon entries found in this sentence; a separate name is
    # used so the `gazetteers` lexicon is not overwritten by match results
    matches = [doc[start:end] for _, start, end in matcher(doc)]
    # filter overlapping matches (spans) so each mention is annotated once
    filtered_matches = spacy.util.filter_spans(matches)

    # skip sentences without any mention of the entity
    if not filtered_matches:
      continue

    entities = [(span.start_char, span.end_char, "PERSON") for span in filtered_matches]
    main_text_container.append([doc.text, {"entities": entities}])

# save record in JSON file
with open(DRIVE_PATH + f"/ie_course/output/{entity_slug.lower()}_ner_corpus.json", "w", encoding = "utf-8") as f:
  json.dump(main_text_container, f, ensure_ascii = False, indent = 2)
  print(f"Saved {len(main_text_container)} annotated sentences")

Saved 277 annotated sentences
