# Named Entity Corrections

In this notebook we test the named entity recognition in the spaCy language model.

Each sentence in each document is reviewed by displaying the named entities it contains.

Any errors are noted and a report is produced.

The errors are corrected with a custom pipeline component added to the pipeline.

## Import the files

In [1]:
%%time

import datetime
import os

# Root folder holding one sub-folder of speech transcripts per orator.
filepath = 'C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Speeches/'

# Both sub-folder names end in '/', so a file name can be appended directly.
bushpath = os.path.join(filepath, 'George Bush/')
binladenpath = os.path.join(filepath, 'Osama bin Laden/')

# Text of every speech, in the order the files are listed; joined into `raw` below.
speech_texts = []

# Transcripts read from the 'George Bush/' folder.
FileList = [
    '20010114-Remarks at the National Day of Prayer & Remembrance Service.txt',
    '20010115-First Radio Address following 911.txt',
    '20010117-Address at Islamic Center of Washington, D.C..txt',
    '20010120-Address to Joint Session of Congress Following 911 Attacks.txt',
    '20010911-Address to the Nation.txt',
    '20011007-Operation Enduring Freedom in Afghanistan Address to the Nation.txt',
    '20011011-911 Pentagon Remembrance Address.txt',
    '20011011-Prime Time News Conference on War on Terror.txt',
    '20011026-Address on Signing the USA Patriot Act of 2001.txt',
    '20011110-First Address to the United Nations General Assembly.txt',
    '20011211-Address to Citadel Cadets.txt',
    '20011211-The World Will Always Remember 911.txt',
    '20020129-First (Official) Presidential State of the Union Address.txt',
]
for speech_file in FileList:
    with open(bushpath + speech_file, 'r') as handle:
        speech_texts.append(handle.read())

# Transcripts read from the 'Osama bin Laden/' folder (the list name is reused).
FileList = [
    '19960823-OBL Declaration.txt',
    '20011007-OBL Full Warning.txt',
    '20011109-OBL.txt',
    '20021124-OBL Letter to America.txt',
    '20041101-Al Jazeera Speech.txt',
]
for speech_file in FileList:
    with open(binladenpath + speech_file, 'r') as handle:
        speech_texts.append(handle.read())

# One long string containing every speech back to back, with no separator added.
raw = ''.join(speech_texts)

# Disabled: write the combined text out alongside the speeches.
# with open(os.path.join(filepath, "fulltext.txt"), 'w') as text:
#         text.write(raw)

print('length of doc: ', len(raw))
print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')

length of doc:  220536
completed at: Apr 15 2020 20:10:52
Wall time: 8.98 ms


## Set up the spaCy pipeline

In [2]:
%%time

import spacy
# Name of the spaCy model package to load.
model = 'en_core_web_md'
print('loading: ', model)

# `nlp` is the language pipeline object that the following cells modify and call.
nlp = spacy.load(model)

finished_at = datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
print(f'completed at: {finished_at}')

loading:  en_core_web_md
completed at: Feb 27 2020 15:07:38
Wall time: 19 s


In [1]:
%%time
import os
import json
import datetime


# Entity corrections: maps an entity label to the list of terms that should carry
# that label. This object forms the basis for the custom pipeline component and is
# also serialised to JSON.
#
# Fixes applied to the term lists:
#   * GPE: a missing comma used to fuse "land of the two holy mosques" and
#     "country of the two holy mosques" into one string via implicit concatenation.
#   * LAW: "Human Rights" had a stray leading tab character inside the literal.
#
# NOTE(review): "makka" is listed under both FAC and GPE, and "allahu akbar" under
# both WORK_OF_ART and RELIGIOUS_WORK_OF_ART - confirm which label is intended.
named_entity_corrections = {

    # inbuilt with spaCy
    "PERSON" : ["usama bin muhammad bin ladin"],
    "NORP" : ["ahlul-sunnah", "infidel", "kuffar", "kafiroon", "kaferoon", "muslim", "da'ees", "ulama", "afghan", "afghans", "Afghans"],
    "FAC"  : ["makka", "ka'ba", "capitol", "guadalcanal", "the world trade center",
              "the treaty room of the white house"],
    "ORG" : ["bani quraydah", "taliban", "al qaeda", "egyptian islamic jihad", "islamic movement of uzbekistan", "FBI",
             "republicans", "democrats", "mafia", "crusaders", "mujahideen", "mujahidin", "halliburton", "Jaish-i-Mohammed",
             "ummah", "quraysh", "bani qainuqa'"],
    "GPE" : ["NATO", "arabian peninsula", "land of the two holy places", "country of the two holy places", "land of the two holy mosques",
             "country of the two holy mosques", "qana", "assam", "erithria", "chechnia", "makka", "makkah", "qunduz", "mazur-e-sharif", "rafah"],
    "LOC" : ["dar al-islam", "kabal", "iwo jima", "ground zero", "world", "dunya", "Hindu Kush"],
    "PRODUCT" : ["united 93", "global hawk", "flight 93", "predator"],
    "EVENT" : ["september 11th"],
    "WORK_OF_ART" : ["national anthem", "memorandum", "flag", "the marshall plan", "semper fi", "allahu akbar"],
    "LAW" : ["constitution", "anti-ballistic missile treaty", "the treaty of hudaybiyyah", "kyoto agreement", "Human Rights"],
    "LANGUAGE" : [],
    "DATE" : ["shawwaal", "muharram", "rashidoon"],
    "TIME" : [],
    "PERCENT" : [],
    "MONEY" : ["riyal"],
    "QUANTITY" : [],
    "ORDINAL" : [],
    "CARDINAL" : [],

    ##user defined
    "DIRECTVIOLENCE" : ["gulf war"],
    "STRUCTURALVIOLENCE" : ["cold war", "war on terror"],
    "RELIGION" : ["islam", "christianity"],
    "DEITY" : ["hubal", "god", "Lord", "almighty"],
    "RELIGIOUSFIGURE" : ["jesus", "abraham", "jibreel", "ishmael", "isaac", "allah", "imraan", "hud", "aal-imraan", "al-ma'ida",
                         "baqarah", "an-nisa", "al-ahzab", "shu'aib", "al'iz ibn abd es-salaam",
                         "ibn taymiyyah", "an-noor", "majmoo' al fatawa", "luqman", "al-masjid an-nabawy",
                         "abd ur-rahman ibn awf", "abu jahl", "aal imraan", "the messenger of allah",
                         "Saheeh Al-Jame", "at-tirmidhi", "at-taubah", "haroon ar-rasheed", "ameer-ul-mu'mineen",
                         "assim bin thabit", "moses", "satan"],
    "RELIGIOUSLAW" : ["halal", "haram", "shari'a", "mushrik", "fatwa", "fatwas", "shariah", "shari'ah"],
    "RELIGIOUSCONFLICT" : ["jihad", "crusade"],
    "RELIGIOUS_WORK_OF_ART" : ["koranic", "Quran", "quran", "Koran", "as-sayf", "taghut", "torah", "psalm", "qiblah", "allahu akbar"],
    "RELIGIOUS_EVENT" : ["Hegira", "the Day of Judgment"],
    "RELIGIOUSENTITY" : ["MECCA"],
    "RELIGIOUS_FAC" : ["kaa'ba", "ka'bah"],
}

# Folder that receives the serialised corrections.
filepath = r'C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset'

## create file to store entity corrections
# The dictionary is dumped to a JSON string, encoded as UTF-8 and written as bytes.
corrections_file = os.path.join(filepath, "named_entity_corrections.json")
corrections_bytes = json.dumps(named_entity_corrections).encode("utf-8")
with open(corrections_file, "wb") as f:
    f.write(corrections_bytes)

finished_at = datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
print(f'completed at: {finished_at}')

completed at: Jul 23 2020 10:40:52
Wall time: 2 ms


In [None]:
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy.pipeline import merge_noun_chunks
import pandas as pd

# create entity ruler for custom pipeline component
# overwrite_ents=True lets these corrections replace overlapping entities set earlier
# in the pipeline; phrase_matcher_attr="LOWER" makes phrase patterns case-insensitive.
entities = EntityRuler(nlp, overwrite_ents=True, phrase_matcher_attr = "LOWER")

# Register every term as a *phrase* (string) pattern under its label.
# The earlier token pattern [{"LOWER": {"IN": value}}] described exactly one token,
# so multi-word or hyphenated terms and any term written with capitals could never
# match. Phrase patterns are tokenised by the ruler itself and compared on the
# LOWER attribute, which covers all of those cases.
for key, value in named_entity_corrections.items():
    # Strip stray surrounding whitespace and skip blank terms.
    pattern = [{"label" : key, "pattern" : term.strip()} for term in value if term.strip()]
    if pattern:
        entities.add_patterns(pattern)

# modify spaCy pipeline with custom component
    
import json
from spacy.pipeline import merge_entities
from spacy.strings import StringStore

# Remove every pipeline component except the tagger, parser and ner.
kept_pipes = ('tagger', "parser", "ner")
for pipe in nlp.pipe_names:
    if pipe in kept_pipes:
        continue
    nlp.remove_pipe(pipe)

# Add each label to the vocabulary's string store.
for label in named_entity_corrections:
    nlp.vocab.strings.add(label)

# The entity ruler runs straight after ner; merge_entities is placed last.
nlp.add_pipe(entities, after = "ner")
# nlp.add_pipe(ent_matcher, before = "ner")
nlp.add_pipe(merge_entities, last = True)
#nlp.add_pipe(merge_noun_chunks, last = True)

print("Pipeline Components")
print(' | '.join(nlp.pipe_names))

# Run the full pipeline over the combined speech text.
print("processing doc")
doc = nlp(raw)
print("doc processed")

#print out the corrections
for banner_line in ('-----', "current corrections", '-----'):
    print(banner_line)

# Only labels that have at least one term are listed; terms are shown upper-cased.
for label, terms in named_entity_corrections.items():
    if not terms:
        continue
    print(label, [term.upper() for term in terms])

#         patterns = [nlp.make_doc(text) for text in pattern["pattern"]] # -- used for PhraseMatcher
#         self.matcher.add(pattern["label"], None, *patterns)

finished_at = datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
print(f'completed at: {finished_at}')

## Review Each Sentence to Check for Corrections

Iterate through each sentence to review the named entities.

Check the named entity against the Wikipedia entry.

Correct as required.

In [None]:
import wikipediaapi
import pandas as pd
import os

def get_wikisummary(token):
    """Return the Wikipedia title and first summary sentence for ``token``.

    Args:
        token: text to look up as an English Wikipedia page name.

    Returns:
        A ``(title, first_sentence)`` tuple. ``first_sentence`` is the first
        sentence of the page summary with runs of whitespace collapsed to single
        spaces, or ``''`` when the summary yields no sentence. When the page does
        not exist, both elements are ``'no wiki reference'``.
    """
    wiki_wiki = wikipediaapi.Wikipedia('en')
    page_py = wiki_wiki.page(token)

    if not page_py.exists():
        return ('no wiki reference', 'no wiki reference')

    # Sentence boundaries come from the `nlp` pipeline defined elsewhere in the notebook.
    # NOTE(review): 'tokenizer' is presumably not a pipeline component name, so listing
    # it under `disable` likely has no effect - confirm.
    summary_doc = nlp(page_py.summary, disable = ['tokenizer', 'ner'])

    # An empty summary produces no sentences; fall back to an empty string rather than
    # letting StopIteration escape.
    first_sentence = next(iter(summary_doc.sents), None)
    if first_sentence is None:
        return (page_py.title, '')

    return (page_py.title, " ".join(str(first_sentence).split()))


# `displacy` renders each sentence with its entities highlighted; it is imported here
# because no other cell in the notebook brings it into scope.
from spacy import displacy

# Folder holding the review state (seen tokens) and the corrections file.
# NOTE(review): unlike the path used by the report cell, this one has no
# "OneDrive - " segment - confirm which folder is intended.
filepath = "C:/Users/Steve/University of Southampton/CulturalViolence/KnowledgeBases/Experiment 2 - Testing Named Entity Recognition in the spaCy models/"

# Entity labels that are not put forward for manual review.
skipped_ent_types = ['ORATOR', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

if input("Restart from fresh (y/n): ").lower() == 'n':
    # Resume: reload the corrections recorded so far and the tokens already reviewed.
    filename = input('existing filename: ')

    with open(os.path.join(filepath, filename), 'r') as fp:
        corrections_dict = json.load(fp)

    with open(os.path.join(filepath, "seen_tokens.json"), 'r') as fp:
        seen_tokens = {key for key in json.load(fp)}

else:
    corrections_dict = dict()
    seen_tokens = set()

# Show full cell contents in the review tables (set once, not per token).
# NOTE(review): pandas 1.0 deprecated -1 here in favour of None - confirm the
# installed pandas version before changing the value.
pd.set_option('display.max_colwidth', -1)

### !!! The bin laden object here needs to be changed.

# Only the index is used from the enumeration; the loop variable is named so that it
# does not overwrite the `doc` created by the pipeline cell.
for i, _speech in enumerate(binladen):

    for token in binladen.speeches_nlp[i].text_nlp:

        # Review each distinct entity text once, ignoring the skipped labels.
        if not token.ent_type_:
            continue
        if token.ent_type_ in skipped_ent_types:
            continue
        if token.text in seen_tokens:
            continue

        seen_tokens.add(token.text)

        # Persist progress straight away so an interrupted session can be resumed.
        with open(os.path.join(filepath, "seen_tokens.json"), "wb") as f:
            f.write(json.dumps(dict.fromkeys(seen_tokens)).encode("utf-8"))

        wikientry = get_wikisummary(token.text)

        entries_dict = dict()
        entries_dict[token.text] = [token.ent_type_, wikientry[0], wikientry[1]]
        entries_dict['sentence'] = ['', '', token.sent]
        displacy.render(token.sent, style = 'ent')

        display(pd.DataFrame.from_dict(entries_dict, orient='index', columns = ['ent_type_', 'wiki_title', 'summary'])
            .style.set_properties(**{'text-align': 'left'})
            .set_table_styles([dict(selector='th', props=[('text-align', 'left')])]))

        if input('correct y/n ').lower() == 'n':
            corrections_dict[token.text] = {
                'original ent_type_' : token.ent_type_,
                'wiki_title': wikientry[0],
                'wiki_summary' : wikientry[1],
                'correction' : input('correct type')
            }

            ### check wiki entry and correct with manual entry if required

            answer = 'n'
            while answer == 'n':
                display(pd.DataFrame.from_dict(corrections_dict[token.text], orient = "index"))

                answer = input('correct wiki entry? (y/n)').lower()

                if answer != 'n':
                    break

                # The reviewer rejected the entry: re-enter every field by hand.
                corrections_dict[token.text] = {
                    'original ent_type_' : token.ent_type_,
                    'wiki_title': input("wiki_title: "),
                    'wiki_summary' : input("wiki_summary: "),
                    'correction' : input("correct type: ")
                }

            # Save after every correction (the original referenced the misspelled,
            # undefined name `filapth` here, which raised NameError).
            with open(os.path.join(filepath, "binladen_entitycorrections.json"), "wb") as f:
                f.write(json.dumps(corrections_dict).encode("utf-8"))

print('complete')

## Create PDF Report for Each Orator

In [None]:
import json
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from weasyprint import HTML

# Folder containing the corrections JSON, the report template and the stylesheet;
# the PDF is written to the same place.
filepath = "C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Experiment 2 - Testing Named Entity Recognition in the spaCy models/"

corrections_file = os.path.join(filepath, "binladen_entitycorrections.json")
with open(corrections_file, 'r') as fp:
    questions = json.load(fp)

# Templates are looked up in the same folder.
env = Environment(loader=FileSystemLoader(searchpath=filepath))
template = env.get_template('myreport.html')

# Transposed so that each corrected entity becomes one row.
table = pd.DataFrame.from_dict(questions).T

template_vars = {
    "title" : "bin Laden Entity Corrections",
    "islamic_terms": table.to_html(),
}
html_out = template.render(template_vars)

# Render the HTML to PDF using the stylesheet from the same folder.
pdf_file = os.path.join(filepath, "binladen_entitycorrections.pdf")
css_file = os.path.join(filepath, "style.css")
HTML(string=html_out).write_pdf(pdf_file, stylesheets=[css_file])

# Widen the pandas display limits before showing the table inline.
for option_name, option_value in (
    ('expand_frame_repr', False),
    ("display.max_columns", 999),
    ("display.max_rows", 999),
):
    pd.set_option(option_name, option_value)

left_aligned_headers = [dict(selector='th', props=[('text-align', 'left')])]
display(
    pd.DataFrame.from_dict(questions).T
    .style.set_properties(**{'text-align': 'left'})
    .set_table_styles(left_aligned_headers)
)

print(f'completed at {str(datetime.datetime.now())}')