# Named Entity Corrections

In this notebook we test the named entity recognition in the spaCy language model.

Each sentence in each document is reviewed by displaying the named entities in each.

Any errors are noted and a report is produced.

The errors are corrected with an custom pipeline component added to the pipeline.

## Import the files

In [35]:
%%time

import datetime
import os

def get_dataset_dirpath(cwd):
    
    two_up = os.path.dirname(os.path.dirname(cwd))
    
    return os.path.join(two_up, 'dataset')

dataset_dirpath = get_dataset_dirpath(os.getcwd())

binladenpath = os.path.join(dataset_dirpath, 'Osama bin Laden/')
bushpath = os.path.join(dataset_dirpath, 'George Bush/')


Bush_FileList = [
    '20010914-Remarks at the National Day of Prayer & Remembrance Service.txt',
    '20010915-First Radio Address following 911.txt',
    '20010917-Address at Islamic Center of Washington, D.C..txt',
    '20010920-Address to Joint Session of Congress Following 911 Attacks.txt',
    '20010911-911 Address to the Nation.txt',
    '20011007-Operation Enduring Freedom in Afghanistan Address to the Nation.txt',
    '20011011-911 Pentagon Remembrance Address.txt',
    '20011011-Prime Time News Conference on War on Terror.txt',
    '20011026-Address on Signing the USA Patriot Act of 2001.txt',
    '20011110-First Address to the United Nations General Assembly.txt',
    '20011211-Address to Citadel Cadets.txt',
    '20011211-The World Will Always Remember 911.txt',
    '20020129-First (Official) Presidential State of the Union Address.txt'
]

binLaden_FileList = [
    '19960823-Declaration of Jihad Against the Americans Occupying the Land of the Two Holiest Sites.txt',
    '20010107-Osama Bin Laden Letter Calling For Global Islamic State.txt',
#     '20011109-Bin Laden\'s Statement The Sword Fell.txt',
    '20021124-OBL Letter to America.txt',
    '20041101-Al Jazeera Speech.txt'
           ]

records = {
    "bush": {"filepath": bushpath, "texts": Bush_FileList},
    "binladen": {"filepath": binladenpath, "texts": binLaden_FileList}
}

for orator, record in records.items():

    raw = ""
    for text in record["texts"]:
        with open(os.path.join(record["filepath"], text), 'r') as text:
            raw = raw + text.read()
                         
    print(f'{orator} doc length = {len(raw)}')
    
    fulltext_path = os.path.join(record["filepath"], 'fulltext.txt')
        
    with open(fulltext_path, 'w') as text:
            text.write(raw)

print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')

bush doc length = 111934
binladen doc length = 86384
completed at: Aug 15 2022 14:41:08
CPU times: user 2.38 ms, sys: 3.3 ms, total: 5.67 ms
Wall time: 8.07 ms


# Setup spaCy pipeline

In [2]:
%%time

import spacy
import datetime
model = 'en_core_web_md'
print('loading: ', model)
nlp = spacy.load(model)
nlp.add_pipe("merge_entities")
nlp.add_pipe("entityfishing", config={"extra_info": True})

print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')

loading:  en_core_web_md
completed at: Aug 15 2022 14:03:12
CPU times: user 2.83 s, sys: 513 ms, total: 3.35 s
Wall time: 6.95 s


In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)


text = """This group and its leader -- a person named Usama bin Laden -- are linked to many other organizations in different countries, including the Egyptian Islamic Jihad and the Islamic Movement of Uzbekistan"""
doc = nlp(text)

display(pd.DataFrame(
    [
        {
            'Named Entity': ent.root.text,
            'Label': ent.root.ent_type_,
            'Description': spacy.explain(ent.root.ent_type_),
            'Wikidata ID': ent._.kb_qid,
            'Nerd Score': ent._.nerd_score,
            'Normal term': ent._.normal_term, 
#             'Wikidata': ent._.description,
#             'Other IDs': ent._.other_ids

        }
     for ent in doc.ents
    ]
))

display(pd.DataFrame(
    [
        {
#             'Named Entity': ent.root.text,
#             'Label': ent.root.ent_type_,
#             'Description': spacy.explain(ent.root.ent_type_),
#             'Wikidata ID': ent._.kb_qid,
#             'Nerd Score': ent._.nerd_score,
            'Normal term': ent._.normal_term, 
            'Wikidata': ent._.description,
#             'Other IDs': ent._.other_ids

        }
     for ent in doc.ents
    ]
))

Unnamed: 0,Named Entity,Label,Description,Wikidata ID,Nerd Score,Normal term
0,Usama bin Laden,PERSON,"People, including fictional",Q1317,0.9258,Osama bin Laden
1,the Egyptian Islamic Jihad,ORG,"Companies, agencies, institutions, etc.",Q310214,0.3634,Egyptian Islamic Jihad
2,the Islamic Movement of Uzbekistan,ORG,"Companies, agencies, institutions, etc.",,,


Unnamed: 0,Normal term,Wikidata
0,Osama bin Laden,"'''Osama bin Mohammed bin Awad bin Laden''' (March 10, 1957 – May 2, 2011), also [[transliterated]] as '''Usama bin Ladin''', was a [[Saudi Arabian]] [[terrorist]] and founder of the [[Pan-Islamic]] militant organization . The group is designated as a [[List of designated terrorist groups|terrorist group]] by the [[United Nations Security Council]], the [[NATO|North Atlantic Treaty Organization]] (NATO), the [[European Union]], and various countries. Under bin Laden's leadership, al-Qaeda was responsible for the [[September 11 attacks]] in the [[United States]], and many [[Timeline of al-Qaeda attacks|other mass-casualty attacks]] worldwide."
1,Egyptian Islamic Jihad,"The '''Egyptian Islamic Jihad''' ('''EIJ''', ), formerly called simply '''Islamic [[Jihad]]''' ( and ""Liberation Army for Holy Sites""), originally referred to as '''al-Jihad''', and then '''the Jihad Group''', or '''the Jihad Organization''', is an Egyptian [[Islamist]] [[terrorist group|group]] active since the late 1970s. It is under worldwide embargo by the [[United Nations]] as an affiliate of [[Al-Qaeda]]. It is also banned by several individual governments worldwide. The group is a Proscribed Organisation in the [[United Kingdom]] under the [[Terrorism Act 2000]]."
2,,


In [40]:
import os
filename = """20010920-Address to Joint Session of Congress Following 911 Attacks.txt"""
filepath = os.path.join(records["bush"]["filepath"], filename)
with open(filepath, 'r') as t:
    text = t.read()
    
doc = nlp(text)
type(doc._.coref_resolved)

str

In [15]:
%%time
import os
import json
import datetime


# setup object to store entity corrections, which in turn forms the basis for the custom pipeline component.
named_entity_corrections = {
    
    # inbuilt with spaCy
    "PERSON" : ["usama bin muhammad bin ladin"],
    "NORP" : ["ahlul-sunnah", "infidel", "kuffar", "kafiroon", "kaferoon", "muslim", "da'ees", "ulama", "afghan", "afghans", "Afghans"],
    "FAC"  : ["makka", "ka'ba", "capitol", "guadalcanal", "the world trade center", \
              "the treaty room of the white house"],
    "ORG" : ["bani quraydah", "taliban", "al qaeda", "egyptian islamic jihad", "islamic movement of uzbekistan", "FBI", \
            "republicans", "democrats", "mafia", "crusaders", "mujahideen", "mujahidin", "halliburton", "Jaish-i-Mohammed", \
            "ummah", "quraysh", "bani qainuqa'"],
    "GPE" : ["NATO", "arabian peninsula", "land of the two holy places", "country of the two holy places", "land of the two holy mosques" \
             "country of the two holy mosques", "qana", "assam", "erithria", "chechnia", "makka", "makkah", "qunduz", "mazur-e-sharif", "rafah"],
    "LOC" : ["dar al-islam", "kabal", "iwo jima", "ground zero", "world", "dunya", "Hindu Kush"],
    "PRODUCT" : ["united 93", "global hawk", "flight 93", "predator"],
    "EVENT" : ["september 11th"],
    "WORK_OF_ART" : ["national anthem", "memorandum", "flag", "the marshall plan", "semper fi", "allahu akbar"],
    "LAW" : ["constitution", "anti-ballistic missile treaty", "the treaty of hudaybiyyah", "kyoto agreement", "	Human Rights"],
    "LANGUAGE" : [],
    "DATE" : ["shawwaal", "muharram", "rashidoon"],
    "TIME" : [],
    "PERCENT" : [],
    "MONEY" : ["riyal"],
    "QUANTITY" : [],
    "ORDINAL" : [],
    "CARDINAL" : [],
    
    ##user defined
    "DIRECTVIOLENCE" : ["gulf war"],
    "STRUCTURALVIOLENCE" : ["cold war", "war on terror"],
    "RELIGION" : ["islam", "christianity"],
    "DEITY" : ["hubal", "god", "Lord", "almighty"],
    "RELIGIOUSFIGURE" : ["jesus", "abraham", "jibreel", "ishmael", "isaac", "allah", "imraan", "hud", "aal-imraan", "al-ma'ida", \
                         "baqarah", "an-nisa", "al-ahzab", "shu'aib", "al'iz ibn abd es-salaam", \
                        "ibn taymiyyah", "an-noor", "majmoo' al fatawa", "luqman", "al-masjid an-nabawy", \
                        "abd ur-rahman ibn awf", "abu jahl", "aal imraan", "the messenger of allah", \
                        "Saheeh Al-Jame", "at-tirmidhi", "at-taubah", "haroon ar-rasheed", "ameer-ul-mu'mineen", \
                        "assim bin thabit", "moses", "satan"],
    "RELIGIOUSLAW" : ["halal", "haram", "shari'a", "mushrik", "fatwa", "fatwas", "shariah", "shari'ah"],
    "RELIGIOUSCONFLICT" : ["jihad", "crusade"],
    "RELIGIOUS_WORK_OF_ART" : ["koranic", "Quran", "quran", "Koran", "as-sayf", "taghut", "torah", "psalm", "qiblah", "allahu akbar"],
    "RELIGIOUS_EVENT" : ["Hegira", "the Day of Judgment"],
    "RELIGIOUSENTITY" : ["MECCA"],
    "RELIGIOUS_FAC" : ["kaa'ba", "ka'bah"],
}

dataset_dirpath = get_dataset_dirpath(os.getcwd())
corrections_dirpath = os.path.join(dataset_dirpath, "named_entity_corrections.json")
json_object = json.dumps(named_entity_corrections, indent=4)

## create file to store entity corrections    
with open(corrections_dirpath, 'w') as f:
    f.write(json_object)
    
print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')


completed at: Aug 04 2022 14:01:07
CPU times: user 1.86 ms, sys: 4.12 ms, total: 5.99 ms
Wall time: 660 ms


In [24]:

# create entity ruler for custom pipeline component
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("merge_entities")
    
ruler = nlp.add_pipe("entity_ruler", config={"validate": True})

for key, value in named_entity_corrections.items():
    pattern = {"label" : key, "pattern" : [{"LOWER" : {"IN" : value}}]}, #, "POS" : {"IN": ["PROPN", "NOUN"]}
    ruler.add_patterns(pattern)
    
nlp.add_pipe("merge_entities", last = True)

print("Pipeline Components")
print(' | '.join(nlp.pipe_names))

print("processing doc")
doc = nlp(raw)
print("doc processed")

print('-----')
print("current corrections")
print('-----')
#print out the corrections
for label, terms in named_entity_corrections.items():
    if len(terms) > 0:
        patterns = [text.upper() for text in terms]
        print(label, patterns)

print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')

Pipeline Components
tagger | parser | ner | entity_ruler | merge_entities
processing doc
doc processed
-----
current corrections
-----
PERSON ['USAMA BIN MUHAMMAD BIN LADIN']
NORP ['AHLUL-SUNNAH', 'INFIDEL', 'KUFFAR', 'KAFIROON', 'KAFEROON', 'MUSLIM', "DA'EES", 'ULAMA', 'AFGHAN', 'AFGHANS', 'AFGHANS']
FAC ['MAKKA', "KA'BA", 'CAPITOL', 'GUADALCANAL', 'THE WORLD TRADE CENTER', 'THE TREATY ROOM OF THE WHITE HOUSE']
ORG ['BANI QURAYDAH', 'TALIBAN', 'AL QAEDA', 'EGYPTIAN ISLAMIC JIHAD', 'ISLAMIC MOVEMENT OF UZBEKISTAN', 'FBI', 'REPUBLICANS', 'DEMOCRATS', 'MAFIA', 'CRUSADERS', 'MUJAHIDEEN', 'MUJAHIDIN', 'HALLIBURTON', 'JAISH-I-MOHAMMED', 'UMMAH', 'QURAYSH', "BANI QAINUQA'"]
GPE ['NATO', 'ARABIAN PENINSULA', 'LAND OF THE TWO HOLY PLACES', 'COUNTRY OF THE TWO HOLY PLACES', 'LAND OF THE TWO HOLY MOSQUESCOUNTRY OF THE TWO HOLY MOSQUES', 'QANA', 'ASSAM', 'ERITHRIA', 'CHECHNIA', 'MAKKA', 'MAKKAH', 'QUNDUZ', 'MAZUR-E-SHARIF', 'RAFAH']
LOC ['DAR AL-ISLAM', 'KABAL', 'IWO JIMA', 'GROUND ZERO', 'WORLD'

## Review Each Sentence to Check for Corrections

Iterate through each sentence to review the named entities.

Check the named entity against the wikipedia entry.

Correct as required.

In [None]:
import wikipediaapi
from spacy import displacy
import pandas as pd
import os

def get_wikisummary(token):

    wiki_wiki = wikipediaapi.Wikipedia('en')
    page_py = wiki_wiki.page(token)

    if page_py.exists():
        return (page_py.title, " ".join(str(nlp(page_py.summary, disable = ['tokenizer', 'ner']).sents.__next__()).split()))
    else:
        return ('no wiki reference', 'no wiki reference')


filepath = os.getcwd()

if input("Restart from fresh (y/n): ").lower() == 'n':
    if input("Bush or bin Laden").lower() == 'bush':
        filename = 'bush_entitycorrections.json'
    if input("Bush or bin Laden").lower() == 'bin laden':
        filename = 'binladen_entitycorrections.json'

    with open(os.path.join(filepath, filename), 'r') as fp:
        corrections_dict = json.load(fp)
        
    with open(os.path.join(filepath, "seen_tokens.json"), 'r') as fp:
        seen_tokens = {key for key in json.load(fp)}

else:
    corrections_dict = dict()
    seen_tokens = set()
    
# iterate through each orator
for record in records:
    
    # get the text for each orator
    filepath = os.path.join(record["filepath"], 'fulltext.txt')
    with open(filepath, 'r') as t:
            text =  t.read()

    #iterate through the spaCy doc object to evaluate each token
    for token in nlp(text):
        
        entries_dict = dict()

        if token.pos_ == "PROPN" or token.ent_type_:

            seen_tokens.add(token.text)

            with open(os.path.join(os.getcwd(), "seen_tokens.json"), "wb") as f:
                    f.write(json.dumps(dict.fromkeys(seen_tokens)).encode("utf-8"))

            wikientry = get_wikisummary(token.text)
            entries_dict[token.text] = [token.ent_type_, wikientry[0], wikientry[1]]
            entries_dict['sentence'] = ['', '', token.sent]
            displacy.render(token.sent, style = 'ent')
            pd.set_option('display.max_colwidth', -1)

            display(pd.DataFrame.from_dict(entries_dict, orient='index', columns = ['ent_type_', 'wiki_title', 'summary'])
                .style.set_properties(**{'text-align': 'left'})
                .set_table_styles([dict(selector='th', props=[('text-align', 'left')])]))

            if input('correct y/n ').lower() == 'n':
                corrections_dict[token.text] = {
                    'original ent_type_' : token.ent_type_, 
                    'wiki_title': wikientry[0], 
                    'wiki_summary' : wikientry[1],
                    'correction' : input('correct type')
                }

                ### check wiki entry and correct with manual entry if required

                answer = 'n'
                while answer == 'n':
                    display(pd.DataFrame.from_dict(corrections_dict[token.text], orient = "index"))

                    answer = input('correct wiki entry? (y/n)').lower()

                    if answer != 'n':
                        break

                    corrections_dict[token.text] = {
                        'original ent_type_' : token.ent_type_, 
                        'wiki_title': input("wiki_title: "), 
                        'wiki_summary' : input("wiki_summary: "),
                        'correction' : input("correct type: ")
                    }

                with open(os.path.join(filapth, "binladen_entitycorrections.json"), "wb") as f:
                    f.write(json.dumps(corrections_dict).encode("utf-8"))

print('complete')

## Create PDF Report for Each Orator

In [None]:
import json
import pandas as pd
from jinja2 import Environment, FileSystemLoader
from weasyprint import HTML

filepath = "C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Experiment 2 - Testing Named Entity Recognition in the spaCy models/"

with open(os.path.join(filepath, "binladen_entitycorrections.json"), 'r') as fp:
    questions = json.load(fp)

env = Environment(loader=FileSystemLoader(searchpath=filepath))
template = env.get_template('myreport.html')
  
table = pd.DataFrame.from_dict(questions).T

template_vars = {"title" : "bin Laden Entity Corrections",
                 "islamic_terms": table.to_html()}
    
html_out = template.render(template_vars)
HTML(string=html_out).write_pdf(os.path.join(filepath, "binladen_entitycorrections.pdf"), stylesheets=[os.path.join(filepath, "style.css")])    
    
pd.set_option('expand_frame_repr', False)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)
    
display(pd.DataFrame.from_dict(questions).T
        .style.set_properties(**{'text-align': 'left'})
        .set_table_styles([dict(selector='th', props=[('text-align', 'left')])]))

print(f'completed at {str(datetime.datetime.now())}') #1220