# Named Entity Recognition on AI Wiki text

Author: Crystal

Major package used: Stanza 

In [2]:
import pandas as pd
import numpy as np
import json
import glob
import time


# web scraping
import requests as r
from bs4 import BeautifulSoup
import re
import bs4 as bs
import urllib

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# named entity recognition (Stanza)
import stanza
stanza.download('en') # download the default English model package; runs every time this cell is executed

#spacy
import spacy
import nltk
from nltk.corpus import stopwords
#spacy.load("en_core_web_sm")

#visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings

# Silence DeprecationWarning messages for the rest of the session so that
# library deprecation notices do not clutter the cell outputs.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.1.json:   0%|   …

2021-06-27 17:38:42 INFO: Downloading default packages for language: en (English)...
2021-06-27 17:38:43 INFO: File exists: /home/zz3hs/stanza_resources/en/default.zip.
2021-06-27 17:38:48 INFO: Finished downloading models and saved to /home/zz3hs/stanza_resources.


## Read in named entity data

In [16]:
# Load the pre-computed named-entity table. As the displayed frame shows, each
# row carries the entity string ("text"), its label ("type") and the
# start/end character offsets, plus two leftover "Unnamed" index columns.
df = pd.read_csv(
    filepath_or_buffer=r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text_entity.csv',
)

In [17]:
# Show the loaded entity table (pandas elides the middle rows in the rendered output).
df

Unnamed: 0.1,Unnamed: 0,end_char,start_char,text,type
0,0,1046.0,1040.0,Tesler,PERSON
1,1,1642.0,1638.0,1955,DATE
2,2,1862.0,1857.0,Alpha,PERSON
3,3,1906.0,1902.0,2015,DATE
4,4,2656.0,2653.0,AGI,ORG
...,...,...,...,...,...
390,390,62078.0,62058.0,Karel Čapek's R.U.R.,WORK_OF_ART
391,391,62133.0,62090.0,A.I. Artificial Intelligence and Ex Machina,WORK_OF_ART
392,392,62191.0,62138.0,well as the novel Do Androids Dream of Electri...,WORK_OF_ART
393,393,62211.0,62197.0,Philip K. Dick,PERSON


In [4]:
# Collapse repeated mentions: keep only the first row for each distinct
# entity string, regardless of the label attached to later occurrences.
df_unique = df.drop_duplicates(subset=["text"], keep="first")

# Number of distinct entity strings that remain.
df_unique.shape[0]

281

In [5]:
# How many distinct entity strings fall under each label, most frequent first
# (descending order is the default for value_counts).
df_unique["type"].value_counts()

ORG            69
PERSON         58
DATE           58
WORK_OF_ART    29
CARDINAL       17
GPE            13
PRODUCT        10
LOC             6
ORDINAL         5
PERCENT         5
NORP            4
MONEY           3
TIME            2
LANGUAGE        1
QUANTITY        1
Name: type, dtype: int64

##### Named entities that we are not considering:
- Cardinal: just numbers, which we are not concerned with because they are unlikely to link to another Wiki page
- Product, such as Skype, Kinect, Xbox 360
- Percent
- Ordinal
- Money
- Time
- Quantity
- Language

#### NER: Organization

In [6]:
# Entity strings labelled ORG, in their original row order.
org_ls = df_unique.loc[df_unique["type"].eq("ORG"), "text"].tolist()
org_ls

['AGI',
 'AI',
 "Alan Turing's",
 'Dartmouth College',
 'Attendees Allen Newell',
 'CMU)',
 'Herbert Simon',
 'MIT)',
 'Arthur Samuel',
 'IBM',
 'the Department of Defense',
 'Congress',
 'ANN',
 'Lee Sedol',
 'AlphaZero',
 'MuZero',
 'Bloomberg',
 'Microsoft',
 'Facebook',
 'DeepMind',
 'SVM',
 'Occam',
 'Moravec',
 'Cyc',
 'Japanese Fifth Generation Computer Systems',
 "W. Grey Walter's",
 'the Johns Hopkins Beast',
 'the Teleological Society',
 'Princeton University',
 'the Ratio Club',
 'Carnegie Mellon University',
 'Stanford',
 'MIT',
 'Allen Newell',
 'Simon and Newell',
 'Stanford (SAIL',
 'Seymour Papert',
 'CMU',
 'Edward Feigenbaum)',
 'Grey',
 'GOFAI',
 'Google Search',
 'Siri',
 'Deepfakes',
 'ZDNet',
 'Computer Power and Human Reason',
 'AMA',
 'AMAs',
 "California's Institute",
 'Strong AI',
 'Aldous Huxley',
 'European Union',
 'Economist',
 'OECD',
 'Ford',
 'United Nations',
 'the Future of Life Institute',
 'ProPublica',
 'COMPAS',
 'SpaceX',
 'Peter Thiel',
 'Amazon

#### NER: Date

In [7]:
# Entity strings labelled DATE, in their original row order.
date_ls = df_unique.loc[df_unique["type"].eq("DATE"), "text"].tolist()
date_ls

['1955',
 '2015',
 'the twenty-first century',
 '1943',
 '1956',
 '1954',
 '1959',
 'the middle of the 1960s',
 'twenty years',
 '1974',
 'next few years',
 'the early 1980s',
 '1985',
 '1987',
 'the 1980s',
 '1989',
 'the late 1990s',
 'early 21st century',
 '11 May 1997',
 '2011',
 '2012',
 'March 2016',
 'the 2017',
 'two years',
 'year',
 '2017',
 '2016',
 '2020',
 '10,000 days',
 'the late 1980s',
 '1990s',
 '2019',
 '1988',
 'millions of years',
 '1984',
 '1982–1992',
 '2010s',
 'the 1940s and 1950s',
 '1960',
 '1980s',
 '1960s',
 'the 1960s and the 1970s',
 '1970',
 '1950s',
 'election year',
 '2005',
 'the next few hundred years',
 '2010',
 'the year 2029',
 '1863',
 '1998',
 'February 2020',
 'January 2015',
 '2001',
 '1968',
 '1999',
 '1951',
 '1986']

#### NER: Person

In [8]:
# Entity strings labelled PERSON, in their original row order.
person_ls = df_unique.loc[df_unique["type"].eq("PERSON"), "text"].tolist()
person_ls

['Tesler',
 'Alpha',
 'Karel Čapek',
 'Turing',
 'John McCarthy',
 'Norbert Wiener',
 'Marvin Minsky',
 'James Lighthill',
 'A. Mead',
 'Mohammed Ismail',
 'Moore',
 'Garry Kasparov',
 'Watson',
 'Brad Rutter',
 'Ken Jennings',
 'Ke Jie',
 "Deep Blue's",
 'Murray Campbell',
 'Jack Clark',
 'Clark',
 'John Haugeland',
 'Roger Schank',
 'David Rumelhart',
 'Markov',
 'Joseph Weizenbaum',
 'Weizenbaum',
 'Wendell Wallach',
 'Wallach',
 'Charles T. Rubin',
 'David Chalmers',
 'Jerry Fodor',
 'Hilary Putnam',
 'John Searle',
 'Searle',
 'Vernor Vinge',
 'Ray Kurzweil',
 'Kevin Warwick',
 'Robert Ettinger',
 "Samuel Butler's",
 'George Dyson',
 'Pricewaterhouse',
 'Michael Osborne',
 'Carl Benedikt Frey',
 'Martin Ford',
 "Andrew Yang's",
 'Irakli Beridze',
 'Stephen Hawking',
 'Bill Gates',
 'Yuval Noah Harari',
 'Elon Musk',
 'Hawking',
 'Mark Hurd',
 'Mark Zuckerberg',
 'Isaac Asimov',
 'Asimov',
 'George Lucas',
 'Philip K. Dick',
 'Dick']

#### NER: WORK_OF_ART

In [9]:
# Entity strings labelled WORK_OF_ART, in their original row order.
work_of_art_ls = df_unique.loc[df_unique["type"].eq("WORK_OF_ART"), "text"].tolist()
work_of_art_ls

["Mary Shelley's Frankenstein",
 'Turing-complete "',
 'Analog VLSI Implementation',
 'Future of Go Summit',
 'Moral Machines  For Wallach',
 'part of the research landscape of artificial intelligence as guided by its two central questions which he identifies as "Does Humanity Want Computers Making Moral Decisions"',
 'Symposium on Machine Ethics',
 'Ethics',
 'the AAAI Fall 2005 Symposium on Machine Ethics',
 'I think',
 'Plug & Pray',
 'Star Trek Next Generation, with the character of Commander Data',
 'Edward Fredkin argues that "artificial intelligence is the next stage in evolution"',
 'Darwin among the Machines"',
 'book of the same name',
 'Nick Bostrom',
 'Human Compatible, AI researcher Stuart J. Russell',
 'A Space Odyssey',
 'The Terminator',
 'The Matrix',
 'Gort from The Day the Earth Stood Still',
 'Bishop from Aliens',
 'the Three Laws of Robotics',
 'Ghost in the Shell',
 "Hajime Sorayama's Sexy Robots",
 'Japan depicting the actual organic human form with lifelike musc

#### NER: GPE

In [10]:
# Entity strings labelled GPE, in their original row order.
gpe_ls = df_unique.loc[df_unique["type"].eq("GPE"), "text"].tolist()
gpe_ls

['U.S.',
 'US',
 'Japan',
 'U.S',
 'China',
 'Denver',
 'England',
 'Edinburgh',
 'the United States',
 'Russia',
 'the United Kingdom',
 'Republic',
 'Bostrom']

#### NER: LOC

In [11]:
# Entity strings labelled LOC, in their original row order.
loc_ls = df_unique.loc[df_unique["type"].eq("LOC"), "text"].tolist()
loc_ls

['East', 'San Francisco', 'West', 'Europe', 'Rodney Brooks', 'Earth']

#### NER: NORP

In [12]:
# Entity strings labelled NORP, in their original row order.
norp_ls = df_unique.loc[df_unique["type"].eq("NORP"), "text"].tolist()
norp_ls

['British', 'Bayesian', 'Americans', 'Chinese']

## Combine all the named entities

In [14]:
# Concatenate the per-label lists into one flat list, keeping the order
# ORG, DATE, PERSON, WORK_OF_ART, GPE, LOC, NORP. The labels excluded earlier
# (CARDINAL, PRODUCT, PERCENT, ...) have no list and are therefore left out.
ls = [
    *org_ls,
    *date_ls,
    *person_ls,
    *work_of_art_ls,
    *gpe_ls,
    *loc_ls,
    *norp_ls,
]