# Named Entity Recognition on AI Wiki text

Author: Crystal

Major package used: Stanza 

In [3]:
import pandas as pd
import numpy as np
import json
import glob
import time


# web scraping
import requests as r
from bs4 import BeautifulSoup
import re
import bs4 as bs
import urllib

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# named entity recognition
import stanza
stanza.download('en') # download English model

#spacy
import spacy
import nltk
from nltk.corpus import stopwords
#spacy.load("en_core_web_sm")

#visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings
#warnings.simplefilter('always')
warnings.filterwarnings("ignore", category=DeprecationWarning)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.1.json:   0%|   …

2021-06-27 17:27:21 INFO: Downloading default packages for language: en (English)...
2021-06-27 17:27:22 INFO: File exists: /home/zz3hs/stanza_resources/en/default.zip.
2021-06-27 17:27:27 INFO: Finished downloading models and saved to /home/zz3hs/stanza_resources.


## Read in AI Wiki text

In [5]:
# Load the AI Wiki article text into a single string.
# The encoding is given explicitly because the text contains non-ASCII
# characters (e.g. "Karel Čapek", "1982–1992") and the platform default
# encoding is not guaranteed to be UTF-8.
# NOTE(review): assumes the file was written as UTF-8 — confirm against the scraper.
with open("/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text.txt", encoding="utf-8") as f:
    contents = f.read()

In [6]:
# Report the size of the text and preview both ends of it.
# - The length is printed explicitly: a bare `len(contents)` that is not the
#   last expression of a cell is never displayed.
# - The head slice starts at 0; the previous `contents[1:1000]` silently
#   dropped the first character of the text.
# - The tail slice is relative to the end of the string instead of the
#   hard-coded offsets 61100:61759, so it still shows the end of the text if
#   the file changes length (the preview stays 659 characters long).
print("Number of characters:", len(contents))
print("First paragraph:", contents[:1000])

print("Last paragraph:", contents[-659:])

First paragraph: oneNoneArtificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals, which involves consciousness and emotionality. The distinction between the former and the latter categories is often revealed by the acronym chosen. 'Strong' AI is usually labelled as artificial general intelligence (AGI) while attempts to emulate 'natural' intelligence have been called artificial biological intelligence (ABI). Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of achieving its goals.  Colloquially, the term "artificial intelligence" is often used to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving". 
As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition

In [7]:
# Sanity check: the file was read as one Python string.
type(contents)

str

## Named entity recognition

In [8]:
# Initialize the English neural pipeline with only the two processors used in
# this notebook: tokenization and named entity recognition.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,ner",
)

2021-06-27 17:32:00 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2021-06-27 17:32:00 INFO: Use device: cpu
2021-06-27 17:32:00 INFO: Loading: tokenize
2021-06-27 17:32:00 INFO: Loading: ner
2021-06-27 17:32:01 INFO: Done loading processors!


In [134]:
# Run the pipeline over the full article text.
# The resulting Document will contain a list of Sentences, and the Sentences
# will contain lists of Tokens.
doc = nlp(contents)

In [135]:
# Extract the named entities found across the whole document.
entities = doc.entities

In [136]:
# Number of entity mentions extracted from the document.
len(entities)

395

In [137]:
# Check which container type holds the extracted entities.
type(entities)

list

In [138]:
# Inspect the first entity to see which fields each one carries.
entities[0]

{
  "text": "Tesler",
  "type": "PERSON",
  "start_char": 1040,
  "end_char": 1046
}

In [139]:
# Build the entity table with one row per extracted entity.
# The records are collected first and the DataFrame is constructed once:
# growing a frame row by row with DataFrame.append copies the whole frame on
# every iteration (quadratic), and DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
entity_columns = ["text", "type", "start_char", "end_char"]
df = pd.DataFrame(
    [
        {
            "text": entity.text,
            "type": entity.type,
            "start_char": entity.start_char,
            "end_char": entity.end_char,
        }
        for entity in entities
    ],
    # Explicit columns keep the schema stable even when no entities are found,
    # so later cells that select df["type"] / df["text"] do not raise KeyError.
    columns=entity_columns,
)

### Save named entity recognition results on the AI Wiki text as a pandas DataFrame

In [140]:
#df.to_csv(r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text_entity.csv', index = True) #export csv

In [9]:
# De-duplicate on the entity string: only the first row for each distinct
# "text" value is kept, whatever its type or character offsets.
df_unique = df.drop_duplicates(subset=["text"], keep="first")
len(df_unique)

NameError: name 'df' is not defined

In [163]:
# Count the de-duplicated entities per NER type, most frequent type first.
df_unique.loc[:, "type"].value_counts()

ORG            69
DATE           58
PERSON         58
WORK_OF_ART    29
CARDINAL       17
GPE            13
PRODUCT        10
LOC             6
ORDINAL         5
PERCENT         5
NORP            4
MONEY           3
TIME            2
QUANTITY        1
LANGUAGE        1
Name: type, dtype: int64

##### Named entities that we are not considering:
- Cardinal: plain numbers, which are not the kind of entity we care about (one that could be linked to another Wiki page)
- Product, such as Skype, Kinect, Xbox 360
- Percent
- Ordinal
- Money
- Time
- Quantity
- Language

#### NER: Organization

In [2]:
# Unique entity strings tagged ORG (organizations), in order of first appearance in the table.
org_ls = df_unique.loc[df_unique["type"] == "ORG", "text"].tolist()
org_ls

NameError: name 'df_unique' is not defined

#### NER: Date

In [158]:
# Unique entity strings tagged DATE, in order of first appearance in the table.
date_ls = df_unique.loc[df_unique["type"] == "DATE", "text"].tolist()
date_ls

['1955',
 '2015',
 'the twenty-first century',
 '1943',
 '1956',
 '1954',
 '1959',
 'the middle of the 1960s',
 'twenty years',
 '1974',
 'next few years',
 'the early 1980s',
 '1985',
 '1987',
 'the 1980s',
 '1989',
 'the late 1990s',
 'early 21st century',
 '11 May 1997',
 '2011',
 '2012',
 'March 2016',
 'the 2017',
 'two years',
 'year',
 '2017',
 '2016',
 '2020',
 '10,000 days',
 'the late 1980s',
 '1990s',
 '2019',
 '1988',
 'millions of years',
 '1984',
 '1982–1992',
 '2010s',
 'the 1940s and 1950s',
 '1960',
 '1980s',
 '1960s',
 'the 1960s and the 1970s',
 '1970',
 '1950s',
 'election year',
 '2005',
 'the next few hundred years',
 '2010',
 'the year 2029',
 '1863',
 '1998',
 'February 2020',
 'January 2015',
 '2001',
 '1968',
 '1999',
 '1951',
 '1986']

#### NER: Person

In [160]:
# Unique entity strings tagged PERSON, in order of first appearance in the table.
person_ls = df_unique.loc[df_unique["type"] == "PERSON", "text"].tolist()
person_ls

['Tesler',
 'Alpha',
 'Karel Čapek',
 'Turing',
 'John McCarthy',
 'Norbert Wiener',
 'Marvin Minsky',
 'James Lighthill',
 'A. Mead',
 'Mohammed Ismail',
 'Moore',
 'Garry Kasparov',
 'Watson',
 'Brad Rutter',
 'Ken Jennings',
 'Ke Jie',
 "Deep Blue's",
 'Murray Campbell',
 'Jack Clark',
 'Clark',
 'John Haugeland',
 'Roger Schank',
 'David Rumelhart',
 'Markov',
 'Joseph Weizenbaum',
 'Weizenbaum',
 'Wendell Wallach',
 'Wallach',
 'Charles T. Rubin',
 'David Chalmers',
 'Jerry Fodor',
 'Hilary Putnam',
 'John Searle',
 'Searle',
 'Vernor Vinge',
 'Ray Kurzweil',
 'Kevin Warwick',
 'Robert Ettinger',
 "Samuel Butler's",
 'George Dyson',
 'Pricewaterhouse',
 'Michael Osborne',
 'Carl Benedikt Frey',
 'Martin Ford',
 "Andrew Yang's",
 'Irakli Beridze',
 'Stephen Hawking',
 'Bill Gates',
 'Yuval Noah Harari',
 'Elon Musk',
 'Hawking',
 'Mark Hurd',
 'Mark Zuckerberg',
 'Isaac Asimov',
 'Asimov',
 'George Lucas',
 'Philip K. Dick',
 'Dick']

#### NER: WORK_OF_ART

In [167]:
# Unique entity strings tagged WORK_OF_ART, in order of first appearance in the table.
work_of_art_ls = df_unique.loc[df_unique["type"] == "WORK_OF_ART", "text"].tolist()
work_of_art_ls

["Mary Shelley's Frankenstein",
 'Turing-complete "',
 'Analog VLSI Implementation',
 'Future of Go Summit',
 'Moral Machines  For Wallach',
 'part of the research landscape of artificial intelligence as guided by its two central questions which he identifies as "Does Humanity Want Computers Making Moral Decisions"',
 'Symposium on Machine Ethics',
 'Ethics',
 'the AAAI Fall 2005 Symposium on Machine Ethics',
 'I think',
 'Plug & Pray',
 'Star Trek Next Generation, with the character of Commander Data',
 'Edward Fredkin argues that "artificial intelligence is the next stage in evolution"',
 'Darwin among the Machines"',
 'book of the same name',
 'Nick Bostrom',
 'Human Compatible, AI researcher Stuart J. Russell',
 'A Space Odyssey',
 'The Terminator',
 'The Matrix',
 'Gort from The Day the Earth Stood Still',
 'Bishop from Aliens',
 'the Three Laws of Robotics',
 'Ghost in the Shell',
 "Hajime Sorayama's Sexy Robots",
 'Japan depicting the actual organic human form with lifelike musc

#### NER: GPE

In [170]:
# Unique entity strings tagged GPE, in order of first appearance in the table.
gpe_ls = df_unique.loc[df_unique["type"] == "GPE", "text"].tolist()
gpe_ls

['U.S.',
 'US',
 'Japan',
 'U.S',
 'China',
 'Denver',
 'England',
 'Edinburgh',
 'the United States',
 'Russia',
 'the United Kingdom',
 'Republic',
 'Bostrom']

#### NER: LOC

In [176]:
# Unique entity strings tagged LOC, in order of first appearance in the table.
loc_ls = df_unique.loc[df_unique["type"] == "LOC", "text"].tolist()
loc_ls

['East', 'San Francisco', 'West', 'Europe', 'Rodney Brooks', 'Earth']

#### NER: NORP

In [174]:
# Unique entity strings tagged NORP, in order of first appearance in the table.
norp_ls = df_unique.loc[df_unique["type"] == "NORP", "text"].tolist()
norp_ls

['British', 'Bayesian', 'Americans', 'Chinese']

In [1]:
# Combine the per-type entity lists built in the cells above into one list.
# The previous expression also added `ner_ls`, which no cell in this notebook
# defines, so it raised NameError even when every cell was run in order.
ls = org_ls + date_ls + person_ls + work_of_art_ls + gpe_ls + loc_ls + norp_ls

NameError: name 'org_ls' is not defined