# Named Entity Recognition on AI Wiki text

Author: Crystal

Major package used: Stanza 

In [10]:
import pandas as pd
import numpy as np
import json
import glob
import time


# web scrapping
import requests as r
from bs4 import BeautifulSoup
import re
import bs4 as bs
import urllib

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#name entity recognition
import stanza
stanza.download('en') # download English model

#spacy
import spacy
import nltk
from nltk.corpus import stopwords
#spacy.load("en_core_web_sm")

#visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings
#warnings.simplefilter('always')
warnings.filterwarnings("ignore", category=DeprecationWarning)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.1.json:   0%|   …

2021-06-27 19:13:20 INFO: Downloading default packages for language: en (English)...
2021-06-27 19:13:21 INFO: File exists: /home/zz3hs/stanza_resources/en/default.zip.
2021-06-27 19:13:26 INFO: Finished downloading models and saved to /home/zz3hs/stanza_resources.


## Read in AI Wiki text

In [11]:
#texts = open("/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text.txt", "r")
with open("/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text.txt") as f:
    contents = f.read()

In [12]:
len(contents)
print("First paragraph:",contents[1:1000])

print("Last paragraph:", contents[61100:61759])

First paragraph: oneNoneArtificial intelligence (AI) is intelligence demonstrated by machines, unlike the natural intelligence displayed by humans and animals, which involves consciousness and emotionality. The distinction between the former and the latter categories is often revealed by the acronym chosen. 'Strong' AI is usually labelled as artificial general intelligence (AGI) while attempts to emulate 'natural' intelligence have been called artificial biological intelligence (ABI). Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of achieving its goals.  Colloquially, the term "artificial intelligence" is often used to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving". 
As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition

In [13]:
type(contents)

str

## Name entity recognition

In [14]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner') # initialize English neural pipeline, tokenize and named entity recognition

2021-06-27 19:13:29 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2021-06-27 19:13:29 INFO: Use device: cpu
2021-06-27 19:13:29 INFO: Loading: tokenize
2021-06-27 19:13:29 INFO: Loading: ner
2021-06-27 19:13:30 INFO: Done loading processors!


In [15]:
#Document will contain a list of Sentences, and the Sentences will contain lists of Tokens. 
doc = nlp(contents)

In [16]:
#extract named entities
entities = doc.entities

In [17]:
len(entities)

395

In [18]:
type(entities)

list

In [19]:
entities[0]

{
  "text": "Tesler",
  "type": "PERSON",
  "start_char": 1040,
  "end_char": 1046
}

In [20]:
df = pd.DataFrame()
for entity in entities:
    df = df.append({
        "text": entity.text,
        'type': entity.type,
        'start_char': entity.start_char,
        'end_char': entity.end_char
        }, ignore_index = True)

### Save name entity recognition on AI Wiki text as a panda dataframe

In [None]:
#df.to_csv(r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text_entity.csv', index = True) #export csv