In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

In [2]:
import scrape
# Fetch the article text via the `scrape` module imported above (presumably a
# project-local helper -- confirm). `Beyonc%C3%A9` is the URL-encoded form of
# "Beyoncé".
page = scrape.read_page('https://en.wikipedia.org/wiki/Beyonc%C3%A9')
# Bare expression: echoes the scraped text (a str in the recorded output) as
# the cell result.
page

reading page: Beyonc%C3%A9


'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/; born September 4, 1981)[4] is an American singer, songwriter, actress, record producer and dancer. Born and raised in Houston, Texas, Beyoncé performed in various singing and dancing competitions as a child. She rose to fame in the late 1990s as lead singer of the R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the best-selling girl groups in history. Their hiatus saw Beyoncé\'s theatrical film debut in Austin Powers in Goldmember (2002) and the release of her first solo album, Dangerously in Love (2003). The album established her as a solo artist worldwide, debuting at number one on the US Billboard 200 chart and earning five Grammy Awards,[5] and featured the Billboard Hot 100 number one singles "Crazy in Love" and "Baby Boy".Following the break-up of Destiny\'s Child in 2006, she released her second solo album, B\'Day (2006), which contained her fourth number one single, "Irreplaceable", a

In [8]:
# Character offsets into `page` that select the passage analysed below.
# NOTE(review): the offsets are tied to the page text as it was scraped when
# this notebook was written -- re-check them if the article has changed.

# Multi-sentence excerpt. This assignment must stay active: the spaCy cell
# further down calls `nlp(paragraph)` and raises NameError on a fresh kernel
# if it is commented out.
paragraph = page[3782: 5188]

# First sentence of that excerpt, used for the NLTK examples.
sentence = page[3782: 3950]
sentence

'Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine "Tina" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager.'

In [9]:
# NLTK pipeline, read from the innermost call outwards:
# tokenize the sentence -> POS-tag the tokens -> chunk named entities.
ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
# print() emits the bracketed text form of the tree (see the cell output).
print(ne_tree)

(S
  (PERSON Beyoncé/NNP)
  (PERSON Giselle/NNP Knowles/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Houston/NNP)
  ,/,
  (GPE Texas/NNP)
  ,/,
  to/TO
  Celestine/VB
  ``/``
  (GPE Tina/NNP)
  ''/''
  (GPE Knowles/NNP)
  (/(
  née/JJ
  Beyincé/NNP
  )/)
  ,/,
  a/DT
  hairdresser/NN
  and/CC
  salon/NN
  owner/NN
  ,/,
  and/CC
  (PERSON Mathew/NNP Knowles/NNP)
  ,/,
  a/DT
  Xerox/NNP
  sales/NNS
  manager/NN
  ./.)


In [10]:
def preprocess(sent):
    """Tokenize a raw sentence and attach a part-of-speech tag to each token.

    Args:
        sent: Text to process; handed directly to ``nltk.word_tokenize``.

    Returns:
        The result of ``nltk.pos_tag`` on the token list. In this notebook
        that is a list of ``(token, tag)`` tuples.
    """
    return nltk.pos_tag(nltk.word_tokenize(sent))


# POS-tagged tokens of the example sentence; reused by the chunking cell below.
sent = preprocess(sentence)
# Bare expression: show the (token, tag) pairs as the cell result.
sent

[('Beyoncé', 'NNP'),
 ('Giselle', 'NNP'),
 ('Knowles', 'NNP'),
 ('was', 'VBD'),
 ('born', 'VBN'),
 ('in', 'IN'),
 ('Houston', 'NNP'),
 (',', ','),
 ('Texas', 'NNP'),
 (',', ','),
 ('to', 'TO'),
 ('Celestine', 'VB'),
 ('``', '``'),
 ('Tina', 'NNP'),
 ("''", "''"),
 ('Knowles', 'NNP'),
 ('(', '('),
 ('née', 'JJ'),
 ('Beyincé', 'NNP'),
 (')', ')'),
 (',', ','),
 ('a', 'DT'),
 ('hairdresser', 'NN'),
 ('and', 'CC'),
 ('salon', 'NN'),
 ('owner', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('Mathew', 'NNP'),
 ('Knowles', 'NNP'),
 (',', ','),
 ('a', 'DT'),
 ('Xerox', 'NNP'),
 ('sales', 'NNS'),
 ('manager', 'NN'),
 ('.', '.')]

In [11]:
# Chunk grammar: an NP is an optional determiner (DT), any number of
# adjectives (JJ), then exactly one NN-tagged noun.
# Because the rule ends in a single <NN>, adjacent nouns ("salon owner") come
# out as separate chunks, and NNP / NNS tokens are never chunked (see output).
pattern = 'NP: {<DT>?<JJ>*<NN>}'
# Build a regular-expression chunker from the grammar ...
cp = nltk.RegexpParser(pattern)
# ... and apply it to the POS-tagged sentence to get a chunk tree.
cs = cp.parse(sent)
print(cs)

(S
  Beyoncé/NNP
  Giselle/NNP
  Knowles/NNP
  was/VBD
  born/VBN
  in/IN
  Houston/NNP
  ,/,
  Texas/NNP
  ,/,
  to/TO
  Celestine/VB
  ``/``
  Tina/NNP
  ''/''
  Knowles/NNP
  (/(
  née/JJ
  Beyincé/NNP
  )/)
  ,/,
  (NP a/DT hairdresser/NN)
  and/CC
  (NP salon/NN)
  (NP owner/NN)
  ,/,
  and/CC
  Mathew/NNP
  Knowles/NNP
  ,/,
  a/DT
  Xerox/NNP
  sales/NNS
  (NP manager/NN)
  ./.)


In [12]:
# NPChunker = nltk.RegexpParser(pattern) 
# result = NPChunker.parse(sent)
# result.draw()

In [13]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree into (word, POS tag, chunk tag) triples. In the
# recorded output, B-NP starts an NP chunk, I-NP continues one, and O marks a
# token outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Beyoncé', 'NNP', 'O'),
 ('Giselle', 'NNP', 'O'),
 ('Knowles', 'NNP', 'O'),
 ('was', 'VBD', 'O'),
 ('born', 'VBN', 'O'),
 ('in', 'IN', 'O'),
 ('Houston', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Texas', 'NNP', 'O'),
 (',', ',', 'O'),
 ('to', 'TO', 'O'),
 ('Celestine', 'VB', 'O'),
 ('``', '``', 'O'),
 ('Tina', 'NNP', 'O'),
 ("''", "''", 'O'),
 ('Knowles', 'NNP', 'O'),
 ('(', '(', 'O'),
 ('née', 'JJ', 'O'),
 ('Beyincé', 'NNP', 'O'),
 (')', ')', 'O'),
 (',', ',', 'O'),
 ('a', 'DT', 'B-NP'),
 ('hairdresser', 'NN', 'I-NP'),
 ('and', 'CC', 'O'),
 ('salon', 'NN', 'B-NP'),
 ('owner', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('and', 'CC', 'O'),
 ('Mathew', 'NNP', 'O'),
 ('Knowles', 'NNP', 'O'),
 (',', ',', 'O'),
 ('a', 'DT', 'O'),
 ('Xerox', 'NNP', 'O'),
 ('sales', 'NNS', 'O'),
 ('manager', 'NN', 'B-NP'),
 ('.', '.', 'O')]


In [17]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English spaCy pipeline once; later cells reuse `nlp`.
nlp = en_core_web_sm.load()

# Run the pipeline over the multi-sentence excerpt, then list every entity
# span it found together with its label.
doc = nlp(paragraph)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('Giselle Knowles', 'PERSON'),
 ('Houston', 'GPE'),
 ('Texas', 'GPE'),
 ('Mathew Knowles', 'PERSON'),
 ('Xerox', 'GPE'),
 ('Solange', 'NORP'),
 ('Solange', 'NORP'),
 ('first', 'ORDINAL'),
 ('1', 'CARDINAL'),
 ('Mathew', 'PERSON'),
 ('African American', 'NORP'),
 ('Tina', 'GPE'),
 ('Louisiana Creole', 'ORG'),
 ('African', 'NORP'),
 ('Native American', 'NORP'),
 ('Acadian', 'NORP'),
 ('Joseph Broussard.[20', 'PERSON'),
 ("St. Mary's", 'PERSON'),
 ('Montessori School', 'PERSON'),
 ('Houston', 'GPE'),
 ('Darlette Johnson', 'PERSON'),
 ('age seven', 'DATE'),
 ("John Lennon's", 'PERSON'),
 ('1990', 'DATE'),
 ('Parker Elementary School', 'ORG'),
 ('Houston', 'GPE'),
 ('the High School for the Performing and Visual Arts[26', 'ORG'),
 ("St. John's", 'GPE'),
 ('United Methodist Church', 'ORG'),
 ('two', 'CARDINAL')]


In [18]:
# Token-level view of the same parse: each token with its IOB entity marker
# and entity type (empty string when the token is outside any entity).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(Beyoncé, 'O', ''),
 (Giselle, 'B', 'PERSON'),
 (Knowles, 'I', 'PERSON'),
 (was, 'O', ''),
 (born, 'O', ''),
 (in, 'O', ''),
 (Houston, 'B', 'GPE'),
 (,, 'O', ''),
 (Texas, 'B', 'GPE'),
 (,, 'O', ''),
 (to, 'O', ''),
 (Celestine, 'O', ''),
 (", 'O', ''),
 (Tina, 'O', ''),
 (", 'O', ''),
 (Knowles, 'O', ''),
 ((, 'O', ''),
 (née, 'O', ''),
 (Beyincé, 'O', ''),
 (), 'O', ''),
 (,, 'O', ''),
 (a, 'O', ''),
 (hairdresser, 'O', ''),
 (and, 'O', ''),
 (salon, 'O', ''),
 (owner, 'O', ''),
 (,, 'O', ''),
 (and, 'O', ''),
 (Mathew, 'B', 'PERSON'),
 (Knowles, 'I', 'PERSON'),
 (,, 'O', ''),
 (a, 'O', ''),
 (Xerox, 'B', 'GPE'),
 (sales, 'O', ''),
 (manager.[17, 'O', ''),
 (], 'O', ''),
 (Beyoncé, 'O', ''),
 ('s, 'O', ''),
 (name, 'O', ''),
 (is, 'O', ''),
 (a, 'O', ''),
 (tribute, 'O', ''),
 (to, 'O', ''),
 (her, 'O', ''),
 (mother, 'O', ''),
 ('s, 'O', ''),
 (maiden, 'O', ''),
 (name.[18, 'O', ''),
 (], 'O', ''),
 (Beyoncé, 'O', ''),
 ('s, 'O', ''),
 (younger, 'O', ''),
 (sister, 'O', ''),
 (Sol

In [20]:
# Document-level NER: run the spaCy pipeline over the whole scraped page
# (not just the excerpt) and report how many entity spans it found.
article = nlp(page)
len(article.ents)


1629

In [21]:
# Entity label of every span in the article, tallied per label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'CARDINAL': 189,
         'DATE': 298,
         'EVENT': 28,
         'FAC': 15,
         'GPE': 132,
         'LANGUAGE': 3,
         'LAW': 5,
         'LOC': 13,
         'MONEY': 31,
         'NORP': 46,
         'ORDINAL': 99,
         'ORG': 283,
         'PERCENT': 3,
         'PERSON': 324,
         'PRODUCT': 36,
         'QUANTITY': 3,
         'TIME': 9,
         'WORK_OF_ART': 112})

In [22]:
# Surface text of every entity span; show the three most frequent strings.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('first', 38), ('Jay-Z', 24), ('second', 18)]

In [35]:
# Materialise the sentence spans so they can be indexed, then show the one
# used as the running example in the following cells.
sentences = list(article.sents)
print(sentences[19])

In 2014, she became the highest-paid black musician in history and was listed among Time's 100 most influential people in the world for a second year in a row.[14] Forbes ranked her as the most powerful female in entertainment on their 2015 and 2017 lists, and in 2016, she occupied the sixth place for Time's Person of the Year.[15]


In [34]:
# Re-parse the example sentence on its own and highlight its named entities
# inline in the notebook.
displacy.render(
    nlp(str(sentences[19])),
    style='ent',
    jupyter=True,
)

In [36]:
# Re-parse the example sentence on its own and draw its dependency parse
# inline; `distance` is passed through as a displaCy layout option.
displacy.render(
    nlp(str(sentences[19])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [37]:
# (text, coarse POS, lemma) for every token of the example sentence that is
# neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[19]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('In', 'ADP', 'in'),
 ('2014', 'NUM', '2014'),
 ('highest', 'ADV', 'highest'),
 ('paid', 'VERB', 'pay'),
 ('black', 'ADJ', 'black'),
 ('musician', 'NOUN', 'musician'),
 ('history', 'NOUN', 'history'),
 ('listed', 'VERB', 'list'),
 ('Time', 'PROPN', 'time'),
 ("'s", 'PART', "'s"),
 ('100', 'NUM', '100'),
 ('influential', 'ADJ', 'influential'),
 ('people', 'NOUN', 'people'),
 ('world', 'NOUN', 'world'),
 ('second', 'ADJ', 'second'),
 ('year', 'NOUN', 'year'),
 ('row.[14', 'NOUN', 'row.[14'),
 ('Forbes', 'PROPN', 'forbes'),
 ('ranked', 'VERB', 'rank'),
 ('powerful', 'ADJ', 'powerful'),
 ('female', 'NOUN', 'female'),
 ('entertainment', 'NOUN', 'entertainment'),
 ('2015', 'NUM', '2015'),
 ('2017', 'NUM', '2017'),
 ('lists', 'NOUN', 'list'),
 ('2016', 'NUM', '2016'),
 ('occupied', 'VERB', 'occupy'),
 ('sixth', 'ADJ', 'sixth'),
 ('place', 'NOUN', 'place'),
 ('Time', 'PROPN', 'time'),
 ("'s", 'PART', "'s"),
 ('Person', 'PROPN', 'person'),
 ('Year.[15', 'PROPN', 'year.[15')]

In [38]:
# Map each entity string in the example sentence to its label; if a string
# occurs more than once, the last occurrence wins.
{str(ent): ent.label_ for ent in nlp(str(sentences[19])).ents}

{'100': 'CARDINAL',
 '2014': 'DATE',
 '2015': 'DATE',
 '2016': 'DATE',
 '2017': 'DATE',
 'Forbes': 'PERSON',
 'Time': 'ORG',
 'a second year': 'DATE',
 'sixth': 'ORDINAL'}

In [39]:
# Token-level IOB entity markers and entity types for the example sentence,
# read from the document-level parse.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[19]])

[(In, 'O', ''), (2014, 'B', 'DATE'), (,, 'O', ''), (she, 'O', ''), (became, 'O', ''), (the, 'O', ''), (highest, 'O', ''), (-, 'O', ''), (paid, 'O', ''), (black, 'O', ''), (musician, 'O', ''), (in, 'O', ''), (history, 'O', ''), (and, 'O', ''), (was, 'O', ''), (listed, 'O', ''), (among, 'O', ''), (Time, 'B', 'ORG'), ('s, 'O', ''), (100, 'B', 'CARDINAL'), (most, 'O', ''), (influential, 'O', ''), (people, 'O', ''), (in, 'O', ''), (the, 'O', ''), (world, 'O', ''), (for, 'O', ''), (a, 'B', 'DATE'), (second, 'I', 'DATE'), (year, 'I', 'DATE'), (in, 'O', ''), (a, 'O', ''), (row.[14, 'O', ''), (], 'O', ''), (Forbes, 'B', 'PERSON'), (ranked, 'O', ''), (her, 'O', ''), (as, 'O', ''), (the, 'O', ''), (most, 'O', ''), (powerful, 'O', ''), (female, 'O', ''), (in, 'O', ''), (entertainment, 'O', ''), (on, 'O', ''), (their, 'O', ''), (2015, 'B', 'DATE'), (and, 'O', ''), (2017, 'B', 'DATE'), (lists, 'O', ''), (,, 'O', ''), (and, 'O', ''), (in, 'O', ''), (2016, 'B', 'DATE'), (,, 'O', ''), (she, 'O', ''), (