In [1]:
import nltk
import wikipedia
import re

from nltk import Tree
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter

In [2]:
def tokenCounts(tokens):
    """Return (token, count) pairs ordered from most to least frequent.

    Args:
        tokens: iterable of hashable tokens.

    Returns:
        A list of (token, count) tuples sorted by descending count; tokens
        with equal counts keep the order in which they were first seen.
    """
    # most_common() with no argument is a stable descending sort on the count.
    return Counter(tokens).most_common()

def extractEntities(ne_chunked):
    """Map the text of every chunked entity to its chunk label.

    Args:
        ne_chunked: iterable of chunker output nodes; only nodes that are
            ``nltk.tree.Tree`` instances are treated as entities, everything
            else is ignored.

    Returns:
        dict mapping the entity text (its leaf words joined by single
        spaces) to the subtree's label. If the same text occurs more than
        once, the label of the last occurrence wins.
    """
    subtrees = (node for node in ne_chunked if isinstance(node, nltk.tree.Tree))
    return {
        " ".join(word for word, tag in subtree.leaves()): subtree.label()
        for subtree in subtrees
    }

In [3]:
# Read the whole input text into memory. The context manager closes the
# file handle even if the read raises (the original never closed it).
with open("text.txt", "r") as file:
    text = file.read()

In [4]:
# Preview the first 500 characters of the loaded text.
print(text[:500])

The Project Gutenberg eBook, The Drunkard, by Cyril Arthur Edward Ranger
Gull


This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org





Title: The Drunkard


Author: Cyril Arthur Edward Ranger Gull



Release Date: October 22, 2012  [eBook #41139]

Language: English


***START OF THE PROJECT GUTENBERG


In [5]:
# English stop-word list from the NLTK corpus (kept as a list under its
# original name) and the word-level tokenisation of the full text.
stops = stopwords.words('english')
tokens = nltk.word_tokenize(text)

# Set copy of the stop words so each membership test is O(1) instead of a
# scan over the whole list for every token.
stop_set = set(stops)

# Drop punctuation tokens and stop words in a single pass.
# NOTE: `token not in punctuation` is a substring test against
# string.punctuation, and the stop-word comparison is case-sensitive
# (e.g. "The" is kept) -- both behaviours are preserved from the original.
filtered_tokens = [
    token for token in tokens
    if token not in punctuation and token not in stop_set
]

In [6]:
# Split the raw text into sentences with NLTK's sentence tokenizer.
sentences = nltk.sent_tokenize(text)
 
# Cell output: the number of sentences found.
len(sentences)

9325

In [7]:
# POS-tag both token streams: the full tokenisation and the one with
# punctuation and stop words removed, so the two taggings can be compared.
tagged = nltk.pos_tag(tokens)
tagged_filtered = nltk.pos_tag(filtered_tokens)

In [8]:
# Show the first ten (word, tag) pairs from the unfiltered tagging.
print("Unfiltered:",tagged[:10])

Unfiltered: [('The', 'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('eBook', 'NN'), (',', ','), ('The', 'DT'), ('Drunkard', 'NNP'), (',', ','), ('by', 'IN'), ('Cyril', 'NNP')]


In [9]:
# Show the first ten (word, tag) pairs from the filtered tagging.
print("filtered:",tagged_filtered[:10])

filtered: [('The', 'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('eBook', 'VBD'), ('The', 'DT'), ('Drunkard', 'NNP'), ('Cyril', 'NNP'), ('Arthur', 'NNP'), ('Edward', 'NNP'), ('Ranger', 'NNP')]


In [10]:
# Named-entity chunking in binary mode: every detected entity gets the
# single label 'NE' rather than a specific category.
ne_chunked = nltk.ne_chunk(tagged_filtered, binary=True)
# Collapse the chunk tree into a {entity text: label} dict.
nre = extractEntities(ne_chunked)
# Cell output: number of distinct entity strings.
len(nre)

1006

In [11]:
# Preview: the first ten (entity text, label) pairs of `nre`.
dict(list(nre.items())[:10])

{'Audience': 'NE',
 'Grand Duke Alexis': 'NE',
 'Greek Euripidean': 'NE',
 'Helena': 'NE',
 'Lord Quinton': 'NE',
 'Mr. Helzephron': 'NE',
 'Mr. Rockefeller': 'NE',
 'PROJECT': 'NE',
 'Prison': 'NE',
 'Rockefeller American': 'NE'}

In [12]:
# Same chunking with binary=False, so entities carry a category label
# (e.g. PERSON, ORGANIZATION, FACILITY) instead of the generic 'NE'.
ne_chunked_false = nltk.ne_chunk(tagged_filtered, binary=False)
# Collapse the chunk tree into a {entity text: category} dict.
nre_class = extractEntities(ne_chunked_false)
# Cell output: number of distinct entity strings.
len(nre_class)

1364

In [13]:
# Preview: the first ten (entity text, category) pairs of `nre_class`.
dict(list(nre_class.items())[:10])

{'Audience': 'ORGANIZATION',
 'Grand Duke Alexis': 'FACILITY',
 'Hands': 'PERSON',
 'Helena': 'PERSON',
 'Hood': 'PERSON',
 'III': 'ORGANIZATION',
 'PROJECT': 'ORGANIZATION',
 'Spring Partner': 'PERSON',
 'St.': 'ORGANIZATION',
 'Well': 'PERSON'}

In [14]:
def customEntities(tagged_tokens):
    """Collect capitalised noun runs from POS-tagged tokens.

    A run starts at a token whose tag begins with "NN" and is extended by
    further "NN*" tokens; tokens tagged "IN" are accepted only once a run
    has started, so phrases of the form "X of Y" stay together. Any other
    tag ends the run. A finished run is kept only if its first character is
    upper-case.

    Args:
        tagged_tokens: iterable of (word, tag) pairs, e.g. the output of
            ``nltk.pos_tag``.

    Returns:
        list of str: the kept runs, words joined by single spaces, in order
        of appearance (duplicates are not removed).
    """
    found = []
    run = []

    def flush():
        # Strip *every* trailing preposition; the original removed only one,
        # which left a dangling "IN" word after two in a row.
        while run and run[-1][1].startswith("IN"):
            run.pop()
        # Only the first character of the first word decides capitalisation.
        if run and run[0][0][:1].isupper():
            found.append(" ".join(word for word, _ in run))
        run.clear()

    for word, tag in tagged_tokens:
        if tag.startswith("NN") or (run and tag.startswith("IN")):
            run.append((word, tag))
        else:
            flush()
    # A run that reaches the end of the input was silently dropped before.
    flush()
    return found


custom_nre = customEntities(tagged)
# Cell output: number of extracted candidate entities.
len(custom_nre)

4463

In [15]:
# Preview the first ten entries of the custom entity list.
custom_nre[:10]

['Project Gutenberg eBook',
 'Drunkard',
 'Cyril Arthur Edward Ranger Gull',
 'Project Gutenberg License',
 'Title',
 'Drunkard Author',
 'Cyril Arthur Edward Ranger Gull Release Date',
 'October',
 'Language',
 'Mark C. Orton']

In [16]:
# For the first `limit` NLTK-detected entities, fetch a Wikipedia summary
# and print the first simple noun phrase of its first sentence.
limit = 10

# The chunk grammar never changes, so the parser is built once instead of
# on every iteration.
grammar = "NP: {<DT>?<JJ>*<NN>(<IN>?<DT>?<JJ>*(<NN>|<NNP>*))?}"
cp = nltk.RegexpParser(grammar)

for key in list(nre)[:limit]:
    try:
        page = wikipedia.page(key)
    except wikipedia.exceptions.DisambiguationError as e:
        # Ambiguous title: fall back to the first suggested article.
        page = wikipedia.page(e.options[0])
    except wikipedia.exceptions.PageError:
        # No such article: use a generic placeholder page so the loop
        # keeps going.
        page = wikipedia.page("Object (philosophy)")

    # Drop parenthesised asides and bracketed markers such as [1].
    # Raw string: the original literal relied on invalid escape sequences.
    summary = re.sub(r"\([^\)]*\)|\[\w*\]", "", page.summary)
    # Remove the entity text literally. Passing `key` to re.sub treated it
    # as a regex ("Mr." matched any character; "(" or "+" could raise).
    summary = summary.replace(key, "")

    summary_sentences = nltk.sent_tokenize(summary)
    if not summary_sentences:
        # Nothing left to analyse; the original raised IndexError here.
        continue

    summary = nltk.pos_tag(nltk.word_tokenize(summary_sentences[0]))
    result = cp.parse(summary)
    for child in result:
        if isinstance(child, Tree) and child.label() == 'NP':
            print("{}:".format(key), " ".join(leaf[0] for leaf in child.leaves()))
            break

Lord Quinton: moral philosopher
Grand Duke Alexis: the fifth child
Audience: An audience
Mr. Helzephron: An object




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Mr. Rockefeller: an American oil industry
PROJECT: contemporary business
Prison: A prison
Helena: the state capital
Rockefeller American: an American oil industry
Greek Euripidean: An object


In [17]:
# For the first `limit` custom-extracted entities, fetch a Wikipedia summary
# and print the first simple noun phrase of its first sentence.
limit = 10

# The chunk grammar never changes, so the parser is built once instead of
# on every iteration.
grammar = "NP: {<DT>?<JJ>*<NN>(<IN>?<DT>?<JJ>*(<NN>|<NNP>*))?}"
cp = nltk.RegexpParser(grammar)

for key in custom_nre[:limit]:
    try:
        page = wikipedia.page(key)
    except wikipedia.exceptions.DisambiguationError as e:
        # Ambiguous title: fall back to the first suggested article.
        page = wikipedia.page(e.options[0])
    except wikipedia.exceptions.PageError:
        # No such article: use a generic placeholder page so the loop
        # keeps going.
        page = wikipedia.page("Object (philosophy)")

    # Drop parenthesised asides and bracketed markers such as [1].
    # Raw string: the original literal relied on invalid escape sequences.
    summary = re.sub(r"\([^\)]*\)|\[\w*\]", "", page.summary)
    # Remove the entity text literally. Passing `key` to re.sub treated it
    # as a regex ("C." matched any character; "(" or "+" could raise).
    summary = summary.replace(key, "")

    summary_sentences = nltk.sent_tokenize(summary)
    if not summary_sentences:
        # Nothing left to analyse; the original raised IndexError here.
        continue

    summary = nltk.pos_tag(nltk.word_tokenize(summary_sentences[0]))
    result = cp.parse(summary)
    for child in result:
        if isinstance(child, Tree) and child.label() == 'NP':
            print("{}:".format(key), " ".join(leaf[0] for leaf in child.leaves()))
            break

Project Gutenberg eBook: a volunteer effort
Drunkard: Alcoholism
Cyril Arthur Edward Ranger Gull: the pen name of
Project Gutenberg License: action film
Title: A title
Drunkard Author: an American temperance play
Cyril Arthur Edward Ranger Gull Release Date: an educational film production
October: the tenth month of the year
Language: a system
Mark C. Orton: an American composer
