In [48]:
## Adapted from a Pluralsight course:
## 'https://app.pluralsight.com/library/courses/python-natural-language-processing/table-of-contents'

import nltk

## tokenizer models; sent_tokenize / word_tokenize in the cells below need them,
## NOTE(review): recent NLTK releases look up 'punkt_tab' instead -- confirm
## against the installed version,
nltk.download('punkt')

## stop words,
nltk.download('stopwords')

## part of speech,
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dmpas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dmpas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [4]:
## two-sentence sample fed to the tokenizing cells below,
text = 'Mary had a little lamb. Her fleece was white as snow.'

In [None]:
## tokenize

In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [7]:
## split the sample text into a list of sentences,
sentences = sent_tokenize(text)
sentences

['Mary had a little lamb.', 'Her fleece was white as snow.']

In [9]:
## one list of tokens per sentence,
terms = list(map(word_tokenize, sentences))
terms

[['Mary', 'had', 'a', 'little', 'lamb', '.'],
 ['Her', 'fleece', 'was', 'white', 'as', 'snow', '.']]

In [None]:
## remove punctuation and stopwords

In [13]:
from string import punctuation
from nltk.corpus import stopwords

In [34]:
## NLTK's English stopwords plus every punctuation character, de-duplicated,
customStopWords = list(set(stopwords.words('english') + list(punctuation)))

## flatten the per-sentence token lists, dropping stopwords and punctuation;
## the match is case-sensitive, so capitalised tokens are kept,
words = [
    token
    for tokens in terms
    for token in tokens
    if token not in customStopWords
]

words

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']

In [35]:
## Bigrams

In [36]:
from nltk.collocations import *

In [42]:
finder = BigramCollocationFinder.from_words(words)
bigram_measures = nltk.collocations.BigramAssocMeasures()

## every distinct bigram paired with its count, most frequent first,
sorted(finder.ngram_fd.items(), key=lambda pair: pair[1], reverse=True)

[(('Mary', 'little'), 1),
 (('little', 'lamb'), 1),
 (('lamb', 'Her'), 1),
 (('Her', 'fleece'), 1),
 (('fleece', 'white'), 1),
 (('white', 'snow'), 1)]

In [None]:
## different morphological forms, stemming

In [45]:
from nltk.stem.lancaster import LancasterStemmer

In [46]:
## sample sentence containing several forms of the verb 'close',
text2 = 'Mary closed on closing night when she was in the mood to close.'

## run every token through the Lancaster stemmer,
st = LancasterStemmer()
stemmedWords = list(map(st.stem, word_tokenize(text2)))
stemmedWords

['mary',
 'clos',
 'on',
 'clos',
 'night',
 'when',
 'she',
 'was',
 'in',
 'the',
 'mood',
 'to',
 'clos',
 '.']

In [50]:
## tag every token of text2 with its part of speech,
nltk.pos_tag(
    word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

In [None]:
## pull article, compute a summary,

In [56]:
import urllib.request as urllib2
from bs4 import BeautifulSoup

In [66]:
## address of the article that the cells below download and summarize,
url = 'https://www.washingtonpost.com/powerpost/senates-sluggish-start-reflects-the-republicans-narrow-policy-agenda/2019/03/20/4590acf4-4b25-11e9-9663-00ac73f49662_story.html?utm_term=.a63589b0a0e8'

In [67]:
def get_article(url):
    ''' download the page at url and return the text of its <article> elements.

    url: address of the page to fetch.
    returns: the text of every <article> tag joined with single spaces, with
        each non-ASCII character replaced by a space.
    raises: whatever urlopen raises when the request fails.
    '''
    ## the with-block closes the HTTP response once the body has been read,
    with urllib2.urlopen(url) as response:
        page = response.read().decode('utf8', 'ignore')
    soup = BeautifulSoup(page, 'lxml')

    text = ' '.join(tag.text for tag in soup.find_all('article'))

    ## blank out non-ASCII characters one at a time, so that genuine question
    ## marks in the article survive and still mark sentence boundaries,
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

In [69]:
## fetch the article; this rebinds `text`, replacing the earlier sample sentence,
text = get_article(url)
text



In [76]:
## stopwords and punctuation characters to drop,
_stopwords = set(stopwords.words('english') + list(punctuation))

## lower-case every token and keep only those that are not stopwords,
words = [
    lowered
    for lowered in map(str.lower, word_tokenize(text))
    if lowered not in _stopwords
]
words[:10]

['paul',
 'kane',
 'paul',
 'kane',
 'senior',
 'congressional',
 'correspondent',
 'columnist',
 'email',
 'bio']

In [82]:
from nltk.probability import FreqDist
from heapq import nlargest, nsmallest

In [85]:
## how often each remaining term occurs in the article,
freq = FreqDist(words)

## the ten most frequent terms, most frequent first,
sorted(freq, key=freq.get, reverse=True)[:10]

['senate',
 'mcconnell',
 'democrats',
 'trump',
 'house',
 'vote',
 'year',
 'republicans',
 'first',
 'bill']

In [86]:
from collections import defaultdict

In [92]:
ranking = defaultdict(int)

## score each sentence by the summed frequency of its known terms; a sentence
## without any known term gets no entry at all,
sents = sent_tokenize(text)
for position, sentence in enumerate(sents):
    score = sum(
        freq[token]
        for token in word_tokenize(sentence.lower())
        if token in freq)
    if score:
        ranking[position] = score

In [93]:
## per-sentence score: the summed frequency of every non-stopword term in that sentence,
ranking

defaultdict(int,
            {0: 66,
             1: 95,
             2: 70,
             3: 52,
             4: 41,
             5: 64,
             6: 42,
             7: 30,
             8: 56,
             9: 8,
             10: 77,
             11: 6,
             12: 19,
             13: 26,
             14: 11,
             15: 42,
             16: 19,
             17: 42,
             18: 39,
             19: 33,
             20: 51,
             21: 35,
             22: 29,
             23: 32,
             24: 18,
             25: 21,
             26: 58,
             27: 30,
             28: 62,
             29: 104,
             30: 76,
             31: 34,
             32: 50,
             33: 52,
             34: 39,
             35: 29,
             36: 14,
             37: 33,
             38: 57,
             39: 14})

In [98]:
## indices of the four best-scoring sentences,
sents_idx = nlargest(4, ranking, key = ranking.get)

## stitch those sentences together in their original article order,
' '.join(sents[i] for i in sorted(sents_idx))



In [103]:
## final def,
def summarize_article(text, n):
    ''' summarize the given text with the best n sentences.

    A sentence's score is the sum, over its tokens, of how often each
    non-stopword term occurs in the whole text.

    text: the document to summarize.
    n: how many sentences to keep; must not exceed the number of sentences.
    returns: the n highest-scoring sentences, in their original order.
    raises: AssertionError if n is larger than the number of sentences.
    '''
    sents = sent_tokenize(text)
    assert n <= len(sents), 'n exceeds the number of sentences in text'

    ## term frequencies over the lower-cased text, stopwords and punctuation excluded,
    _stopwords = set(stopwords.words('english') + list(punctuation))
    freq = FreqDist(
        [ word for word in word_tokenize(text.lower()) if word not in _stopwords ])

    ## score every sentence, including those with no counted term (score 0),
    ## so that n sentences are always available to choose from,
    ranking = {
        i: sum(freq[w] for w in word_tokenize(sent.lower()) if w in freq)
        for i, sent in enumerate(sents)
    }

    sents_idx = nlargest(n, ranking, key = ranking.get)
    return [ sents[i] for i in sorted(sents_idx) ]

In [104]:
summarize_article(text, 3)

['When senators return Monday night for a vote on a judicial nominee, it will be just their 50th roll call of the year, and with only a couple of other votes likely for the week, the Senate will hit the three-month mark of 2019 about 50 percent behind the pace that Majority Leader Mitch McConnell (R-Ky.) set in early 2017 after President Trump took office.',
 'This is your fault : GOP senators clash over shutdown inside private luncheon With Democrats taking over the House, McConnell lowered expectations for this year s legislative output and reset his key priority of confirming Trump s nominees to the federal courts.',