In [1]:
import nltk
# Fetch the NLTK data packages used later in this notebook: the stop-word lists
# (read via stopwords.words('english')) and the 'punkt' tokenizer data.
nltk.download('stopwords')
# Last expression of the cell, so its return value is what the cell displays.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vasil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vasil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

### Basic Idea for Step 1 ###
The sentence that contains the most keywords is treated as the most important one. Rank the sentences by the keywords they contain, then select the top n sentences as the summary.

1. Calculate the frequency score for every word in the article.
2. Calculate the frequency score for every sentence.
3. Rank the sentences by frequency score.
4. Take the top n sentences as the summary.

In [3]:
# Stop words and punctuation carry no topical signal, so they are ignored when counting.
# The corpus module is imported under a private alias because this cell rebinds the name
# `stopwords` to a set; without the alias, re-running the cell would call `.words` on
# that set and fail with AttributeError.
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = set(_stopwords_corpus.words('english') + list(punctuation))
max_cut = 0.9  # words whose normalized frequency is >= max_cut are dropped
min_cut = 0.1  # words whose normalized frequency is <= min_cut are dropped

In [5]:
def compute_frequencies(word_sent, stop_words=None, upper_cut=None, lower_cut=None):
    """
    Calculate a normalized frequency score for each keyword.

    word_sent: list of tokenized sentences, each one a list of words
    stop_words: tokens to ignore; defaults to the notebook-level `stopwords`
    upper_cut: scores >= this value are dropped; defaults to the notebook-level `max_cut`
    lower_cut: scores <= this value are dropped; defaults to the notebook-level `min_cut`
    return freq dict
    freq[w]: count of w divided by the count of the most frequent keyword

    If no word survives stop-word filtering (e.g. empty input), an empty dict
    is returned instead of failing on max() of an empty sequence.
    """
    # Fall back to the notebook configuration only when the caller gave no override,
    # so the globals are looked up at call time and only if actually needed.
    ignored = stopwords if stop_words is None else stop_words
    upper = max_cut if upper_cut is None else upper_cut
    lower = min_cut if lower_cut is None else lower_cut

    # defaultdict(int) starts every missing key at 0, so counting needs no membership test
    freq = defaultdict(int)

    # count the appearances of each word, excluding stop words
    for sentence in word_sent:
        for word in sentence:
            if word not in ignored:
                freq[word] += 1

    # nothing to normalize: max() below would raise ValueError on an empty dict
    if not freq:
        return freq

    # highest raw count, used as the normalization denominator
    m = float(max(freq.values()))

    # iterate over a snapshot of the keys because entries are deleted during the loop
    for w in list(freq.keys()):
        freq[w] = freq[w] / m
        if freq[w] >= upper or freq[w] <= lower:
            del freq[w]

    # {key: word, value: weight score}
    return freq

### Get the Summary ###
Score each sentence by adding up the frequency scores of the words it contains.

In [6]:
def summarize(text, n):
    """
    Build an extractive summary of an article.

    text: the article content as a single string
    n: number of sentences wanted in the summary
    return: list of up to n sentences, ordered from highest to lowest score
            (sentences that contain no scored keyword are never selected)
    """
    # split the article into sentences; n may not exceed their number
    sentences = sent_tokenize(text)
    assert n <= len(sentences)

    # lower-case each sentence, then split it into word tokens
    tokenized = [word_tokenize(sentence.lower()) for sentence in sentences]

    # weights: keyword -> normalized frequency score
    weights = compute_frequencies(tokenized)

    # scores: sentence index -> sum of the weights of its keywords
    scores = defaultdict(int)
    for position, tokens in enumerate(tokenized):
        for token in tokens:
            if token not in weights:
                continue
            scores[position] += weights[token]

    return [sentences[position] for position in rank(scores, n)]

In [7]:
def rank(ranking, n):
    """
    Select the n best-scoring entries of `ranking`.

    Iteration will be very slow for long articles, so the selection is
    delegated to heapq.nlargest.

    ranking: dict mapping a sentence index to its score
    n: number of entries to select
    return: list of the keys holding the n largest scores, best first
    """
    score_of = ranking.get
    best = nlargest(n, ranking, key=score_of)
    return best

In [8]:
if __name__ == '__main__':
    with open("news.txt", "r") as myfile:
        # Join lines with a space: dropping the newline outright glues the last word of
        # one line to the first word of the next, corrupting tokens and sentence breaks.
        text = myfile.read().replace('\n', ' ')
    # Summarize the article in two sentences and print them one per line
    res = summarize(text, 2)
    for sentence in res:
        print(sentence)

"Modern life is dramatically different to even 30 years ago," Prof Gray told Radio 4's Today programme, "people now drive to work and sit at work."
"The How Are You Quiz will help anyone who wants to take a few minutes to take stock and find out quickly where they can take a little action to make a big difference to their health."


![title](img/news-1.png)

![title](img/news-2.png)

![title](img/news-3.png)

![title](img/news-4.png)

Original article: [**'Middle age Health Crisis' Warning**](http://www.bbc.com/news/health-38402655)

### Problem ###
Simply adding up frequency scores favors long sentences, because every additional keyword increases a sentence's total. In the next step we need to improve the summarization algorithm to address this.