-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathnew_summarizer_text.py
72 lines (56 loc) · 2.14 KB
/
new_summarizer_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
class FrequencySummarizer:
    """Extractive summarizer that ranks sentences by word frequency.

    Each non-stopword is scored by its count relative to the most common
    word in the text; a sentence's rank is the sum of the scores of the
    words it contains.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """Configure the frequency window and the stopword set.

        Args:
            min_cut: Words whose normalized frequency is less than or
                equal to this value are ignored.
            max_cut: Words whose normalized frequency is greater than or
                equal to this value are ignored.
        """
        self.min_cut = min_cut
        self.max_cut = max_cut
        # Punctuation tokens are treated as stopwords as well.
        self.stopwords = set(stopwords.words("english") + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """Build the normalized, filtered word-frequency table.

        Args:
            word_sent: Iterable of sentences, each an iterable of word
                tokens.

        Returns:
            A ``defaultdict(int)`` mapping each retained word to its count
            divided by the highest count. Words whose normalized frequency
            is ``>= max_cut`` or ``<= min_cut`` are left out. The table is
            empty when no non-stopword token is present.
        """
        # Count every token that is not a stopword.
        counts = defaultdict(int)
        for sentence in word_sent:
            for word in sentence:
                if word not in self.stopwords:
                    counts[word] += 1

        # Nothing to normalize: max() on an empty sequence would raise.
        if not counts:
            return counts

        highest = float(max(counts.values()))

        # Build a fresh table rather than deleting from the dict being
        # iterated, which raises RuntimeError on Python 3.
        freq = defaultdict(int)
        for word, count in counts.items():
            score = count / highest
            if score >= self.max_cut or score <= self.min_cut:
                continue
            freq[word] = score
        return freq

    def summarize(self, text, n):
        """Return up to ``n`` sentences that summarize ``text``.

        Sentences are returned in descending rank order, not in their
        original order. Sentences that contain no scored word receive no
        rank at all, so fewer than ``n`` sentences may be returned.

        Args:
            text: The text to summarize.
            n: Number of sentences requested.

        Returns:
            A list of sentences taken verbatim from ``text``.

        Raises:
            AssertionError: If ``n`` exceeds the number of sentences in
                ``text`` (the check is skipped when Python runs with -O).
        """
        sents = sent_tokenize(text)
        assert n <= len(sents), (
            "n must not exceed the number of sentences in the text"
        )
        # Tokenize on lower-cased text so counting is case-insensitive.
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)

        # Sum the word scores per sentence index.
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]

        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """Return the keys of the ``n`` highest-valued entries of ``ranking``.

        Args:
            ranking: Mapping of sentence index to score.
            n: Maximum number of keys to return.

        Returns:
            A list of keys ordered from highest to lowest score.
        """
        return nlargest(n, ranking, key=ranking.get)
# Disabled interactive demo. The triple-quoted string below is a bare
# expression statement, so none of the code inside it is executed. As
# written, it would prompt until the entered text is longer than 10
# characters and then print a two-sentence summary of it using
# FrequencySummarizer.
"""
text = "" # input text
while True:
text = input("Enter the text to summarize\n")
if(len(text)>10):
break;
else:
print("Please input the text as length at least 10")
fs = FrequencySummarizer();
for s in fs.summarize(text,2):
print("*",s)
"""
# Executed unconditionally at module level, i.e. also when the module is
# imported.
print("Done!")