In [5]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


In [11]:
# Natural language processing library - NLTK in Python
import nltk
import string
from heapq import nlargest
from collections import defaultdict

In [7]:
text = "The main goal of using machine learning for text summarization is to reduce the reference text to a smaller version while keeping its knowledge alongside its meaning. Multiple text summary descriptions are provided, for example, explained the report as text generated from one or more documents that communicate relevant knowledge in the first text, and that is no longer than half of the main text and generally much more limited than this."

# to determine how many sentences the summary should contain
def no_of_sentenes(text):
    period_counts = text.count(". ")
    if period_counts > 20:
        return int(round(period_counts/10, 0))
    
    return 1


In [19]:
print(string.punctuation)

nopunc =[char for char in text if char not in string.punctuation]
nopunc = "".join(nopunc)
nopunc

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


'The main goal of using machine learning for text summarization is to reduce the reference text to a smaller version while keeping its knowledge alongside its meaning Multiple text summary descriptions are provided for example explained the report as text generated from one or more documents that communicate relevant knowledge in the first text and that is no longer than half of the main text and generally much more limited than this'

In [10]:
# common words that are often removed in text analysis tasks
nltk.download('stopwords')

processed_text = [word for word in nopunc.split() if word.lower() not in nltk.corpus.stopwords.words('english')]
processed_text

[nltk_data] Downloading package stopwords to /home/lowin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['main',
 'goal',
 'using',
 'machine',
 'learning',
 'text',
 'summarization',
 'reduce',
 'reference',
 'text',
 'smaller',
 'version',
 'keeping',
 'knowledge',
 'alongside',
 'meaning',
 'Multiple',
 'text',
 'summary',
 'descriptions',
 'provided',
 'example',
 'explained',
 'report',
 'text',
 'generated',
 'one',
 'documents',
 'communicate',
 'relevant',
 'knowledge',
 'first',
 'text',
 'longer',
 'half',
 'main',
 'text',
 'generally',
 'much',
 'limited']

In [13]:
word_freq = defaultdict(lambda: 1)
for word in processed_text:
    word_freq[word] += 1

# normalization scaling the word frequencies between 0 and 1.
max_freq = max(word_freq.values())

for word in word_freq.keys():
    word_freq[word] /= max_freq

word_freq

defaultdict(<function __main__.<lambda>()>,
            {'main': 0.42857142857142855,
             'goal': 0.2857142857142857,
             'using': 0.2857142857142857,
             'machine': 0.2857142857142857,
             'learning': 0.2857142857142857,
             'text': 1.0,
             'summarization': 0.2857142857142857,
             'reduce': 0.2857142857142857,
             'reference': 0.2857142857142857,
             'smaller': 0.2857142857142857,
             'version': 0.2857142857142857,
             'keeping': 0.2857142857142857,
             'knowledge': 0.42857142857142855,
             'alongside': 0.2857142857142857,
             'meaning': 0.2857142857142857,
             'Multiple': 0.2857142857142857,
             'summary': 0.2857142857142857,
             'descriptions': 0.2857142857142857,
             'provided': 0.2857142857142857,
             'example': 0.2857142857142857,
             'explained': 0.2857142857142857,
             'report': 0.2857142857

In [20]:
# Punkt tokenizer models for sentence tokenization
nltk.download('punkt')

sent_list = nltk.sent_tokenize(text)
print(sent_list)

sent_score = defaultdict(int)

for sent in sent_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_freq.keys():
            sent_score[sent] += word_freq[word]

sent_score

['The main goal of using machine learning for text summarization is to reduce the reference text to a smaller version while keeping its knowledge alongside its meaning.', 'Multiple text summary descriptions are provided, for example, explained the report as text generated from one or more documents that communicate relevant knowledge in the first text, and that is no longer than half of the main text and generally much more limited than this.']


[nltk_data] Downloading package punkt to /home/lowin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


defaultdict(int,
            {'The main goal of using machine learning for text summarization is to reduce the reference text to a smaller version while keeping its knowledge alongside its meaning.': 6.285714285714285,
             'Multiple text summary descriptions are provided, for example, explained the report as text generated from one or more documents that communicate relevant knowledge in the first text, and that is no longer than half of the main text and generally much more limited than this.': 9.714285714285715})

In [18]:
# extract the length number of sentences with the highest scores from the sent_score dictionary
summary_sents = nlargest(no_of_sentenes(text), sent_score, key=sent_score.get)
summary = " ".join(summary_sents)

'''
Overall it takes an input text, 
processes it by removing punctuation and common stop words, 
calculates word frequencies and sentence scores, and 
generates a summary by selecting the most important sentences based on their scores
'''

summary

'Multiple text summary descriptions are provided, for example, explained the report as text generated from one or more documents that communicate relevant knowledge in the first text, and that is no longer than half of the main text and generally much more limited than this.'