In [1]:
from goose3 import Goose
import nltk
from nltk.stem import WordNetLemmatizer
import string 
import re
import heapq


# Goose extractor instance used below to download and parse the article.
g = Goose()
# IPython magic: restore the variable `url` from the %store database.
# NOTE(review): presumably another notebook saved it with `%store url` — confirm.
%store -r url
print(url)
# Hard-coded example URL, kept commented out as a manual fallback:
#url='https://towardsdatascience.com/a-quick-introduction-to-text-summarization-in-machine-learning-3d27ccf18a9f'
# Extract the page at `url`; later code reads article.cleaned_text and article.title.
article = g.extract(url)


# Download (or verify) every NLTK resource this notebook requests, in the
# same order as before, then load the English stop-word list used by preprocess().
for _nltk_resource in (
    'stopwords',
    'punkt',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
):
    nltk.download(_nltk_resource)

stopwords = nltk.corpus.stopwords.words('english')


def preprocess(text):
    """Normalize *text* for frequency counting.

    The text is lower-cased and split with ``nltk.word_tokenize``; tokens that
    appear in the module-level ``stopwords`` list are dropped, as are tokens
    that occur as a substring of ``string.punctuation`` (this removes single
    punctuation characters). The remaining tokens are returned joined by
    single spaces.
    """
    kept_tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        # `in string.punctuation` is a substring test on the punctuation string.
        if not (token in stopwords or token in string.punctuation)
    ]
    return ' '.join(kept_tokens)


def calculate_sentences_score(sentences, important_words, distance):
    """Score each sentence by its densest cluster of important words.

    Each sentence is tokenized with ``nltk.word_tokenize``. The position of the
    first occurrence of every important word present in the sentence is
    collected, and the sorted positions are split into groups wherever two
    neighbouring positions are ``distance`` or more tokens apart. A group
    scores ``(important words in group) ** 2 / (tokens spanned by the group)``
    and the sentence receives the score of its best group.

    Args:
        sentences: Sentence strings to score.
        important_words: Words considered significant.
        distance: Neighbouring positions whose gap is strictly smaller than
            this value belong to the same group.

    Returns:
        A list of ``(score, sentence_index)`` tuples, where ``sentence_index``
        is the position of the sentence in ``sentences``. Sentences containing
        none of the important words are omitted.
    """
    scores = []

    # enumerate() ties every score to the sentence's real position. A manually
    # incremented counter falls out of step as soon as a sentence is skipped.
    for sentence_index, raw_sentence in enumerate(sentences):
        tokens = nltk.word_tokenize(raw_sentence)

        # Map each token to the position of its first occurrence only.
        first_position = {}
        for position, token in enumerate(tokens):
            first_position.setdefault(token, position)

        word_index = sorted(
            first_position[word]
            for word in important_words
            if word in first_position
        )
        if not word_index:
            # No important word in this sentence: leave it unscored.
            continue

        # Split the sorted positions into runs of "close enough" neighbours.
        groups_list = [[word_index[0]]]
        for previous, current in zip(word_index, word_index[1:]):
            if current - previous < distance:
                groups_list[-1].append(current)
            else:
                groups_list.append([current])

        max_group_score = max(
            1.0 * len(group) ** 2 / (group[-1] - group[0] + 1)
            for group in groups_list
        )
        scores.append((max_group_score, sentence_index))

    return scores


def summarize(text, top_n_words, distance, number_of_sentences, percentage = 0):
    """Build an extractive summary of *text*.

    The text is split into sentences, each sentence is normalized with
    ``preprocess``, and the ``top_n_words`` most frequent remaining words are
    treated as important. Sentences are scored with
    ``calculate_sentences_score`` and the highest-scoring ones are selected.

    Args:
        text: Text to summarize.
        top_n_words: How many of the most frequent words count as important.
        distance: Grouping distance forwarded to ``calculate_sentences_score``.
        number_of_sentences: Number of sentences to select when ``percentage``
            is not positive.
        percentage: When greater than 0, select
            ``int(sentence_count * percentage)`` sentences instead.

    Returns:
        A tuple ``(original_sentences, best_sentences_2, sentences_score)``.
        ``best_sentences_2`` holds the selected original sentences ordered from
        highest to lowest score.
    """
    original_sentences = list(nltk.sent_tokenize(text))
    formatted_sentences = list(map(preprocess, original_sentences))

    # Word frequencies over the whole preprocessed document.
    frequency = nltk.FreqDist(
        word
        for formatted in formatted_sentences
        for word in nltk.word_tokenize(formatted)
    )
    important_words = [word for word, _count in frequency.most_common(top_n_words)]

    sentences_score = calculate_sentences_score(formatted_sentences, important_words, distance)

    if percentage > 0:
        how_many = int(len(formatted_sentences) * percentage)
    else:
        how_many = number_of_sentences
    ranked = heapq.nlargest(how_many, sentences_score)

    # Map the winning (score, index) pairs back to the untouched sentences.
    best_sentences_2 = [original_sentences[index] for _score, index in ranked]
    return original_sentences, best_sentences_2, sentences_score

# Summarize the extracted article with top_n_words=20, distance=20 and
# number_of_sentences=20 (percentage keeps its default of 0).
original_sentences, best_sentences_2, sentences_score = summarize(article.cleaned_text, 20, 20, 20)

# Join the selected sentences into one string; the bare expression below only
# echoes the value when it is the last statement of a notebook cell.
summary_2 = ' '.join(best_sentences_2)
summary_2

# IPython magic: persist summary_2 in the %store database.
%store summary_2

from IPython.core.display import HTML
import speech_recognition as sr
from gtts import gTTS
from IPython.display import Audio

def visualize(title, score_sentences, best_sentences_2):
    """Display the article in the notebook with summary sentences highlighted.

    A heading containing *title* is shown first, followed by every sentence of
    ``score_sentences`` in order; sentences that also appear in
    ``best_sentences_2`` are wrapped in ``<mark>`` tags.

    Args:
        title: Article title shown in the heading.
        score_sentences: All sentences of the article, in display order.
        best_sentences_2: Sentences selected for the summary.
    """
    # Imported locally so the function does not depend on the `display` global
    # that only exists inside an IPython session.
    import html
    from IPython.display import HTML, display

    selected = set(best_sentences_2)

    fragments = []
    for sentence in score_sentences:
        # The text comes from a scraped web page: escape it so characters such
        # as '<' and '&' are shown literally instead of being parsed as markup.
        safe_sentence = html.escape(str(sentence))
        if sentence in selected:
            fragments.append(f"<mark>{safe_sentence}</mark>")
        else:
            fragments.append(safe_sentence)
    text = ''.join(' ' + fragment for fragment in fragments)

    display(HTML(f'<h1>Summary_2- {html.escape(str(title))}</h1>'))
    display(HTML(f""" {text} """))

# Show the full article with the selected summary sentences highlighted.
visualize(article.title, original_sentences,  best_sentences_2)

# Convert the summary text to speech with gTTS and write it to disk.
# NOTE(review): gTTS is documented as producing MP3 data, so the '.wav'
# extension may not match the actual file format — confirm before relying on it.
gtts= gTTS(summary_2)
gtts.save('1.wav')
sound_file = '1.wav'
# Build an audio player for the saved file; autoplay=True requests immediate playback.
Audio(sound_file, autoplay=True)


https://towardsdatascience.com/a-quick-introduction-to-text-summarization-in-machine-learning-3d27ccf18a9f


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...


Stored 'summary_2' (str)


[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
