In [None]:
import re
import nltk
import string
import heapq


In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
original_text="""Artificial intelligence is human like intelligence.
                  It is the study of intelligent artificial agents.
                  Science and engineering to produce intelligent machines.
                  Solve problems and have intelligence.
                  Related to intelligent behavior.
                  Developing of reasoning machines.
                  Learn from mistakes and successes.
                  Artificial intelligence is related to reasoning in everyday situations."""
original_text = re.sub(r'\s+',' ',original_text)
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def preprocess(text):
  formatted_text = text.lower()
  tokens =[]
  for token in nltk.word_tokenize(formatted_text):
    tokens.append(token)
  tokens = [word for word in tokens if word not in stopwords and word not in string.punctuation]
  formatted_text = ' '.join(element for element in tokens)

  return formatted_text



In [None]:
formatted_text = preprocess(original_text)
formatted_text

'artificial intelligence human like intelligence study intelligent artificial agents science engineering produce intelligent machines solve problems intelligence related intelligent behavior developing reasoning machines learn mistakes successes artificial intelligence related reasoning everyday situations'

In [None]:
def calculate_sentences_score(sentences, important_words, distance):
  scores =[]
  sentence_index = 0

  for sentence in [nltk.word_tokenize(sentence) for sentence in sentences]:
    #print('--------------')
    #print(sentence)

    word_index = []
    for word in important_words:
      #print(word)
      try:
        word_index.append(sentence.index(word))
      except ValueError:
        pass

    #print(word_index)

    if len(word_index) == 0:
      continue

    groups_list = []
    group = [word_index[0]]
    i = 1
    while i < len(word_index):
      if word_index[i] - word_index[i - 1] < distance:
        group.append(word_index[i])
          #print('group',group)
      else:
        groups_list.append(group[:])
        group = [word_index[i]]
          #print('group',group)
      i += 1
    groups_list.append(group)
    #print('all groups',groups_list)

    max_group_score = 0
    for g in groups_list:
      #print(g)
      important_words_in_group = len(g)
      total_words_in_group = g[-1] - g[0] +1
      if total_words_in_group == 0:
        score = 0
      else:
        score = 1.0 * important_words_in_group** 2 / total_words_in_group
        #print('group score', score)

      if score > max_group_score:
        max_group_score = score
    scores.append((max_group_score, sentence_index))
    sentence_index += 1
  print('final scores',scores)
  return scores








In [None]:
test = ['a','b','c']
test.index('a')

0

In [None]:
def summarize(text, top_n_words, distance, number_of_sentences):
  original_sentences = [sentence for sentence in nltk.sent_tokenize(text)]
  #print(original_sentences)
  formatted_sentences = [preprocess(original_sentence) for original_sentence in original_sentences]
  #print(formatted_sentences)
  words = [ word for sentence in formatted_sentences for word in nltk.word_tokenize(sentence)]
  #print(words)
  frequency = nltk.FreqDist(words)
  #print(frequency)
  #return frequency
  top_n_words = [word[0] for word in frequency.most_common(top_n_words)]
  #print(top_n_words)
  sentences_score = calculate_sentences_score(formatted_sentences, top_n_words, distance)
  best_sentences = heapq.nlargest(number_of_sentences, sentences_score)
  #print(best_sentences)
  best_sentences = [original_sentences[i] for (score, i) in best_sentences]
  #print(best_sentences)
  return original_sentences, best_sentences, sentences_score


In [None]:
original_sentences, best_sentences, sentence_score = summarize(original_text, 5, 2, 3)

final scores [(0, 0), (0, 1), (2.0, 2), (1.0, 3), (0, 4), (1.0, 5), (1.0, 6)]


In [None]:
original_sentences

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

In [None]:
best_sentences

['Science and engineering to produce intelligent machines.',
 'Learn from mistakes and successes.',
 'Developing of reasoning machines.']

In [None]:
from IPython.core.display import HTML

In [None]:
text = ''
display(HTML(f'<h2>Summary</h2>'))
for sentence in original_sentences:
  #print(sentence)
  #text += sentence
  if sentence in best_sentences:
    text += ' ' + sentence.replace(sentence, f"<mark>{sentence}</mark>")
  else:
    text += ' ' + sentence

display(HTML(f"""{text}"""))