Importing Libraries

In [25]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import sent_tokenize,word_tokenize
import numpy as np
import networkx as nx
import string
import re


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
# Colab-only step: mount Google Drive at /content/drive so the input file
# configured later in the notebook can be opened like a local file.
from google.colab import drive
drive.mount('/content/drive' )

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Initializing variables

In [27]:
# Path of the text file to summarize (located under the Drive mount point).
file_name = "/content/drive/MyDrive/LY Project/inputtext.txt"
# NLTK's English stop-word list; handed to the similarity computation so
# these words are left out of the word-frequency vectors.
stop_words = stopwords.words("english")
# Number of top-ranked sentences to keep in the generated summary.
maximum_lines = 5

Reading text file and extracting sentences

In [28]:

def extractSentences(file_name):
  """Read a text file and return its sentences as lists of word tokens.

  Every line of the file is treated as ending a sentence, and ". " is the
  sentence delimiter inside a line. Punctuation (any character that is
  neither a word character nor whitespace) is stripped from each sentence
  before it is split on whitespace.

  Args:
    file_name: Path of the text file to read.

  Returns:
    A list of sentences in file order; each sentence is a non-empty list of
    word strings.
  """
  # An explicit encoding keeps the result independent of the platform default.
  with open(file_name, 'r', encoding='utf-8') as fh:
    # Appending '. ' to every line makes a line break act as a sentence break.
    file_data = ''.join(line.rstrip("\n") + '. ' for line in fh)

  sentences = []
  for sentence in file_data.split(". "):
    words = re.sub(r'[^\w\s]', '', sentence).split()
    # Fragments made only of whitespace/punctuation leave no words behind.
    # Keeping them would produce all-zero frequency vectors downstream, for
    # which cosine similarity is undefined.
    if words:
      sentences.append(words)
  return sentences


Finding similarity between two sentences using cosine similarity (1 - cosine distance)

In [29]:
def findSentenceSimilarity(sentence1, sentence2, stopwords = None):
  """Return the cosine similarity of two tokenized sentences.

  Both sentences are lower-cased and turned into word-frequency vectors over
  their combined vocabulary; words listed in `stopwords` are not counted.

  Args:
    sentence1: First sentence as a list of word strings.
    sentence2: Second sentence as a list of word strings.
    stopwords: Optional collection of lower-case words to ignore. Note that
      this parameter name shadows the imported `nltk.corpus.stopwords`
      module inside the function; it is kept for caller compatibility.

  Returns:
    A float similarity score. It is 0.0 when either sentence has no countable
    words (empty, or stop words only), since the cosine is undefined for a
    zero vector and would otherwise come out as NaN.
  """
  if stopwords is None:
    ignored = set()
  elif isinstance(stopwords, (set, frozenset)):
    ignored = stopwords
  else:
    # Set membership is constant-time, unlike scanning a list per word.
    ignored = set(stopwords)

  words1 = [word.lower() for word in sentence1]
  words2 = [word.lower() for word in sentence2]

  # Map every distinct word to its position in the frequency vectors.
  position = {word: i for i, word in enumerate(dict.fromkeys(words1 + words2))}
  vector1 = np.zeros(len(position))
  vector2 = np.zeros(len(position))
  for word in words1:
    if word not in ignored:
      vector1[position[word]] += 1
  for word in words2:
    if word not in ignored:
      vector2[position[word]] += 1

  norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
  if norm_product == 0:
    # At least one vector is all zeros: nothing in common by definition.
    return 0.0
  return float(np.dot(vector1, vector2) / norm_product)

Generating Similarity Matrix

In [30]:
def generateSimilarityMatrix(sentences, stopwords):
  """Build the pairwise sentence-similarity matrix.

  Args:
    sentences: List of tokenized sentences (each a list of words).
    stopwords: Words to ignore; forwarded to findSentenceSimilarity.

  Returns:
    A square numpy array with one row and column per sentence. Entry
    [i][j] holds the similarity of sentence i to sentence j; the diagonal
    is left at zero.
  """
  count = len(sentences)
  similarity_matrix = np.zeros((count, count))
  for row, first in enumerate(sentences):
    for col, second in enumerate(sentences):
      # A sentence is never compared with itself.
      if row == col:
        continue
      similarity_matrix[row, col] = findSentenceSimilarity(first, second, stopwords)
  return similarity_matrix


Generating Summary

In [31]:
def generateSummary(file_name, max_lines):
  """Summarize a text file by returning its highest-ranked sentences.

  Sentences are read with extractSentences, compared pairwise with
  generateSimilarityMatrix, and scored by running PageRank on the graph
  built from that matrix.

  Args:
    file_name: Path of the text file to summarize.
    max_lines: Maximum number of sentences to return. Fewer are returned
      when the text contains fewer sentences.

  Returns:
    A list of sentence strings ordered from highest to lowest score.
  """
  sentences = extractSentences(file_name)
  if not sentences:
    return []
  # NOTE: the stop-word list comes from the notebook-level `stop_words`
  # variable, so the configuration cell must have been run first.
  sentence_similarity_matrix = generateSimilarityMatrix(sentences, stop_words)
  # Build a graph from the similarity matrix and score its nodes.
  sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
  scores = nx.pagerank(sentence_similarity_graph)
  # Highest score first; equal scores fall back to comparing the word lists.
  ranked_sentences = sorted(((scores[i], sentence) for i, sentence in enumerate(sentences)), reverse = True)
  # Slicing (instead of indexing 0..max_lines-1) avoids an IndexError when
  # there are fewer sentences than requested; max() keeps a negative
  # max_lines meaning "no sentences".
  top_ranked = ranked_sentences[:max(max_lines, 0)]
  return [" ".join(sentence) for _, sentence in top_ranked]


Printing the Summary

In [32]:
def printSummary(summarized_text):
  """Print the summary sentences under a "Summary:" heading.

  Args:
    summarized_text: List of sentence strings. They are joined with
      ". " followed by a newline, so each sentence starts on its own line.
  """
  summary_body = ". \n".join(summarized_text)
  # Two positional arguments: print() puts a single space between the
  # heading and the first sentence.
  print("Summary: \n", summary_body)
# Run the whole pipeline on the configured input file and print the result.
summarized_text = generateSummary(file_name, maximum_lines)
printSummary(summarized_text)

Summary: 
 Winston Churchill was an inspirational statesman writer orator and leader who led Britain to victory in the Second World War. 
However Labour leader Clement Attlees unexpected General Election victory in 1945 saw Churchill out of office and once again concentrating on public speaking. 
Churchill who also adopted the selfcreated position of Minister for Defence was active both in administrative and diplomatic functions in prosecuting the British war effort. 
The interwar years saw Churchill again cross the floor from the Liberals back to the Conservative Party. 
Following Neville Chamberlains resignation in 1940 Churchill was chosen to succeed him as Prime Minister of an allparty coalition government
