In [1]:
# Importing the libraries

import numpy as np
import pandas as pd
import nltk
import re
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

In [2]:
# Reading the text from a text file

# Read the whole input document as UTF-8. The context manager guarantees the
# handle is closed as soon as the text has been read, even if reading fails.
with open("TextOutput.txt", encoding="utf8") as file:
    text = file.read()

In [3]:
# Getting the sentences

# Split the raw text into sentences. Later cells enumerate this sequence and
# use each sentence's position as its identifier, so its order matters.
# NOTE(review): presumably needs the NLTK "punkt" tokenizer data installed — confirm.
sentences = sent_tokenize(text)

In [4]:
# Remove punctuations, numbers and special characters

# Replace every character that is not an ASCII letter with a space.
# regex=True must be passed explicitly: recent pandas versions treat the
# pattern as a literal string by default, in which case "[^a-zA-Z]" would
# never match and the sentences would be left uncleaned.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

In [5]:
# Make alphabets lowercase

# Lowercase each cleaned sentence; the result is a plain Python list.
lowered_sentences = []
for sentence in clean_sentences:
    lowered_sentences.append(sentence.lower())
clean_sentences = lowered_sentences

In [6]:
# Getting the stopwords (and, to etc.) so that it can be removed
# Stopwords are removed so that more important parts of the sentence can be focussed on

# English stopword list; remove_stopwords() tests tokens for membership in it.
# NOTE(review): presumably needs the NLTK "stopwords" corpus installed — confirm.
stop_words = stopwords.words('english')

In [7]:
# Function to remove stopwords

def remove_stopwords(sen):
    """Return the tokens of ``sen`` that are not stopwords, joined by spaces.

    Args:
        sen: iterable of tokens (the caller passes the result of ``str.split()``).

    Returns:
        A single string containing the surviving tokens in their original
        order, separated by one space. Empty when every token is a stopword
        or ``sen`` is empty.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [8]:
# Remove stopwords from the sentences

# Tokenise each cleaned sentence on whitespace and drop its stopwords.
filtered_sentences = []
for sentence in clean_sentences:
    filtered_sentences.append(remove_stopwords(sentence.split()))
clean_sentences = filtered_sentences

In [9]:
# Extract word vectors using glove

# Map each word to its embedding vector. Every non-blank line of the GloVe
# file is split on whitespace: the first field is the word and the remaining
# fields are parsed as float32 coefficients.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word field; skip it instead of raising IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [10]:
# Extract sentence vectors

# One vector per cleaned sentence: the sum of its word vectors divided by
# (word count + 0.001). Words missing from the embeddings contribute a zero
# vector, and an empty sentence is represented by a zero vector.
sentence_vectors = []
for cleaned in clean_sentences:
    if not cleaned:
        sentence_vec = np.zeros((100,))
    else:
        tokens = cleaned.split()
        word_vecs = [word_embeddings.get(tok, np.zeros((100,))) for tok in tokens]
        sentence_vec = sum(word_vecs) / (len(tokens) + 0.001)
    sentence_vectors.append(sentence_vec)

In [11]:
# Similarity matrix (initialised as a square matrix of 0s)
# The size of this matrix will be n x n, where n is the number of sentences

# Square matrix of zeros with one row and one column per sentence.
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))

In [12]:
# Cosine similarity of a sentence is calculated with every other sentence and stored in the sim_mat matrix
# Cosine similarity is the dot product of the two sentence vectors divided by the product of their norms
# Higher this value, stronger is the relationship between the 2 sentences (Smaller theta value)
# This is not done for the same sentence as we aren't interested in the relationship of a sentence with itself 
# Hence, it is kept as 0

# Fill sim_mat with the pairwise cosine similarity of all sentence vectors.
# A single vectorised call replaces one cosine_similarity call per (i, j)
# pair. The guard keeps an empty document working, since np.vstack rejects
# an empty list.
if len(sentence_vectors) > 0:
    # Stack the per-sentence vectors into one (n_sentences, dim) matrix.
    sentence_matrix = np.vstack(sentence_vectors)
    sim_mat[:, :] = cosine_similarity(sentence_matrix)
    # A sentence's similarity with itself is not of interest: keep it at 0.
    np.fill_diagonal(sim_mat, 0.0)

In [13]:
# Converting the matrix into a graph
# Scores is a dictionary, which contains the rank of each sentence
# Higher score(rank) means the sentence is more important

# Build a graph from the similarity matrix and run PageRank on it.
# Later cells index `scores` by sentence position and read its keys/values,
# i.e. it is used as a mapping from sentence index to score.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [14]:
# Ranking the sentences in DESCENDING order
# This will keep only the most important sentences
# Keys and values are used to order the sentences later

# Pair every sentence with its score, then order the pairs from highest to
# lowest score (ties fall back to comparing the sentence text).
ranked_sentences = [[scores[idx], sentence] for idx, sentence in enumerate(sentences)]
ranked_sentences.sort(reverse=True)
# Score-dictionary keys and values as lists, kept for the summary cells.
keys = [*scores.keys()]
values = [*scores.values()]

In [15]:
# Short summary (1/4th the size of the original text)
# Ordered sentences contain the most important sentences in the order in which they appear in the original text

# Number of sentences to keep: a quarter of the text, plus one (see the slice
# below) so that any non-empty text yields at least one summary sentence.
s_n = int((1/4) * len(ranked_sentences))
short_ranked_sentences = ranked_sentences[: s_n + 1]

# Recover the original position of every ranked sentence. Sorting the
# positions by the same [score, sentence] key with reverse=True reproduces
# the order of ranked_sentences, so short_positions[k] is the position of
# short_ranked_sentences[k]. Unlike values.index(score), this stays correct
# when several sentences share the same score, and avoids a linear scan per
# sentence.
short_positions = sorted(
    range(len(values)),
    key=lambda k: [values[k], sentences[k]],
    reverse=True,
)[: s_n + 1]

# Restore document order for the selected sentences.
short_ordered_sentences = sorted(
    ([ind, sentences[ind]] for ind in short_positions),
    key=lambda x: x[0],
)

# Write one sentence per line. The encoding matches the UTF-8 input so that
# non-ASCII characters do not depend on the platform default encoding, and
# the context manager closes the file even if a write fails.
with open("TextSumShort.txt", 'w', encoding="utf8") as f:
    for _, sentence in short_ordered_sentences:
        f.write(sentence)
        f.write("\n")

In [16]:
# Long summary (3/4th the size of the original text)

# Number of sentences to keep: three quarters of the text, plus one (see the
# slice below) so that any non-empty text yields at least one summary sentence.
l_n = int((3/4) * len(ranked_sentences))
long_ranked_sentences = ranked_sentences[: l_n + 1]

# Recover the original position of every ranked sentence. Sorting the
# positions by the same [score, sentence] key with reverse=True reproduces
# the order of ranked_sentences, so long_positions[k] is the position of
# long_ranked_sentences[k]. Unlike values.index(score), this stays correct
# when several sentences share the same score, and avoids a linear scan per
# sentence.
long_positions = sorted(
    range(len(values)),
    key=lambda k: [values[k], sentences[k]],
    reverse=True,
)[: l_n + 1]

# Restore document order for the selected sentences.
long_ordered_sentences = sorted(
    ([ind, sentences[ind]] for ind in long_positions),
    key=lambda x: x[0],
)

# Write one sentence per line. The encoding matches the UTF-8 input so that
# non-ASCII characters do not depend on the platform default encoding, and
# the context manager closes the file even if a write fails.
with open("TextSumLong.txt", 'w', encoding="utf8") as f:
    for _, sentence in long_ordered_sentences:
        f.write(sentence)
        f.write("\n")

In [17]:
# Medium summary (1/2 the size of the original text)

# Number of sentences to keep: half of the text, plus one (see the slice
# below) so that any non-empty text yields at least one summary sentence.
m_n = int((1/2) * len(ranked_sentences))
medium_ranked_sentences = ranked_sentences[: m_n + 1]

# Recover the original position of every ranked sentence. Sorting the
# positions by the same [score, sentence] key with reverse=True reproduces
# the order of ranked_sentences, so medium_positions[k] is the position of
# medium_ranked_sentences[k]. Unlike values.index(score), this stays correct
# when several sentences share the same score, and avoids a linear scan per
# sentence.
medium_positions = sorted(
    range(len(values)),
    key=lambda k: [values[k], sentences[k]],
    reverse=True,
)[: m_n + 1]

# Restore document order for the selected sentences.
medium_ordered_sentences = sorted(
    ([ind, sentences[ind]] for ind in medium_positions),
    key=lambda x: x[0],
)

# Write one sentence per line. The encoding matches the UTF-8 input so that
# non-ASCII characters do not depend on the platform default encoding, and
# the context manager closes the file even if a write fails.
with open("TextSumMed.txt", 'w', encoding="utf8") as f:
    for _, sentence in medium_ordered_sentences:
        f.write(sentence)
        f.write("\n")