In [1]:
import re
import string
import numpy as np
import matplotlib.pyplot as plt
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
from operator import itemgetter
%matplotlib

Using matplotlib backend: TkAgg


In [2]:
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

# Loading Corpus

In [3]:
sentences = brown.sents('ca04')

In [4]:
len(sentences)

88

# Forming it as Sentences

In [5]:
sentlist=[' '.join(sent) for sent in sentences]

In [6]:
wholesent=""
for i in sentlist:
    wholesent+=i;
print(wholesent)



In [7]:
stop_words = set(stopwords.words("english"))
punctuations = set(string.punctuation)
pos_tags = {
            NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
            VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            ADJ: ['JJ', 'JJR', 'JJS'],
            ADV: ['RB', 'RBR', 'RBS', 'WRB']
}

# Removing stop words

In [8]:
def remove_stop_words(words):
        return [w for w in words if w not in stop_words]

# Reforming sentences using regex

In [9]:
def remove_regex(sent):
        sent = " ".join([w.lower() for w in sent])
        sent = re.sub(r"i'm", "i am", sent)
        sent = re.sub(r"he's", "he is", sent)
        sent = re.sub(r"can't", "cannot", sent)
        sent = re.sub(r"don't", "do not", sent)
        sent = re.sub(r"that's", "that is", sent)
        sent = re.sub(r"\'ve", " have", sent)
        sent = re.sub(r"\'ll", " will", sent)
        sent = re.sub(r"what's", "what is", sent)
        sent = re.sub(r"where's", "where is", sent)
        sent = re.sub(r"\'re", " are", sent)
        sent = re.sub(r"\'d", " would", sent)
        sent = re.sub(r"she's", "she is", sent)
        sent = re.sub(r"won't", "will not", sent)
        patterns = re.finditer("#[\w]*", sent)
        for pattern in patterns:
            sent = re.sub(pattern.group().strip(), "", sent)
        sent = "".join(ch for ch in sent if ch not in punctuations)
        return sent

# pos tagging for all words

In [10]:
def posTagging(words):
        tagged_words = pos_tag(words)
        pos_words = []
        for word in tagged_words:
            flag = False
            for key, value in pos_tags.items():
                if word[1] in value:
                    pos_words.append((word[0], key))
                    flag = True
                    break
            if not flag:
                pos_words.append((word[0], NOUN))
        return pos_words

# Data preprocessing

In [11]:
def preprocessData(sentence):
    sentence= remove_regex(sentence)
    words = word_tokenize(sentence)
    cleanedWords = remove_stop_words(words)
    lem = WordNetLemmatizer()
    pos_words = posTagging(words)
    cleanedWords = [lem.lemmatize(w, pos=p) for w, p in pos_words]
    return cleanedWords

# Finding sentence similarity between two sentences

In [12]:
def findSentenceSimilarity(s1, s2):
    s1 = preprocessData(s1)
    s2 = preprocessData(s2)
    allWords = list(set(s1 + s2))
    vectorForS1 = [0] * len(allWords)
    vectorForS2 = [0] * len(allWords)
    for word in s1:
        vectorForS1[allWords.index(word)] += 1
    for word in s2:
        vectorForS2[allWords.index(word)] += 1
    return 1 - cosine_distance(vectorForS1, vectorForS2)

# Creating similarity matrix

In [13]:
def createSimilarityMatrix(sentences):
    matrix = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            else:
                matrix[i][j] = findSentenceSimilarity(sentences[i], sentences[j])
    for i in range(len(matrix)):
        matrix[i] /= matrix[i].sum()
    return matrix

In [14]:
SimilarityMatrix = createSimilarityMatrix(sentences)

# Ranking sentences using PageRank Algorithm

In [15]:
def pagerank(matrix, eps=1.0e-8, d=0.85):
    N = matrix.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    last_v = np.ones((N, 1), dtype=np.float32) * np.inf
    M_hat = (d * matrix) + (((1 - d) / N) * np.ones((N, N), dtype=np.float32))
    
    while np.linalg.norm(v - last_v, 2) > eps:
        last_v = v
        v = np.matmul(M_hat, v)
    return v

In [16]:
ranks = pagerank(SimilarityMatrix)

In [17]:
ranks

array([[0.01099344],
       [0.00416014],
       [0.0171325 ],
       [0.00746358],
       [0.01551747],
       [0.02022457],
       [0.023656  ],
       [0.0228969 ],
       [0.0038337 ],
       [0.01416403],
       [0.01518317],
       [0.0087426 ],
       [0.00587603],
       [0.00823687],
       [0.01611414],
       [0.01837027],
       [0.01089948],
       [0.00061965],
       [0.00748002],
       [0.00098119],
       [0.0028046 ],
       [0.01694765],
       [0.00670167],
       [0.00512011],
       [0.01565224],
       [0.00453072],
       [0.014097  ],
       [0.01072731],
       [0.01298445],
       [0.00047974],
       [0.01410149],
       [0.0076991 ],
       [0.02060349],
       [0.01750957],
       [0.01849983],
       [0.02178995],
       [0.00515882],
       [0.00239848],
       [0.00934533],
       [0.00833235],
       [0.00449953],
       [0.01881024],
       [0.00526569],
       [0.01785498],
       [0.00588971],
       [0.0165893 ],
       [0.02293311],
       [0.016

In [18]:
sortRankWithindexes = [item[0] for item in sorted(enumerate(ranks), key=lambda item: -item[1])]

In [19]:
sortRankWithindexes

[6,
 72,
 46,
 7,
 55,
 35,
 32,
 59,
 5,
 64,
 41,
 34,
 15,
 62,
 83,
 43,
 33,
 57,
 2,
 21,
 86,
 47,
 45,
 14,
 60,
 24,
 4,
 77,
 10,
 63,
 75,
 9,
 51,
 30,
 26,
 78,
 56,
 50,
 71,
 28,
 54,
 68,
 66,
 69,
 61,
 84,
 0,
 80,
 16,
 27,
 38,
 81,
 53,
 11,
 49,
 39,
 13,
 31,
 18,
 3,
 48,
 79,
 74,
 22,
 82,
 44,
 12,
 42,
 36,
 23,
 25,
 40,
 1,
 58,
 8,
 87,
 73,
 20,
 37,
 65,
 70,
 52,
 76,
 67,
 85,
 19,
 17,
 29]

# Bar chart representing importance of all sentences

In [20]:
plt.figure(figsize=(20, 10))
plt.bar([item[0] for item in sorted(enumerate(ranks))], ranks.T[0],color='olive',width=0.8)
plt.ylabel("Page Rank / Importance")
plt.xlabel("Sentence No.")
plt.show()

In [21]:
SummaryLines = 5

# Selecting top sentences for summary

In [22]:
selectedSentences = sorted(sortRankWithindexes[:SummaryLines])

In [23]:
summary = itemgetter(*selectedSentences)(sentences)

In [24]:
finalSummary=""
for sentence in summary:
    finalSummary+=' '.join(sentence)

# Final Summary

In [25]:
finalSummary

'Explosion avoidedIn the case of Portugal , which a few weeks ago was rumored ready to walk out of the NATO Council should critics of its Angola policy prove harsh , there has been a noticeable relaxation of tension .His reply , he said , was that he agreed to the need for unity in the country now .But he did recommend that President Kennedy state clearly that if Communist countries shipped any further arms to Cuba that it would not be tolerated .The administration declared itself in favor of a neutralized Laos .'