### Import libraries

In [51]:
import nltk
from nltk.corpus import stopwords
import csv
from nltk.tag import pos_tag #For proper noun
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
from nltk.stem import PorterStemmer
import math

### Loading and pre-processing the document

In [32]:
filename = "C:\\Users\\ayush\\Desktop\\TIET - 5th Sem 2020-21\\NLP\\COVID_19_dataset\\documents\\001.txt"
f = open(filename, 'r')
text = f.read()
f.close()

### Convert all words to lower case and removing stopWords

In [33]:
sent_tokens = nltk.sent_tokenize(text)
word_tokens = nltk.word_tokenize(text)
word_tokens_lower = [word.lower() for word in word_tokens]
stopWords = list(set(stopwords.words("english")))
word_tokens_refined = [x for x in word_tokens_lower if x not in stopWords]
print(len(word_tokens_refined))

205


In [34]:
stem = []
ps = PorterStemmer()
for word in word_tokens_refined:
    stem.append(ps.stem(word))
word_tokens_refined = stem
print(len(word_tokens_refined))

205


### Feature Extraction 
A. Sentence Features: 
 1. Cue-Phrases like example, therefore, important, according to, etc.
 2. Numerical Data like dates, transactions, year, age, etc.
 3. Sentence Length like too long or too short sentence are of little worth
 4. Sentene Position like starting and ending sentences are of more importance

B. Word Features: 
 1. Word Frequency
 2. Upper Case
 3. Proper Noun
 4. Heading Match

### A.1. Cue Phrases Calculation

In [35]:
QPhrases = ['examples','anyway','furhtermore','first','second','then','now','therefore','hence','lastly','finally','summary']
cuePhrases = {}
for sentence in sent_tokens:
    cuePhrases[sentence] = 0
    word_tokens = nltk.word_tokenize(sentence)
    for word in word_tokens:
        if word.lower() in QPhrases:
            cuePhrases[sentence] += 1
maximumFreq = max(cuePhrases.values())
for k in cuePhrases.keys():
    try:
        cuePhrases[k] = cuePhrases[k]/maximumFreq
        cuePhrases[k] = round(cuePhrases[k],3)
    except ZeroDivisionError:
        x = 0
print(cuePhrases.values())

dict_values([0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])


### A.2. Numerical Data Extraction

In [36]:
numericData = {}
for sentence in sent_tokens:
    numericData[sentence] = 0
    word_tokens = nltk.word_tokenize(sentence)
    for word in word_tokens:
        if word.isdigit():
            numericData[sentence] += 1
maximumFreq = max(numericData.values())
for k in numericData.keys():
    try:
        numericData[k] = numericData[k]/maximumFreq
        numericData[k] = round(numericData[k],3)
    except ZeroDivisionError:
        x = 0
print(numericData.values())

dict_values([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])


### A.3. Sentence Length Feature

In [37]:
# if sentence is less than 10 words, reduce 5% value of that sentence
# if sentence is 10 to 20 words, keep maimum weight i.e. 1
# if sentence is greater than 20 words, reduce 5% value of that sentence

sentLenScore = {}
for sentence in sent_tokens:
    sentLenScore[sentence] = 0
    word_tokens = nltk.word_tokenize(sentence)
    if len(word_tokens) < 10:
        sentLenScore[sentence] = 1 - 0.05*(10-len(word_tokens))
        sentLenScore[sentence] = round(sentLenScore[sentence],4)
    elif len(word_tokens) > 20:
        sentLenScore[sentence] = 1 - 0.05*(len(word_tokens)-20)
        sentLenScore[sentence] = round(sentLenScore[sentence],4)
    else:
        sentLenScore[sentence] = 1
print(sentLenScore.values())

dict_values([0.8, -2.25, 1, -0.1, 1, 1, 0.6, 0.35, 1, 0.85])


### A.4. Sentence Position

In [38]:
sentencePosition = {}
d = 1 #Sentence number
no_of_sentences = len(sent_tokens)
for i in range(no_of_sentences):
    a = 1/d
    b = 1/(no_of_sentences-d+1)
    sentencePosition[sent_tokens[d-1]] = max(a,b)
    sentencePosition[sent_tokens[d-1]] = round(sentencePosition[sent_tokens[d-1]],3)
    d = d+1
print(sentencePosition.values())

dict_values([1.0, 0.5, 0.333, 0.25, 0.2, 0.2, 0.25, 0.333, 0.5, 1.0])


### B.1 Word Frequency

In [39]:
freqTable = {}
for word in word_tokens_refined:
    if word in freqTable:
        freqTable[word] += 1
    else:
        freqTable[word] = 1
for word in freqTable.keys():
    freqTable[word] = math.log10(freqTable[word]+1)
    freqTable[word] = round(freqTable[word],3)
print(freqTable.values())

dict_values([0.602, 0.477, 0.602, 0.602, 0.903, 0.477, 0.602, 0.301, 0.301, 0.477, 0.778, 0.699, 0.699, 0.602, 1.146, 0.477, 0.778, 0.602, 0.301, 1.041, 0.301, 0.477, 0.301, 0.301, 0.301, 0.477, 0.477, 0.699, 0.477, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.477, 0.477, 0.602, 0.301, 0.477, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.477, 0.477, 0.301, 0.301, 0.301, 0.477, 0.602, 0.477, 0.477, 0.301, 0.301, 0.301, 0.301, 0.477, 0.301, 0.301, 0.477, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.477, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.477, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.477, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301, 0.301])


In [43]:
#Calculate sentence score according to word frequency
wordFreq = {}
for sentence in sent_tokens:
    wordFreq[sentence] = 0
    word_tokens = nltk.word_tokenize(sentence)
    f = []
    for word in word_tokens:
        f.append(ps.stem(word))
    for word,freq in freqTable.items():
        if word in f:
            wordFreq[sentence] += freq
print(wordFreq.values())

dict_values([12.467000000000002, 24.43299999999999, 5.801, 12.745000000000005, 6.151000000000002, 4.829000000000001, 9.337000000000002, 7.510000000000001, 6.260000000000001, 6.929000000000001])


### B.2 Upper Case Feature

In [48]:
upperCase = {}
for sentence in sent_tokens:
    upperCase[sentence] = 0
    word_tokens = nltk.word_tokenize(sentence)
    for word in word_tokens:
        if k.isupper():
            upperCase[sentence] += 1
maxFreq = max(upperCase.values())
for k in upperCase.keys():
    try:
        upperCase[k] = upperCase[k]/maxFreq
        upperCase[k] = round(upperCase[k],3)
    except ZeroDivisionError:
        x = 0
print(upperCase.values())

dict_values([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


### B.3 Proper Noun Feature

In [52]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [53]:
properNoun = {}
for sentence in sent_tokens:
    tagged_sent = pos_tag(sentence.split())
    propernouns = [word for word,pos in tagged_sent if pos=='NNP']
    properNoun[sentence] = len(propernouns)
maxFreq = max(properNoun.values())
for k in properNoun.keys():
    try:
        properNoun[k] = properNoun[k]/maxFreq
        properNoun[k] = round(properNoun[k],3)
    except ZeroDivisionError:
        x = 0
print(properNoun.values())

dict_values([0.364, 1.0, 0.0, 0.909, 0.091, 0.0, 0.0, 0.0, 0.0, 0.0])


### B.4 Heading Match Feature

In [54]:
headMatch = {}
heading = sent_tokens[0]
for sentence in sent_tokens:
    headMatch[sentence] = 0
    word_tokens = nltk.word_tokenize(sentence)
    for k in word_tokens:
        if k not in stopWords:
            k = ps.stem(k)
            if k in ps.stem(heading):
                headMatch[sentence] += 1
maxFreq = max(headMatch.values())
for k in headMatch.keys():
    try:
        headMatch[k] = headMatch[k]/maxFreq
        headMatch[k] = round(headMatch[k],3)
    except ZeroDivisionError:
        x = 0
print(headMatch.values())

dict_values([0.87, 1.0, 0.304, 0.478, 0.13, 0.174, 0.13, 0.13, 0.261, 0.217])


## Compiling all the features to get the summary

In [61]:
totalScore = {}
for k in cuePhrases.keys():
    totalScore[k] = cuePhrases[k] + numericData[k] + sentLenScore[k] + sentencePosition[k] + wordFreq[k] + upperCase[k] + properNoun[k] + headMatch[k]
print(totalScore.values())

dict_values([15.501000000000003, 24.68299999999999, 7.438000000000001, 15.282000000000005, 7.572000000000002, 6.203000000000001, 10.317000000000002, 8.323000000000002, 8.021, 9.996000000000002])


In [62]:
sumValues = 0
for sentence in totalScore:
    sumValues += totalScore[sentence]
average = sumValues/len(totalScore)

In [63]:
# Storing sentences into summary
summary = ''
for sentence in sent_tokens:
    if sentence in totalScore and totalScore[sentence] > (1.5*average):
        summary += ""+sentence
print(summary)

The fact that two coronavirus vaccines recently showed strong protection against COVID-19 bodes well for other leading programs led by AstraZeneca, Novavax, and Johnson & Johnson, Bill Gates said Tuesday.The billionaire Microsoft founder and philanthropist said it will be easier to boost manufacturing and distribute these other shots to the entire world, particularly developing nations.The vaccine space has seen a flurry of good news in recent days, marked by overwhelming success in late-stage trials by both Pfizer and Moderna.
