## 1. Import Libraries

In [22]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from collections import defaultdict

import bs4 as BS
import urllib.request

## 2. Fetch data from URL and extract necessary content

In [28]:
#Extract data from URL
data = urllib.request.urlopen('https://en.wikipedia.org/wiki/COVID-19_pandemic_in_India')
                              
print(type(data))

<class 'http.client.HTTPResponse'>


In [29]:
#reading the data
data_read = data.read()

print(data_read)

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>COVID-19 pandemic in India - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"38425e15-bc02-4894-a6cf-cf710687a162","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"COVID-19_pandemic_in_India","wgTitle":"COVID-19 pandemic in India","wgCurRevisionId":976126697,"wgRevisionId":976126697,"wgArticleId":63265538,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All pages needing factual verification","Wikipedia articles needing factual verification from June 2020","Wikipedia articl

In [30]:
#parsing the data after reading it 
data_parsed = BS.BeautifulSoup(data_read,'html.parser')

print(data_parsed)

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>COVID-19 pandemic in India - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"38425e15-bc02-4894-a6cf-cf710687a162","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"COVID-19_pandemic_in_India","wgTitle":"COVID-19 pandemic in India","wgCurRevisionId":976126697,"wgRevisionId":976126697,"wgArticleId":63265538,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All pages needing factual verification","Wikipedia articles needing factual verification from June 2020","Wikipedia articles nee

Extract only the paragraphs from the parsed html file

In [31]:
#Retrieve paragraphs
paragraphs = data_parsed.find_all('p')

print("Number of paragraphs : ",len(paragraphs))

#Store all the paragraphs as a single content to be summarized
content = ''

for para in paragraphs:
    content = content + para.text
    
print("Content to be summarized : ")
print(content)

Number of paragraphs :  115
Content to be summarized : 

The COVID-19 pandemic in India is part of the worldwide pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The first case of COVID-19 in India, which originated from China, was reported on 30 January 2020. India currently has the largest number of confirmed cases in Asia,[8] and has the third highest number of confirmed cases in the world after the United States and Brazil[9] with the number of total confirmed cases breaching the 100,000 mark on 19 May, 200,000 on 3 June, [10][11] and 1,000,000 confirmed cases on 17 July 2020. On 29 August 2020, India recorded the global highest spike in COVID-19 cases on a day with 78, 761 cases surpassing the previous global highest daily spike of  77, 368 cases which was recorded in the US on 17 July 2020.[12][13]
India's case fatality rate is among the lowest in the world at 2.41% as of 23 July and is steadily declining.[14]

## 3. Create a dictionary table of weighted frequencies of words

In [14]:
def create_freq_dictionary(content):
    
    #stop words
    stop_words = set(stopwords.words('english'))
    
    words = word_tokenize(content)
    
    #Instantiate stemmer 
    
    stemmer = PorterStemmer()
    
    freq_dict = defaultdict(int)
    
    weighted_freq_dict = dict()
    
    #Build the dictionary by looping through the words
    
    for word in words:
        
        if word in stop_words:
            continue
        freq_dict[word]+=1
        
    #Find the maximum value of frequencies to find the weighted values
    dict_values = freq_dict.values()
    
    max_value = max(dict_values)
    
    for word in freq_dict.keys():
        weighted_freq_dict[word] = freq_dict[word]/max_value
        
    return weighted_freq_dict
    

## 4. Calculate scores for each sentence 

In [15]:
def get_sentence_scores(sentences,weighted_word_frequency_dict):
    
    sentence_score = defaultdict(int)
    
    for sentence in sentences:
        sentence_length = 0
        
        for word in weighted_word_frequency_dict:
            if word in sentence.lower():
                sentence_length = sentence_length + 1
                
                sentence_score[sentence[:10]] += weighted_word_frequency_dict[word]
        
        #Normalize the sentence weight by the length of the sentence
        sentence_score[sentence[:10]] = sentence_score[sentence[:10]]/sentence_length
    
    return sentence_score
    

## 5. Set the threshold value = Average sentence score

In [24]:
def get_average_sentence_score(sentence_score):
    
    sum_scores = 0 
    
    for sent , score in sentence_score.items():
        sum_scores += score
        
    average_score = sum_scores/len(sentence_score)
    
    return average_score

## 6. Generate Summary

In [18]:
def get_summary_sentences(sentences,sentence_score,threshold):
    
    num_sentences = 0
    
    summary = ''
    
    for sentence in sentences:
        if sentence[:10] in sentence_score:
            if sentence_score[sentence[:10]] >= threshold:
                num_sentences = num_sentences + 1
                summary += sentence
                
    print("The number of sentences in the summary : ",num_sentences)
    
    return summary

In [26]:
#Generate the summary by calling the above utility functions
def generate_summary(content):
    
    #Build weighted dictionary of words
    weighted_word_frequency_dict = create_freq_dictionary(content)
    
    #Tokenize content into sentences
    
    sentences = sent_tokenize(content)
    
    print("The number of sentences in the original article : " , len(sentences))
    
    #Compute sentence_scores
    
    sentence_scores = get_sentence_scores(sentences,weighted_word_frequency_dict)
    
    #Get the threshold score
    
    threshold = get_average_sentence_score(sentence_scores)
    
    #Create the summary
    
    summary = get_summary_sentences(sentences,sentence_scores,threshold)
    
    return summary
    

In [32]:
#Main function 

if __name__ == '__main__':
    summary = generate_summary(content)
    
    print("Generated Summary : ")
    
    print(summary)

The number of sentences in the original article :  332
The number of sentences in the summary :  171
Generated Summary : 
[14] By mid-May 2020, six cities accounted for around half of all reported cases in the country – Mumbai, Delhi, Ahmedabad, Chennai, Pune and Kolkata.[15] As of 24 May 2020, Lakshadweep is the only region which has not reported a case.On 10 June, India's recoveries exceeded active cases for the first time.[16]
On 22 March, India observed a 14-hour voluntary public curfew at the instance of Prime Minister Narendra Modi.Apart from these, no significant rise in transmissions was observed in February.[37][38]
In March, the transmissions grew after several people with travel history to affected countries, and their contacts, tested positive.[39]
A Sikh preacher, who had a travel history to Italy and Germany, turned into a "super spreader" by attending a Sikh festival in Anandpur Sahib during 10–12 March.[40][41] Twenty-seven COVID-19 cases were traced back to him.[42] Ov