In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [2]:
# Extract word vectors
# Every non-empty line of the GloVe text file is parsed as "<word> <float> <float> ...":
# the first whitespace-separated field is the token and the remaining fields are its
# embedding. The result is a dict mapping token -> float32 numpy array.
GLOVE_PATH = '/Users/subir/Downloads/glove/glove.6B.300d.txt'  # NOTE(review): machine-specific path

word_embeddings = {}
# `with` guarantees the handle is closed even if a malformed line makes np.asarray raise.
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [3]:

def read_article(file_name):
    '''
    Read a text file and return its lines as lists of space-separated tokens.

    Each line of the file is treated as one "sentence" and split on single
    spaces; line terminators are NOT stripped, so the last token of a line
    keeps its trailing newline. The final line of the file is always
    discarded.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.

    Returns
    -------
    list[list[str]]
        One token list per line, excluding the last line. Empty when the
        file has zero or one line.
    '''
    # Context manager closes the handle even if reading fails.
    with open(file_name, "r") as file:
        filedata = file.readlines()

    # NOTE(review): str.replace treats "[^a-zA-Z]" as a literal substring, not a
    # regular expression, so this only affects lines containing that exact text.
    # Kept unchanged so the tokens (and therefore the printed summary) stay the same.
    sentences = [line.replace("[^a-zA-Z]", " ").split(" ") for line in filedata]

    # The last line is dropped unconditionally; guard so an empty file yields []
    # instead of raising IndexError.
    if sentences:
        sentences.pop()
    return sentences

def sentence_similarity_glove(sent1 ,sent2, stopwords=None):
    '''
    Cosine similarity of two tokenised sentences based on averaged word vectors.

    For each sentence, the vectors of its words are looked up in the
    module-level ``word_embeddings`` dict (words without an entry contribute a
    zero vector), summed, and divided by ``word_count + 0.001`` to obtain one
    300-element sentence vector. The result is ``1 - cosine_distance`` of the
    two sentence vectors.

    Words are lower-cased before lookup and words listed in ``stopwords`` are
    ignored, matching what ``sentence_similarity`` does.

    Parameters
    ----------
    sent1, sent2 : list[str]
        Tokenised sentences.
    stopwords : iterable of str, optional
        Lower-case words to leave out of the average. ``None`` means no
        filtering.

    Returns
    -------
    float
        Cosine similarity, or 0.0 when either sentence vector is all zeros
        (empty sentence, only stop words, or no word with an embedding), which
        would otherwise produce NaN from a 0/0 division.
    '''
    if stopwords is None:
        stopwords = []

    # Must equal the length of the vectors stored in word_embeddings.
    dim = 300

    def _sentence_vector(words):
        kept = [w.lower() for w in words]
        kept = [w for w in kept if w not in stopwords]
        vectors = [word_embeddings.get(w, np.zeros((dim,))) for w in kept]
        # The 0.001 keeps the division defined when no words are left.
        return sum(vectors, np.zeros((dim,))) / (len(kept) + 0.001)

    sent1_vec = _sentence_vector(sent1)
    sent2_vec = _sentence_vector(sent2)

    # A zero vector has no direction; report "no similarity" instead of NaN.
    if not np.any(sent1_vec) or not np.any(sent2_vec):
        return 0.0

    return 1 - cosine_distance(sent1_vec, sent2_vec)
    
    

def sentence_similarity(sent1, sent2, stopwords=None):
    '''
    Cosine similarity of two tokenised sentences based on word counts.

    Both sentences are lower-cased, a shared vocabulary is built from their
    words, and each sentence is turned into a count vector over that
    vocabulary (stop words are not counted). The result is
    ``1 - cosine_distance`` of the two count vectors.

    Parameters
    ----------
    sent1, sent2 : list[str]
        Tokenised sentences.
    stopwords : iterable of str, optional
        Lower-case words to ignore when counting. ``None`` means no filtering.

    Returns
    -------
    float
        Cosine similarity, or 0.0 when either count vector is all zeros
        (empty sentence or only stop words), which would otherwise produce
        NaN from a 0/0 division.
    '''
    if stopwords is None:
        stopwords = []
    ignored = set(stopwords)

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # word -> column index; a dict lookup replaces the repeated list.index() scans.
    position = {w: i for i, w in enumerate(dict.fromkeys(sent1 + sent2))}

    # Size the vectors from the vocabulary: a fixed length of 300 raised
    # IndexError as soon as the two sentences held more than 300 distinct words.
    vector1 = [0] * len(position)
    vector2 = [0] * len(position)

    for w in sent1:
        if w in ignored:
            continue
        vector1[position[w]] += 1

    for w in sent2:
        if w in ignored:
            continue
        vector2[position[w]] += 1

    # A zero vector has no direction; report "no similarity" instead of NaN.
    if not any(vector1) or not any(vector2):
        return 0.0

    return 1 - cosine_distance(vector1, vector2)
 

In [4]:
def build_similarity_matrix(sentences, stop_words):
    '''
    Build the pairwise sentence-similarity matrix.

    Entry ``[i][j]`` holds ``sentence_similarity(sentences[i], sentences[j],
    stop_words)`` for every ordered pair with ``i != j``; the diagonal stays 0.

    Parameters
    ----------
    sentences : list
        Sentences, each in the form ``sentence_similarity`` accepts.
    stop_words : list
        Passed through unchanged to ``sentence_similarity``.

    Returns
    -------
    numpy.ndarray
        Square array of shape ``(len(sentences), len(sentences))``.
    '''
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            # A sentence is not compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(left, right, stop_words)

    return matrix

def generate_summary(file_name, top_n=5):
    '''
    Print an extractive summary of the article stored in ``file_name``.

    Steps: read the sentences with ``read_article``, build the pairwise
    similarity matrix with ``build_similarity_matrix`` (using NLTK's English
    stop-word list), turn the matrix into a graph, score the nodes with
    ``nx.pagerank``, and print the highest-scoring sentences joined by ". ".

    Parameters
    ----------
    file_name : str
        Path of the article, passed to ``read_article``.
    top_n : int, default 5
        Maximum number of sentences to print. If the article has fewer
        sentences, all of them are printed.

    Returns
    -------
    None
        The summary is written to standard output.
    '''
    stop_words = stopwords.words('english')
    sentences = read_article(file_name)

    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Highest score first; ties fall back to comparing the token lists.
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Slicing (instead of indexing range(top_n)) avoids IndexError when fewer than
    # top_n sentences exist; max() keeps a negative top_n equivalent to "none".
    summarize_text = [" ".join(sentence) for _, sentence in ranked_sentence[:max(top_n, 0)]]

    print("Summarize Text: \n \n", ". ".join(summarize_text))

# Print a summary of text_summarizer.txt made of the five top-ranked sentences.
generate_summary("text_summarizer.txt", 5)

Summarize Text: 
 
 NEW DELHI : India is not looking at evacuating its diplomats from Iran, one of the main hotspots of the Coronavirus outbreak where the death toll was threatening to cross 1,000, a person familiar with the development said Tuesday.
. At a press briefing in New Delhi when asked about news reports that as many as 250 Indians in Iran had tested positive for COVID-19, Dammu Ravi, additional secretary and coordinator for COVID-19 in the Indian foreign ministry said given the widespread prevalence of the disease in Iran “it is quite possible" that there “could" be some positive cases.
. India has so far evacuated more than 1,400 Indians from COVID-19 hotspots like the epicenter of the outbreak ie the Chinese city of Wuhan, Japan, Iran and Italy. India has recorded three deaths in the country due to SARS-CoV2 with 137 people infected with the disease. Worldwide, more than 180,000 infections have been recorded in 155 countries with more than 7,000 deaths since December.
. Me

In [5]:
# def build_similarity_matrix(sentences, stop_words):
#     similarity_matrix = np.zeros((len(sentences), len(sentences)))
#     for idx1 in range(len(sentences)):
#         for idx2 in range(len(sentences)):
#             if idx1 == idx2:
#                 continue 
#             similarity_matrix[idx1][idx2] = sentence_similarity_glove(sentences[idx1], sentences[idx2], stop_words)

#     return similarity_matrix

# def generate_summary(file_name, top_n=5):
#     stop_words = stopwords.words('english')
#     summarize_text = []
#     sentences =  read_article(file_name)
    
#     sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)
#     sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
#     scores = nx.pagerank(sentence_similarity_graph,max_iter=5000,alpha=0.1)
#     ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
#     for i in range(top_n):
#         summarize_text.append(" ".join(ranked_sentence[i][1]))
        
#     print("Summarize Text: \n\n", ". ".join(summarize_text))

# generate_summary("text_summarizer.txt", 5)

In [6]:
from summarizer import Summarizer
from textblob import TextBlob



In [7]:
# Load the whole article into one string for the summarizer cells.
# The context manager closes the handle as soon as the text is read;
# the original left the file open for the lifetime of the kernel.
with open("text_summarizer.txt", "r") as file:
    filedata = file.read()

In [8]:
# Create the Summarizer (imported from the `summarizer` package) with its default arguments.
model = Summarizer()

In [9]:
# Summarise the article with the model's default settings.
# NOTE(review): min_length presumably filters out short sentences — confirm against the library docs.
result = model(filedata, min_length=30)
# Concatenate the pieces of `result` with no separator.
# NOTE(review): if `result` is already a str this join returns it unchanged.
full = ''.join(result)
print(full)
# Sentiment polarity of the summary text, as computed by TextBlob.
testimonial = TextBlob(full)
print('\n Polarity of Article:',testimonial.sentiment.polarity)

COVID-19: India says not looking at evacuating diplomats from Iran 2 min read . Updated: 17 Mar 2020, 08:12 PM IST Elizabeth Roche
In Iran, the Coronavirus death toll is threatening to cross 1,000
Home secretary Ajay Bhalla reviews India’s preparedness to contain the COVID-19 threat along borders with Nepal, Bhutan, Bangladesh and Myanmar Topics
Coronavirus outbreak COVID-19Iran
NEW DELHI : India is not looking at evacuating its diplomats from Iran, one of the main hotspots of the Coronavirus outbreak where the death toll was threatening to cross 1,000, a person familiar with the development said Tuesday. “But I cannot confirm to you if the list which is being circulated in the social media is correct.

 Polarity of Article: 0.11856060606060605


In [24]:
# Same summarise-then-score steps as the default run, but passing algorithm='gmm'.
# NOTE(review): 'gmm' presumably selects a Gaussian-mixture clustering backend — confirm against the library docs.
# This cell overwrites `result`, `full` and `testimonial` from the default run.
result = model(filedata, min_length=30,algorithm='gmm')
full = ''.join(result)
print(full)
# Sentiment polarity of the summary text, as computed by TextBlob.
testimonial = TextBlob(full)
print('\n Polarity of Article:',testimonial.sentiment.polarity)

COVID-19: India says not looking at evacuating diplomats from Iran 2 min read . Updated: 17 Mar 2020, 08:12 PM IST Elizabeth Roche
In Iran, the Coronavirus death toll is threatening to cross 1,000
Home secretary Ajay Bhalla reviews India’s preparedness to contain the COVID-19 threat along borders with Nepal, Bhutan, Bangladesh and Myanmar Topics
Coronavirus outbreak COVID-19Iran
NEW DELHI : India is not looking at evacuating its diplomats from Iran, one of the main hotspots of the Coronavirus outbreak where the death toll was threatening to cross 1,000, a person familiar with the development said Tuesday. “But I cannot confirm to you if the list which is being circulated in the social media is correct. India has recorded three deaths in the country due to SARS-CoV2 with 137 people infected with the disease.

 Polarity of Article: 0.08376623376623375


In [17]:
# Web page downloaded and parsed by the cells below.
target_url= "https://www.geeksforgeeks.org/greedy-algorithms/"

In [18]:
import  requests

In [19]:
# Download the page. `f` is the HTTP response here; it rebinds the name that
# an earlier cell used for the GloVe file handle.
# A timeout is set because requests waits indefinitely by default, and
# raise_for_status() stops the notebook on a 4xx/5xx reply instead of letting
# an error page be parsed as if it were the article.
f = requests.get(target_url, timeout=30)
f.raise_for_status()
print(f.text)

<!DOCTYPE html>
<!--[if IE 7]>
<html class="ie ie7" lang="en-US" prefix="og: http://ogp.me/ns#">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8" lang="en-US" prefix="og: http://ogp.me/ns#">
<![endif]-->
<!--[if !(IE 7) | !(IE 8)  ]><!-->
<html lang="en-US" prefix="og: http://ogp.me/ns#" >

<!--<![endif]-->
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1"> 

<link rel="shortcut icon" href="https://www.geeksforgeeks.org/favicon.ico" type="image/x-icon" />
<link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
<meta name="theme-color" content="#0f9d58" />

<meta property="og:image" content="https://www.geeksforgeeks.org/wp-content/uploads/gfg_200X200.png">
<meta property="og:image:type" content="image/png">
<meta property="og:image:width" content="200">
<meta property="og:image:height" content="200">
<script src="https://apis.google.com/js/platform.js"></script>
<script src="//cdnjs.cloudf

In [20]:
from bs4 import BeautifulSoup
# Parse the downloaded HTML with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(f.text, 'html.parser')

In [21]:
# Display the parsed document (this renders the entire page).
soup

<!DOCTYPE html>

<!--[if IE 7]>
<html class="ie ie7" lang="en-US" prefix="og: http://ogp.me/ns#">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8" lang="en-US" prefix="og: http://ogp.me/ns#">
<![endif]-->
<!--[if !(IE 7) | !(IE 8)  ]><!-->
<html lang="en-US" prefix="og: http://ogp.me/ns#">
<!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
<link href="https://www.geeksforgeeks.org/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet"/>
<meta content="#0f9d58" name="theme-color">
<meta content="https://www.geeksforgeeks.org/wp-content/uploads/gfg_200X200.png" property="og:image"/>
<meta content="image/png" property="og:image:type"/>
<meta content="200" property="og:image:width"/>
<meta content="200" property="og:image:height"/>
<script src="https://apis.google.com/js/platform.js"></script>
<script src="//cdnjs.cloudfla

In [22]:

# Collect every text node in the document, including whitespace-only strings,
# HTML comments and <script> contents (see the output of the next cell).
# `string=` is the current name of this filter; `text=` is its legacy alias.
text = soup.find_all(string=True)


In [23]:
# Display the collected text nodes (this renders the full list).
text

['html',
 '\n',
 '[if IE 7]>\r\n<html class="ie ie7" lang="en-US" prefix="og: http://ogp.me/ns#">\r\n<![endif]',
 '\n',
 '[if IE 8]>\r\n<html class="ie ie8" lang="en-US" prefix="og: http://ogp.me/ns#">\r\n<![endif]',
 '\n',
 '[if !(IE 7) | !(IE 8)  ]><!',
 '\n',
 '\n',
 '<![endif]',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 'Greedy Algorithms - GeeksforGeeks',
 '\n',
 '\n',
 '\n',
 '[if lt IE 9]>\r\n<script src="https://www.geeksforgeeks.org/wp-content/themes/iconic-one/js/html5.js" type="text/javascript"></script>\r\n<![endif]',
 '\n',
 '\r\n    {\r\n        "@context" : "http://schema.org",\r\n        "@type" : "Organization",\r\n        "name" : "GeeksforGeeks",\r\n        "url" : "https://www.geeksforgeeks.org/",\r\n        "logo" : "https://www.geeksforgeeks.org/gfgLogo.png",\r\n        "description" : "A computer science portal for geeks. It contains well written, well thought and well explained computer science and programming ar