<a href="https://colab.research.google.com/github/vivekverma1019/Extractive-Text-Summarizer/blob/master/Extractive_Text_Summarizer_using_Pretrained_Word_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Import Required libraries

In [0]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
import bs4 as bs  
import urllib.request 

#### Read the data

In [0]:
# Scrape and read the data from Wikipedia
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')  
article = scraped_data.read()

In [0]:
# Parse the data
parsed_article = bs.BeautifulSoup(article,'lxml')

In [0]:
# Retrieve text from parsed_article
paragraphs = parsed_article.find_all('p')  # Enclosed in <p> </p>

In [0]:
# Combining paragraphs to recreate the article
article_text = ""
for p in paragraphs:  
    article_text += p.text

In [0]:
# Replaing Square brackets(refrences) with single space
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)  

In [0]:
# Replacing multiple spaces by single space
article_text = re.sub(r'\s+', ' ', article_text)

In [0]:
# Removing special characters and digits
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )  
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)  

In [0]:
import nltk
nltk.download('punkt')
sentences = nltk.sent_tokenize(article_text)  


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
len(sentences)

541

#### Download Glove Word Embeddings

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2019-04-30 14:55:17--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-04-30 14:55:17--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2019-04-30 14:56:47 (9.22 MB/s) - ‘glove.6B.zip.1’ saved [862182613/862182613]

Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove.6B.100d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove.6B.200d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove.6B.300d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [0]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [0]:
len(word_embeddings)

400000

In [0]:
word_embeddings['water']

array([-0.5525   ,  0.80299  ,  0.24846  , -0.19954  ,  0.10702  ,
       -0.32201  ,  0.29462  ,  1.025    ,  0.12184  , -0.27206  ,
        0.12932  ,  0.10462  , -0.16654  ,  0.16812  ,  0.22872  ,
       -1.3154   , -0.081805 , -0.33495  , -0.42493  , -0.51661  ,
       -0.75545  ,  0.48928  ,  0.37171  , -0.089842 , -0.081205 ,
        0.6586   , -1.0971   , -0.35931  , -0.38722  ,  0.10288  ,
       -0.20377  , -0.12213  , -0.19528  ,  0.19447  , -0.2256   ,
        0.18176  , -0.093331 ,  0.49025  ,  0.067357 ,  0.17799  ,
       -0.46447  , -1.0855   , -1.144    , -1.0351   ,  1.2463   ,
        1.0647   , -0.98562  , -0.10586  , -0.21731  , -0.56261  ,
       -0.042496 ,  0.15401  ,  0.67673  ,  1.375    , -0.33502  ,
       -2.21     ,  0.045662 ,  0.031881 ,  2.0783   ,  0.22467  ,
        0.091217 ,  0.1973   , -0.082423 ,  1.0019   ,  1.3352   ,
        0.70781  ,  0.72513  , -1.3358   ,  0.68266  , -1.0511   ,
       -0.14352  , -0.27483  ,  1.0577   ,  0.34072  ,  0.6165

#### Text Preprocessing

In [0]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [0]:
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [0]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

#### Vector Representation of Sentences

In [0]:
sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [0]:
len(clean_sentences)

541

In [0]:
len(sentence_vectors)

541

#### Similarity Matrix

In [0]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [0]:
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

#### Applying PageRank Algorithm

In [0]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

#### Summary Extraction

In [0]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [0]:
ranked_sentences[1][1]

'Some of the "learners" described below, including Bayesian networks, decision trees, and nearest-neighbor, could theoretically, (given infinite data, time, and memory) learn to approximate any function, including which combination of mathematical functions would best describe the world.'

In [0]:
# Extract top 7 sentences as the summary
extraction = []
for i in range(7):
  extraction.append(ranked_sentences[i][1])

In [0]:
summary = " ".join(extraction)

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
text_file = open("Text_summarization_2.txt", "w")
text_file.write('Text '+str((len(article_text.split())))+' :\n' + article_text + '\n\nSummary '+str(len(summary.split()))+' :\n' + summary)
text_file.close()

In [0]:
import os
os.getcwd()

'/content'

In [0]:
os.listdir()

['.config',
 'glove.6B.100d.txt',
 'Text_summarization_2.txt',
 'glove.6B.zip',
 'glove.6B.50d.txt',
 'glove.6B.200d.txt',
 'glove.6B.300d.txt',
 'gdrive',
 'glove.6B.zip.1',
 'tennis_articles_v4.csv',
 'sample_data']

In [0]:
os.chdir('/content/gdrive/My Drive/')

In [0]:
os.getcwd()

'/content/gdrive/My Drive'

In [0]:
text_file = open("Text_summarization_2.txt", "w")
text_file.write('Text '+str((len(article_text.split())))+' :\n' + article_text + '\n\nSummary '+str(len(summary[0].split()))+' :\n' + summary[0])
text_file.close()