# Text Summarization using TF-IDF and the TextRank Algorithm

# TF-IDF

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
import math
import re
import operator
import numpy as np

In [None]:
# Fetch the 'punkt' and 'wordnet' NLTK resources.
# NOTE(review): presumably needed by sent_tokenize/word_tokenize and WordNetLemmatizer imported above — confirm.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Initializing WordNetLemmatizer

In [None]:
# Fetch the 'stopwords' and 'averaged_perceptron_tagger' NLTK resources.
# NOTE(review): the tagger resource is presumably what nltk.pos_tag relies on — confirm.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# English stop words held as a set; the filtering steps test membership against it.
Stopwords = set(stopwords.words('english'))
# Single lemmatizer instance shared by the helper functions defined later.
wordlemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Text preprocessing

In [None]:
#Lemmatization
def lemmatize_words(words):
    """Return the lemma of every token in ``words``, preserving order.

    Each token is passed to the notebook-level ``wordlemmatizer.lemmatize``
    with its default arguments.
    """
    return [wordlemmatizer.lemmatize(token) for token in words]

In [None]:
# To Remove special characters
def remove_special_characters(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace.

    Removed characters are deleted outright (not replaced by a space), so
    ``"well-known"`` becomes ``"wellknown"``.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

In [None]:
#	Lowercasing
def freq(words):
    """Count case-insensitive occurrences of each word.

    Args:
        words: iterable of string tokens.

    Returns:
        dict mapping each lower-cased token to the number of times it occurs,
        with keys in order of first appearance.
    """
    dict_freq = {}
    for word in words:
        key = word.lower()
        # Single pass over the tokens; replaces the list-membership scan and
        # the per-word count() call, which were quadratic in the input size.
        dict_freq[key] = dict_freq.get(key, 0) + 1
    return dict_freq

In [None]:
#	POS Tagging
def pos_tagging(text):
    """Return the whitespace-separated tokens of ``text`` tagged as nouns or verbs.

    Tokens are tagged with ``nltk.pos_tag``; only those whose tag is one of
    NN, NNP, NNS, VB, VBD, VBG, VBN, VBP or VBZ are kept, in original order.
    """
    wanted_tags = {"NN", "NNP", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}
    tagged_tokens = nltk.pos_tag(text.split())
    return [token for token, tag in tagged_tokens if tag in wanted_tags]

# TF-IDF score functions

In [None]:
# TF Score function
def tf_score(word,sentence):
    """Term frequency of ``word`` within ``sentence``.

    Args:
        word: term to look for; compared case-insensitively.
        sentence: sentence text; tokens are obtained with ``str.split()``.

    Returns:
        Number of tokens equal to ``word`` divided by the total number of
        tokens in the sentence, or 0 when the sentence has no tokens.
    """
    # Lower-case both sides: callers pass a lower-cased term, while the
    # sentence still carries its original capitalisation.
    tokens = sentence.lower().split()
    if not tokens:
        # No tokens means no occurrences; also avoids dividing by zero.
        return 0
    # Normalise by the number of words, not the number of characters.
    # Tokens are not lemmatized here, so inflected forms only match a term
    # that is spelled the same way.
    return tokens.count(word.lower()) / len(tokens)

In [None]:
# IDF Score function
def idf_score(no_of_sentences,word,sentences):
    """Inverse document frequency of ``word``, treating each sentence as a document.

    Every sentence is cleaned with ``remove_special_characters``, stripped of
    digits, split on whitespace, filtered of stop words and one-character
    tokens, lower-cased and lemmatized before ``word`` is looked up in it.

    Args:
        no_of_sentences: total number of sentences (numerator of the ratio).
        word: term to look for, expected in the same normalised form.
        sentences: iterable of sentences; each is converted with ``str()``.

    Returns:
        ``log10(no_of_sentences / n)`` where ``n`` is the number of sentences
        containing ``word``, or ``0.0`` when no sentence contains it.
    """
    no_of_sentence_containing_word = 0
    for sentence in sentences:
        cleaned = re.sub(r'\d+', '', remove_special_characters(str(sentence)))
        # Distinct loop name so the `word` parameter is never visually shadowed.
        lemmas = {
            wordlemmatizer.lemmatize(token.lower())
            for token in cleaned.split()
            if token.lower() not in Stopwords and len(token) > 1
        }
        if word in lemmas:
            no_of_sentence_containing_word += 1
    if no_of_sentence_containing_word == 0:
        # The ratio is undefined for an unseen term; let it contribute nothing
        # instead of raising ZeroDivisionError.
        return 0.0
    return math.log10(no_of_sentences/no_of_sentence_containing_word)

In [None]:
# TF-IDF function
def word_tfidf(dict_freq,word,sentences,sentence):
    """TF-IDF weight of ``word`` for one sentence.

    Args:
        dict_freq: not used by this function; kept so existing callers
            keep working.
        word: term to score.
        sentences: all sentences, passed to ``idf_score`` together with
            their count.
        sentence: the sentence text passed to ``tf_score``.

    Returns:
        ``tf_score(word, sentence) * idf_score(len(sentences), word, sentences)``.
    """
    # The unused local list that shadowed this function's own name is gone.
    tf = tf_score(word,sentence)
    idf = idf_score(len(sentences),word,sentences)
    return tf*idf

# Finding most important sentences and Generating summary

In [None]:
def sentence_importance(sentence,dict_freq,sentences):
    """Score a sentence as the sum of the TF-IDF weights of its nouns and verbs.

    The sentence is stripped of special characters and digits, reduced to the
    tokens kept by ``pos_tagging``, and each token that is not a stop word and
    is longer than one character is lower-cased, lemmatized and scored with
    ``word_tfidf`` against the cleaned sentence.

    Args:
        sentence: sentence to score; converted with ``str()``.
        dict_freq: forwarded unchanged to ``word_tfidf``.
        sentences: all sentences, forwarded to ``word_tfidf``.

    Returns:
        The accumulated score (0 when no token qualifies).
    """
    cleaned = re.sub(r'\d+', '', remove_special_characters(str(sentence)))
    sentence_score = 0
    for token in pos_tagging(cleaned):
        lowered = token.lower()
        # Skip stop words (checked in both lowered and original form) and
        # single-character tokens.
        if lowered in Stopwords or token in Stopwords or len(token) <= 1:
            continue
        lemma = wordlemmatizer.lemmatize(lowered)
        sentence_score = sentence_score + word_tfidf(dict_freq,lemma,sentences,cleaned)
    return sentence_score

Importing Dataset: An article from Google News

In [None]:
# Read the whole article; the context manager closes the handle even if
# reading fails (previously the file was never closed).
with open('/content/sample.txt' , 'r') as file:
    text = file.read()
# Split into sentences on the raw text, before punctuation and digits are
# stripped from `text` below.
tokenized_sentence = sent_tokenize(text)
text = remove_special_characters(str(text))
text = re.sub(r'\d+', '', text)

In [None]:
# Fetch the 'omw-1.4' NLTK resource.
# NOTE(review): presumably required by the WordNet lemmatizer used in the cells that follow — confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Percentage of information to retain:

I kept 30% of the sentences for my summary.

In [None]:
# Word-level tokens of the cleaned text, still including stop words.
tokenized_words_with_stopwords = word_tokenize(text)
# Lower-case BEFORE filtering: `Stopwords` is compared case-sensitively, so
# capitalised stop words ("After", "Now", "But", ...) used to slip through.
tokenized_words = [word.lower() for word in tokenized_words_with_stopwords]
# Drop stop words and one-character tokens.
tokenized_words = [word for word in tokenized_words if word not in Stopwords and len(word) > 1]
tokenized_words = lemmatize_words(tokenized_words)

In [None]:
# Frequency table of the cleaned tokens.
word_freq = freq(tokenized_words)
print(word_freq)
input_user = int(input('Percentage of information to retain(in percent):'))
# Clamp to a valid percentage so out-of-range input cannot produce a negative
# or oversized sentence count.
input_user = max(0, min(100, input_user))
# Integer arithmetic; rounds down exactly like the former int(x / 100).
no_of_sentences = input_user * len(tokenized_sentence) // 100
# A positive percentage of a short text should still keep one sentence rather
# than yield an empty summary.
if input_user > 0 and tokenized_sentence:
    no_of_sentences = max(1, no_of_sentences)
print(no_of_sentences)

{'after': 1, 'month': 1, 'drama': 1, 'testy': 1, 'statement': 1, 'bit': 1, 'trolling': 1, 'musk': 10, 'reveal': 1, 'juicy': 1, 'text': 1, 'message': 1, 'elon': 5, 'finally': 1, 'closing': 1, 'deal': 4, 'purchase': 2, 'twitter': 11, 'billion': 1, 'now': 1, 'completed': 1, 'according': 1, 'report': 1, 'already': 1, 'gone': 1, 'senior': 1, 'leadership': 2, 'it': 1, 'wellknown': 1, 'monthslong': 1, 'saga': 1, 'love': 1, 'lost': 1, 'ceo': 3, 'parag': 1, 'agrawal': 6, 'also': 1, 'doesnt': 1, 'like': 2, 'top': 1, 'leader': 1, 'including': 1, 'company': 3, 'policy': 1, 'chief': 1, 'vijaya': 1, 'gadde': 2, 'cfo': 1, 'ned': 1, 'segal': 1, 'so': 1, 'three': 1, 'fired': 1, 'probably': 1, 'along': 1, 'at': 1, 'first': 1, 'glance': 1, 'look': 2, 'win': 1, 'sparred': 1, 'almost': 1, 'public': 1, 'privately': 1, 'since': 1, 'launched': 1, 'bid': 1, 'get': 1, 'stake': 1, 'but': 2, 'closer': 1, 'clear': 1, 'winner': 1, 'tussle': 1, 'sure': 1, 'losing': 1, 'job': 1, 'significantly': 1, 'even': 2, 'loses'

In [None]:
# Score every sentence, keyed by its 1-based position in the document.
sentence_with_importance = {
    position: sentence_importance(sent, word_freq, tokenized_sentence)
    for position, sent in enumerate(tokenized_sentence, start=1)
}
# Re-bind the name to a list of (position, score) pairs, best score first.
sentence_with_importance = sorted(
    sentence_with_importance.items(),
    key=lambda position_and_score: position_and_score[1],
    reverse=True,
)


In [None]:
# Take the positions of the `no_of_sentences` best-scoring sentences (none
# when the count is zero or negative) and put them back in document order.
top_ranked = sentence_with_importance[:max(no_of_sentences, 0)]
sentence_no = sorted(position for position, _score in top_ranked)


In [None]:
# Stitch the selected sentences together in document order; positions in
# `sentence_no` are 1-based and each sentence is followed by one space.
summary = "".join(
    sentence + " "
    for count, sentence in enumerate(tokenized_sentence, start=1)
    if count in sentence_no
)
print("\n")
print("Summary:")
print(summary)



Summary:
After months of drama, some testy statements, a bit of trolling from Musk and the reveal of some juicy text messages, Elon Musk is finally closing his deal to purchase Twitter for some $44 billion. Now as he completed the deal, according to reports, Musk has already gone after the senior leadership of Twitter. It is well-known after the months-long saga that there is no love lost between Elon Musk and Twitter CEO Parag Agrawal. At first glance it looks like a "win" for Elon Musk, who has sparred -- almost in public with Gadde and privately with Agrawal -- with Twitter leadership since he launched his bid to get a stake in the company. Sure, he is losing his job as the CEO of Twitter. Keep in mind that Agrawal has forced him to purchase Twitter even when Musk tried to walk away from the deal. Personally too, Agrawal is getting something out of his firing. When he became the CEO of Twitter last year in December, he did so with a clause in his contract that guaranteed him a sev

# TextRank Algorithm

GloVe word embeddings are vector representations of words. These word embeddings will be used to create vectors for our sentences. We could also have used the Bag-of-Words or TF-IDF approaches to create features for our sentences, but these methods ignore the order of the words (and the number of features is usually pretty large).

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2022-10-29 05:40:03--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-10-29 05:40:03--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-10-29 05:40:03--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
#Word Embeddings using Glove
# Each line of the file is "<token> <v1> <v2> ..."; map the token to its
# vector stored as a float32 array.
word_embeddings = {}
# The context manager closes the file even if parsing a line fails.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs


In [None]:
# Sentence Vectors:
# Each sentence is represented by the mean of its tokens' 100-d embeddings;
# tokens missing from `word_embeddings` count as zero vectors.
sentence_vectors = []
for i in tokenized_sentence:
  # Lower-case and turn punctuation into spaces before the lookup: the
  # glove.6B vocabulary is uncased, so raw tokens such as "Musk" or "Twitter."
  # were silently mapped to zero vectors.
  tokens = re.sub(r'[^a-z0-9\s]', ' ', i.lower()).split()
  if tokens:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in tokens])/(len(tokens))
  else:
    # A sentence without any usable token gets a zero vector instead of
    # triggering a division by zero.
    v = np.zeros((100,))
  sentence_vectors.append(v)

Similarity Matrix and Cosine Similarity:

In [None]:
# similarity matrix
# Square, zero-initialised matrix with one row and one column per sentence.
sentence_count = len(tokenized_sentence)
sim_mat = np.zeros((sentence_count, sentence_count))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Compute all pairwise cosine similarities in one call and copy them into the
# pre-allocated matrix. The former element-wise loop assigned a (1, 1) array
# into a scalar cell, which NumPy has deprecated.
if sentence_vectors:
  sim_mat[:, :] = cosine_similarity(np.array(sentence_vectors))
  # A sentence's similarity to itself stays 0, as with the former `i != j` guard.
  np.fill_diagonal(sim_mat, 0)

Scores:

In [None]:
import networkx as nx

# Build a graph from the similarity matrix and run PageRank on it;
# `scores` maps each node (sentence index, per the printed output) to its rank.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
print(scores)




{0: 0.07720467818775745, 1: 0.07734935189459856, 2: 0.07668964819450208, 3: 0.07489964726247701, 4: 0.07626792848046823, 5: 0.07772990681380341, 6: 0.07733549841157401, 7: 0.07705044704704528, 8: 0.07781480820247667, 9: 0.0768813022045021, 10: 0.07638155985670204, 11: 0.0777200849204418, 12: 0.07667513852365136}


In [None]:
# (sentence index, score) pairs ordered from highest to lowest score.
scoresrank = sorted(
    scores.items(),
    key=lambda index_and_score: index_and_score[1],
    reverse=True,
)
print(scoresrank)

[(8, 0.07781480820247667), (5, 0.07772990681380341), (11, 0.0777200849204418), (1, 0.07734935189459856), (6, 0.07733549841157401), (0, 0.07720467818775745), (7, 0.07705044704704528), (9, 0.0768813022045021), (2, 0.07668964819450208), (12, 0.07667513852365136), (10, 0.07638155985670204), (4, 0.07626792848046823), (3, 0.07489964726247701)]


In [None]:
# Specify number of sentences to form the summary
sn = 4
# Generate summary
# Print the top-ranked sentences, best first; never request more sentences
# than were ranked, which used to raise IndexError on short texts.
for i in range(min(sn, len(scoresrank))):
  print(tokenized_sentence[scoresrank[i][0]])

But significantly, even as he loses something, he is gaining a lot because he has forced Elon Musk to go through a deal that is extremely lucrative for Twitter shareholders.
At first glance it looks like a "win" for Elon Musk, who has sparred -- almost in public with Gadde and privately with Agrawal -- with Twitter leadership since he launched his bid to get a stake in the company.
When he became the CEO of Twitter last year in December, he did so with a clause in his contract that guaranteed him a severance fee of $42 million if Twitter asked him to go before one year was over.
Now as he completed the deal, according to reports, Musk has already gone after the senior leadership of Twitter.
