In [None]:
! pip install rouge

In [None]:
import math
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy.linalg import svd as singular_value_decomposition
from nltk.corpus import stopwords
from operator import attrgetter
from collections import namedtuple
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import normalize
from rouge import Rouge
import statistics
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
#Please enter the path of the news_summary.csv file
#The file is decoded as ISO-8859-1
df = pd.read_csv("news_summary.csv",encoding='iso-8859-1')

In [None]:
# Preview the first ten rows of the raw data
df.head(10)

In [None]:
# Expose the full text ('ctext') as 'article' and the short text ('text') as 'summary'.
df = df.assign(article=df['ctext'], summary=df['text'])

Remove extra features such as author, date, and article link, which do not affect the news summary.

Drop the null values and reset index

In [None]:
# Drop the columns that are no longer needed, remove rows with missing values and
# renumber the rows from 0. drop=True discards the old index instead of inserting
# it as an extra 'index' column.
df = (
    df.drop(columns=['author','date','read_more','text','ctext'])
      .dropna()
      .reset_index(drop=True)
)

In [None]:
# Tokenizer and lemmatizer instances shared by lemmatize_tokenize_text.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
def lemmatize_tokenize_text(text):
    """Tokenize ``text`` with ``w_tokenizer`` and return the list of lemmatized tokens."""
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

def join_words(lst):
  """Join the word strings in ``lst`` into one string separated by single spaces."""
  separator = ' '
  return separator.join(lst)

Performing lemmatization and whitespace tokenization to clean the article and summary

In [None]:
# Replace each text column with its list of lemmatized tokens.
for column in ('article', 'summary'):
    df[column] = df[column].apply(lemmatize_tokenize_text)

In [None]:
# Turn each token list back into a single space-separated string.
for column in ('article', 'summary'):
    df[column] = df[column].apply(join_words)

In [None]:
# Column summary of the frame after cleaning
df.info()

In [None]:
# Cleaned article text of the row labelled 1
df.loc[1, 'article']

In [None]:
# Cleaned reference summary of the row labelled 1
df.loc[1, 'summary']

In [None]:
# English stop words from NLTK; create_dictionary skips every lowercased token found in this list.
stop_words = list(stopwords.words('english'))
# Lower bound on the number of leading singular values that compute_rank keeps.
MIN_DIMENSIONS = 3
# Fraction of the singular values that compute_rank keeps when that count exceeds MIN_DIMENSIONS.
REDUCTION_RATIO = 1/5
# One rated sentence: the sentence itself, its position in the document, and its rating.
SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating",))


In [None]:
#Create the word dictionary: each key is a word and each value is that word's row index in the word-sentence matrix
#Every word is lowercased, and stop words are removed before the words are added to the dictionary
def to_lower(word):
  """Return ``word`` converted to lowercase."""
  lowered = word.lower()
  return lowered

def create_dictionary(article):
    """Map every distinct lowercased, non-stop-word token of ``article`` to an index.

    The indices follow the iteration order of a frozenset of the tokens, so
    they are unique and contiguous from 0 but are not in order of appearance.
    """
    lowered = (token.lower() for token in word_tokenize(article))
    unique_words = frozenset(filter(lambda w: w not in stop_words, lowered))
    return {word: index for index, word in enumerate(unique_words)}

In [None]:
#Create the word-sentence matrix from the article text and its corresponding dictionary
#Sentence-tokenize the article, then count how often each dictionary word occurs in each sentence
def create_matrix(article, dictionary):
    """Build the word-by-sentence count matrix of ``article``.

    Parameters
    ----------
    article : str
        Text to split into sentences.
    dictionary : dict
        Maps a lowercased word to its row index (see ``create_dictionary``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(dictionary), number_of_sentences)`` where
        entry ``[row, col]`` is the number of times the word of ``row`` occurs
        in sentence ``col``.
    """
    sentences = sent_tokenize(article)
    matrix = np.zeros((len(dictionary), len(sentences)))
    for col, sentence in enumerate(sentences):
        for word in word_tokenize(sentence):
            # The dictionary keys are lowercased, so the lookup must be lowercased
            # too; otherwise capitalised words would never be counted.
            row = dictionary.get(word.lower())
            # only valid words are counted (not stop-words, ...)
            if row is not None:
                matrix[row, col] += 1

    return matrix


In [None]:
#Scale each column by its maximum value and smooth the result into the range [smooth, 1]
def compute_term_freq(matrix, smooth=0.4):
    """Convert raw counts into smoothed, max-normalised term frequencies.

    For every column whose maximum is non-zero, each entry ``x`` (zeros
    included) becomes ``smooth + (1 - smooth) * x / column_max``. Columns
    whose maximum is zero are left untouched.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D count matrix; it is modified in place.
    smooth : float, optional
        Smoothing term, ``0 <= smooth < 1``. Defaults to 0.4.

    Returns
    -------
    numpy.ndarray
        The same ``matrix`` object, after the in-place update.

    Raises
    ------
    ValueError
        If ``smooth`` is outside ``[0, 1)``.
    """
    if not 0.0 <= smooth < 1.0:
        raise ValueError("smooth must satisfy 0 <= smooth < 1")

    max_word_frequencies = np.max(matrix, axis=0)
    # Only columns with a non-zero maximum can be divided safely.
    scalable = max_word_frequencies != 0
    frequencies = matrix[:, scalable]/max_word_frequencies[scalable]
    matrix[:, scalable] = smooth + (1.0 - smooth)*frequencies

    return matrix

In [None]:
#Truncate the SVD to its leading singular values and use them to rank the columns of v_matrix
#Each column's rank is the length of that column after weighting its components by the kept singular values
def compute_rank(sigma, v_matrix):
    """Return one rank per column of ``v_matrix``.

    The first ``max(MIN_DIMENSIONS, int(len(sigma)*REDUCTION_RATIO))`` singular
    values are squared and the rest are replaced by 0. The rank of a column
    ``c`` is ``sqrt(sum(weight_i * c_i**2))`` over those weights.

    Returns a list of floats, in column order. As called from
    ``text_summarizer``, each column corresponds to one sentence.
    """
    kept = max(MIN_DIMENSIONS,int(len(sigma)*REDUCTION_RATIO))
    weights = tuple(
        value**2 if position < kept else 0.0
        for position, value in enumerate(sigma)
    )

    return [
        math.sqrt(sum(w*c**2 for w, c in zip(weights, column)))
        for column in v_matrix.T
    ]

In [None]:
class ItemsCount(object):
    """Callable that keeps the first N items of a sequence.

    ``value`` may be:

    * an ``int`` or ``float`` - the number of items to keep (truncated to int);
    * a string such as ``"3"`` - the same, parsed with ``int``;
    * a string such as ``"20%"`` - that percentage of the sequence length,
      rounded down but never less than one item;
    * ``bytes`` holding either string form.
    """

    def __init__(self, value):
        self._value = value

    def __call__(self, sequence):
        """Return the leading slice of ``sequence`` selected by the stored value.

        Raises ``ValueError`` if the stored value has an unsupported type or a
        string value cannot be parsed as an integer.
        """
        value = self._value
        if isinstance(value, bytes):
            # bytes.endswith("%") raises TypeError, so work on text from here on.
            value = value.decode()

        if isinstance(value, str):
            if value.endswith("%"):
                total_count = len(sequence)
                percentage = int(value[:-1])
                # at least one sentence should be chosen
                count = max(1, total_count*percentage // 100)
                return sequence[:count]
            return sequence[:int(value)]

        if isinstance(value, (int, float)):
            return sequence[:int(value)]

        raise ValueError("Unsupported value of items count %r." % (value,))

    def __repr__(self):
        return "<ItemsCount: %r>" % (self._value,)

In [None]:
#Returns the best-rated sentences, in their original document order
def get_top_sentence(sentences, count, rating, *args, **kwargs):
    """Pick the best-rated sentences and return them in document order.

    ``rating`` is either a dict mapping a sentence to its rating or a callable
    invoked as ``rating(sentence, *args, **kwargs)``, once per sentence and in
    order. ``count`` is an ``ItemsCount`` or any value ``ItemsCount`` accepts.
    Returns a tuple of the selected sentences.
    """
    if isinstance(rating, dict):
        def score(sentence):
            return rating[sentence]
    else:
        score = rating

    rated = [
        SentenceInfo(sentence, position, score(sentence, *args, **kwargs))
        for position, sentence in enumerate(sentences)
    ]
    # best rating first
    rated.sort(key=lambda info: info.rating, reverse=True)

    # keep only the `count` best rated sentences
    selector = count if isinstance(count, ItemsCount) else ItemsCount(count)
    chosen = selector(rated)

    # restore the order in which the sentences appear in the document
    chosen = sorted(chosen, key=lambda info: info.order)

    return tuple(info.sentence for info in chosen)

In [None]:
#Perform LSA Summarization
def text_summarizer(article,summary_len=1):
  """Summarize ``article`` by extracting its best-ranked sentences with LSA.

  The article is turned into a smoothed word-by-sentence matrix, factorized
  with SVD, and every sentence is ranked by ``compute_rank``.

  Parameters
  ----------
  article : object
      Text to summarize; it is converted with ``str``.
  summary_len : int or str, optional
      How many sentences to keep; any value ``ItemsCount`` accepts. Defaults to 1.

  Returns
  -------
  str
      The selected sentences in document order, joined by single spaces, or
      an empty string when the article has no sentences or no usable words.
  """
  article = str(article)
  sentences = sent_tokenize(article)
  dictionary = create_dictionary(article)
  # With no sentences or no terms the matrix is empty and cannot be factorized.
  if not sentences or not dictionary:
    return ''
  matrix = create_matrix(article,dictionary)
  matrix = compute_term_freq(matrix)
  _, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
  # get_top_sentence rates the sentences once each and in order, so handing out
  # the ranks from an iterator pairs every sentence with its own rank.
  ranks = iter(compute_rank(sigma, v))
  summarized_sentance = get_top_sentence(sentences,summary_len,lambda s: next(ranks))
  return ' '.join(summarized_sentance)

We take the best predicted summary sentence (a summary of length 1) as our predicted_headline.

We apply text summarization to each article to get a predicted_summary of length 3.

In [None]:
# The single best-ranked sentence of each article (summary_len defaults to 1) is the predicted headline
df['predicted_headline'] = df['article'].apply(text_summarizer)

In [None]:
# The three best-ranked sentences of each article form the predicted summary
df['predicted_summary'] = df['article'].apply(text_summarizer, summary_len=3)

In [None]:
# Last rows of the frame, now including the predicted headline and summary
df.tail()

In [None]:
# Column summary after the prediction columns were added
df.info()

In [None]:
#This function calculates the SVD similarity between 2 texts.
#We first create the word-sentence matrix of each text and then factorize it
#Then the first column of each u matrix is normalized and the two columns are multiplied element-wise and summed
def _first_left_singular_vector(text):
  """Return the normalized first left singular vector of the word-sentence matrix of ``text``."""
  text = str(text)
  dictionary = create_dictionary(text)
  matrix = compute_term_freq(create_matrix(text,dictionary))
  u, _, _ = singular_value_decomposition(matrix, full_matrices=False)
  first_column = u[:,0].reshape((u.shape[0],1))
  return normalize(first_column, axis=0).ravel()

def svd_similarity(text1,text2):
  """Score how similar the leading SVD components of two texts are.

  Each text is converted to its own smoothed word-sentence matrix and
  factorized; the normalized first left singular vectors are then multiplied
  component by component over their common length and summed.

  NOTE(review): each text gets its own dictionary, so component ``i`` of the
  two vectors generally refers to different words - confirm that this
  position-wise product is the intended measure.

  Returns the sum of the products (0 when the common length is 0).
  """
  vector1 = _first_left_singular_vector(text1)
  # The second vector must be built from text2; building it from text1 made the
  # score depend on text2 only through its dictionary.
  vector2 = _first_left_singular_vector(text2)
  # zip stops at the shorter vector, matching a loop over the common length
  return sum(a*b for a, b in zip(vector1, vector2))

In [None]:
#We calculate the cosine similarity between 2 different texts
def text_similarity(text1, text2):
  """Return the TF-IDF cosine similarity of ``text1`` against both texts.

  The TF-IDF vectorizer is fitted on the two texts only. The result is a
  1x2 array: element ``[0][0]`` compares ``text1`` with itself and element
  ``[0][1]`` compares ``text1`` with ``text2``.
  """
  # The count-vector fit and the two DataFrames built here previously were
  # discarded without being used, so only the TF-IDF path remains.
  vectorizer = TfidfVectorizer()
  trsfm = vectorizer.fit_transform([text1,text2])
  return cosine_similarity(trsfm[0:1], trsfm)

In [None]:
# We store the different evaluation metrics for each row (i.e. cosine similarity, SVD similarity, ROUGE-L score)
headline_similarities = []
summary_similarities = []
svd_summary_similarities = []
svd_headline_similarites = []
f=[]
p=[]
r=[]
# One scorer is enough for the whole run
rouge_scorer = Rouge()
rows = zip(df['headlines'], df['summary'], df['predicted_headline'], df['predicted_summary'])
for headline, summary, predicted_headline, predicted_summary in rows:
  headline_similarity = text_similarity(str(headline),str(predicted_headline))
  summary_similarity = text_similarity(str(summary),predicted_summary)
  # element [0][1] is the similarity between the first and the second text
  headline_similarities.append(headline_similarity[0][1])
  summary_similarities.append(summary_similarity[0][1])
  svd_summary_similarities.append(svd_similarity(summary,predicted_summary))
  svd_headline_similarites.append(svd_similarity(headline,predicted_headline))
  # get_scores expects the hypothesis (prediction) first and the reference second;
  # the ROUGE-L scores are computed once per row and split into f / p / r
  rouge_l = rouge_scorer.get_scores(predicted_summary,summary)[0]['rouge-l']
  f.append(rouge_l['f'])
  p.append(rouge_l['p'])
  r.append(rouge_l['r'])

## Evaluation Metrics
* Headline similarity scores tell us that using LSA summarization to predict the headline is not a good approach.
* Summary similarity scores show that the summary generated by LSA is nearly 76% similar to the actual summary.
* Cosine similarity is not the correct way to evaluate the summary of an article because it compares the texts only on the basis of their words.
* ROUGE-L scores are used to evaluate abstractive summaries; they are not a good evaluation metric for extractive summaries.
* SVD similarity is a more appropriate evaluation metric for comparing the actual and predicted summaries because it compares them topic-wise, i.e. by how close they are to the same topic.

In [None]:
# Mean and median of the TF-IDF cosine similarities collected above
print("Cosine similarity scores")
print("Mean Headline similarity score: ",statistics.mean(headline_similarities))
print("Median Headline similarity score: ",statistics.median(headline_similarities))
print("Mean summary similarity score: ",statistics.mean(summary_similarities))
print("Median summary similarity score: ",statistics.median(summary_similarities))

In [None]:
# Mean and median of the SVD similarities collected above
print("SVD similarity scores")
print("Mean Headline similarity score: ",statistics.mean(svd_headline_similarites))
print("Median Headline similarity score: ",statistics.median(svd_headline_similarites))
print("Mean summary similarity score: ",statistics.mean(svd_summary_similarities))
print("Median summary similarity score: ",statistics.median(svd_summary_similarities))

In [None]:
# Mean and median of the ROUGE-L F1, precision and recall scores collected above
print("ROUGE scores")
print("Mean summary similarity F1score: ",statistics.mean(f))
print("Median summary similarity F1score: ",statistics.median(f))
print("Mean summary similarity precision score: ",statistics.mean(p))
print("Median summary similarity precision score: ",statistics.median(p))
print("Mean summary similarity recall score: ",statistics.mean(r))
print("Median summary similarity recall score: ",statistics.median(r))