#PROBLEM 2: Extractive Summarization
Implement the KL-Sum summarization method for each dataset. Follow the ideas in this paper ; you are allowed to use libraries for text cleaning, segmentation into sentences, etc. Run it twice :<br>
A) KL_summary based on words_PD; PD is a distribution proportional to counts of words in document<br>
B) LDA_summary based on LDA topics_PD on obtained in PB2. The only difference is that PD, while still a distribution over words, is computed using topic modeling<br>
For DUC dataset evaluate KL_summaries and LDA_summaries against human gold summaries with ROUGE. ROUGE Perl package. Use the "Abstract" part of the files ins folder "Summaries" as the gold summaries.

EXTRA CREDIT. KL Summarization: Can we make both PD and PS distributions over topics, instead of distributions over words? Would that help?

In [None]:
!pip install rouge

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import normalize
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import wordnet
import scipy.sparse as sp
from nltk.stem import WordNetLemmatizer
import os
import statistics
from rouge import Rouge
import math
from collections import Counter

In [2]:
path = '/content/drive/MyDrive/DUC/dataset.pkl'
with open(path, 'rb') as file:
    dataset = pickle.load(file)

introductions = []
file_name = []
abstracts = []

for data in dataset:
  file_name.append(data)
  introductions.append(dataset[data]['introduction'])
  abstracts.append(dataset[data]['abstract'])


In [15]:
lengths = []
unique_lengths = []
for abstract in abstracts:
  lengths.append(len(abstract.split()))
  unique_lengths.append(len(set(abstract.split())))

In [169]:
# Create summary of 100 words
mean_value = statistics.mean(lengths)
median_value = statistics.median(lengths)
quartiles = np.percentile(lengths, [25, 50, 75,90])

print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"25th percentile (Q1): {quartiles[0]}")
print(f"50th percentile (Q2 or Median): {quartiles[1]}")
print(f"75th percentile (Q3): {quartiles[2]}")
print(f"95th percentile (Q3): {quartiles[3]}")
print(f"Max: {max(lengths)}")
print(f"Min: {min(lengths)}")

Mean: 99.65016501650165
Median: 101
25th percentile (Q1): 98.0
50th percentile (Q2 or Median): 101.0
75th percentile (Q3): 103.0
95th percentile (Q3): 106.0
Max: 121
Min: 0


In [17]:
mean_value = statistics.mean(unique_lengths)
median_value = statistics.median(unique_lengths)
quartiles = np.percentile(unique_lengths, [25, 50, 75,90])

print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"25th percentile (Q1): {quartiles[0]}")
print(f"50th percentile (Q2 or Median): {quartiles[1]}")
print(f"75th percentile (Q3): {quartiles[2]}")
print(f"95th percentile (Q3): {quartiles[3]}")
print(f"Max: {max(unique_lengths)}")

Mean: 76.6963696369637
Median: 78
25th percentile (Q1): 73.0
50th percentile (Q2 or Median): 78.0
75th percentile (Q3): 82.0
95th percentile (Q3): 85.0
Max: 98


In [171]:
for i in range(len(lengths)):
  if lengths[i] == 0:
    print(i)

45
194
299


In [None]:
abstracts.pop(45)
abstracts.pop(194)
abstracts.pop(229)
introductions.pop(45)
introductions.pop(194)
introductions.pop(229)

In [40]:
def preprocess(text):
  def is_valid_word(word):
    return bool(wordnet.synsets(word))
  text = re.sub(r'[^a-zA-Z0-9 \n]', '', text) # removing any character that is not an alphanumeric character (letters and digits), a space, or a newline (\n)
  text = re.sub(r'\n+', ' ', text) # removing new line
  text = text.lower() # lower casing
  tokens = word_tokenize(text)
  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()
  tokens = [word for word in tokens if word not in stop_words] # removing stop words
  filtered_tokens = [word for word in tokens if is_valid_word(word)] # removing non-english word
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens] # Lemmatization

  return ' '.join(lemmatized_tokens)

In [122]:
def kl_div(PD,PS):
  sum_val = 0
  for w in PS:
    if w in PD:
      sum_val += PD[w] * math.log(PD[w] / PS[w])
  return sum_val

In [83]:
def get_word_freq(doc):
  word_counts = Counter(doc)
  return {word: count for word, count in word_counts.items()}

In [92]:
def get_topic_freq(doc):
  lda= LatentDirichletAllocation(n_components=1, random_state=0, n_jobs=-1)
  vectorizer = CountVectorizer()
  doc_term_matrix = vectorizer.fit_transform([preprocess(doc)])
  lda.fit(doc_term_matrix)
  feature_names = vectorizer.get_feature_names_out()
  top_n = 85
  for idx, topic in enumerate(lda.components_):
    topic_words = {feature_names[i]: 1 for i in np.argsort(topic)[::-1][:top_n]}
  return topic_words

###KL-Summary

In [None]:
# KL_summary
# Given a document -> get preprocessed doc (rmv stop words lematize etc) -> prep_doc
# For each sentence s in prep_doc calculate KL_dist. Low KL_div => similar
# Get s with lowest KL_dist, add to summary, break if until size of summary >= 100 -> list(idx(s))
# Summary = document[list(idx(s))]

In [143]:
def get_kl_summary(doc):
  prep_doc = preprocess(doc)
  sent_list = doc.split('.')
  processed_sent = [preprocess(sent) for sent in sent_list]
  doc_pd = get_word_freq(prep_doc.split())

  summary = ''
  while(len(summary.split(' ')) <= 100 and len(processed_sent)>0):
    min_kl_div = 10000
    min_summary_sent_idx = 0

    # finding which sentence when added to summary has lowest kl-div with doc
    for idx, sent in enumerate(processed_sent):
      summary_pd = get_word_freq(summary+sent)

      if min_kl_div > kl_div(doc_pd,summary_pd):
        min_kl_div = kl_div(doc_pd,summary_pd)
        min_summary_sent_idx = idx

    processed_sent.pop(min_summary_sent_idx)
    summary += sent_list[min_summary_sent_idx]

  return summary

In [160]:
print(introductions[4],'\n')
print(get_kl_summary(introductions[4]),'\n')
print(abstracts[4])

SQUADS of workers fanned out across storm-battered Louisiana yesterday to
begin a massive rebuilding effort after Hurricane Andrew had flattened whole
districts, killing two people and injuring dozens more, agencies report from
Florida and New Orleans.
However, local officials in Florida, hit earlier in the week by the
hurricane, were critical of what they called a delay in supplying food,
drinking water and other supplies for thousands of people in need.
Federal emergency officials acknowledged distribution problems,
Transportation Secretary Andrew Card yesterday promised 'dramatic'
improvements within 24 hours and President George Bush last night ordered
troops to Florida, without specifying a number.
The government estimated it would cost Dollars 20bn-Dollars 30bn to tidy and
rebuild in Florida, and to care for residents displaced by the storm.
Louisiana state officials said they had no overall count of storm-related
injuries but initial estimates reckoned fewer than 100. The Federa

###Evaluation

In [180]:
rouge = Rouge()
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for idx, doc in enumerate(introductions):
  summary = get_kl_summary(doc)
  scores = rouge.get_scores(summary, abstracts[idx])
  rouge1_scores.append(scores[0]['rouge-1']['f']) # f score
  rouge2_scores.append(scores[0]['rouge-2']['f'])
  rougeL_scores.append(scores[0]['rouge-l']['f'])

In [182]:
print(f"Avg Rouge1: {statistics.mean(rouge1_scores)}")
print(f"Avg Rouge2: {statistics.mean(rouge2_scores)}")
print(f"Avg RougeL: {statistics.mean(rougeL_scores)}")

Avg Rouge1: 0.2873822469180531
Avg Rouge2: 0.11254739557824144
Avg RougeL: 0.24954993199899816
