# 20ng

## Words approach

In [280]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

df = pd.read_csv('20ng.csv')
df['index'] = df['index'].str.replace(r'\d+$', '', regex=True)
df.dropna(inplace=True)
df = df[:1000]

vectorizer = TfidfVectorizer(use_idf=True)
sparse_matrix = vectorizer.fit_transform(df['content'])

sparse_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=vectorizer.get_feature_names_out())
sparse_df['index'] = df['index'].to_numpy()

In [282]:
words = sparse_df.columns[:-1]

In [287]:
from collections import Counter

def get_word_distribution(sentences, words):
    text = " ".join(sentences)
    word_list = text.split()
    counter = {}
    words_set = set(words)

    for i in words_set:
        if i not in counter:
            counter[i] = 0

    for word in word_list:
        if word not in counter:
            continue
        counter[word] += 1
    
    return counter


In [284]:
sparse_x = sparse_df.drop(columns=['index'])

In [285]:
row_array = sparse_x.iloc[0].to_numpy(dtype=float)

In [None]:
from tqdm import tqdm
import numpy as np
from nltk.tokenize import sent_tokenize

summaries = []

for df_i, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Documents"):
    text = row['content']
    sentences = sent_tokenize(text)
    pd_vals = sparse_x.iloc[df_i].to_numpy(dtype=float)
    kl_vals = []
    epsilon = 1e-10
    sub_sentences = []
    

    for sentence_ind in tqdm(range(len(sentences)), desc=f"Processing Sentences for doc {df_i+1}", leave=False):
        sub_sentences.append(sentences[sentence_ind])
        counter = get_word_distribution(sub_sentences, words)
        ps_vals = np.array(list(counter.values()))
        ps_vals = ps_vals / ps_vals.sum()
        pd_vals_safe = np.clip(pd_vals, epsilon, None)
        ps_vals_safe = np.clip(ps_vals, epsilon, None)
        final_arr = pd_vals_safe * np.log(pd_vals_safe / ps_vals_safe)
        final_arr = np.nan_to_num(final_arr)
        kl_vals.append(np.sum(final_arr))

    kl_vals = np.array(kl_vals)
    summary = " ".join(sentences[0:np.argmin(kl_vals) + 1])
    summaries.append(summary)



  ps_vals = ps_vals / ps_vals.sum()
Processing Documents: 100%|██████████| 1000/1000 [06:20<00:00,  2.63it/s]


## Topics approach

In [302]:
from sklearn.decomposition import LatentDirichletAllocation, NMF

df.dropna(inplace=True)
vectorizer = TfidfVectorizer(use_idf=False)

sparse_matrix = vectorizer.fit_transform(df['content'])
lda = LatentDirichletAllocation(n_components=30, random_state=42)
topics = lda.fit_transform(sparse_matrix)


In [303]:
topics = lda.fit_transform(sparse_matrix)

In [304]:

summaries = []
count = 0
for df_i, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Documents"):
    text = df.loc[df_i, 'content']
    sentences = sent_tokenize(text)
    doc_topic_distribution = topics[count]
    curr_sentence = []
    kls = []

    for sentence in sentences:
        curr_sentence.append(sentence)
        curr_sentence_joined = " ".join(curr_sentence)
        sparse_matrix = vectorizer.transform([curr_sentence_joined])
        sub_sen_topic_distribution = lda.transform(sparse_matrix)
        kl_score = np.sum(doc_topic_distribution * np.log(doc_topic_distribution/ sub_sen_topic_distribution))
        kls.append(kl_score)
    kls = np.array(kls)
    summaries.append(" ".join(sentences[0:np.argmin(kls) + 1]))
    count += 1

Processing Documents: 100%|██████████| 1000/1000 [00:24<00:00, 40.85it/s]


# DUC2001

## Using words

In [309]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

df = pd.read_csv('DUC2001.csv')
df.dropna(inplace=True)

vectorizer = TfidfVectorizer(use_idf=True)
sparse_matrix = vectorizer.fit_transform(df['content'])

sparse_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=vectorizer.get_feature_names_out())

In [310]:
sparse_df.shape

(300, 5819)

In [311]:
from tqdm import tqdm
summaries = []

for df_i, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Documents"):
    text = row['content']
    sentences = sent_tokenize(text)
    pd_vals = sparse_x.iloc[df_i].to_numpy(dtype=float)
    kl_vals = []
    epsilon = 1e-10
    sub_sentences = []
    

    for sentence_ind in tqdm(range(len(sentences)), desc=f"Processing Sentences for doc {df_i+1}", leave=False):
        sub_sentences.append(sentences[sentence_ind])
        counter = get_word_distribution(sub_sentences, words)
        ps_vals = np.array(list(counter.values()))
        ps_vals = ps_vals / ps_vals.sum()
        pd_vals_safe = np.clip(pd_vals, epsilon, None)
        ps_vals_safe = np.clip(ps_vals, epsilon, None)
        final_arr = pd_vals_safe * np.log(pd_vals_safe / ps_vals_safe)
        final_arr = np.nan_to_num(final_arr)
        kl_vals.append(np.sum(final_arr))

    kl_vals = np.array(kl_vals)
    summary = " ".join(sentences[0:np.argmin(kl_vals) + 1])
    summaries.append(summary)


  ps_vals = ps_vals / ps_vals.sum()
Processing Documents: 100%|██████████| 300/300 [00:30<00:00,  9.90it/s]


In [312]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1'])
rouge_1 = []
expected_summaries = df['summary'].tolist()

for expected, generated in zip(expected_summaries, summaries):
    scores = scorer.score(expected, generated)
    rouge_1.append(scores['rouge1'].fmeasure)

avg_rouge_1 = sum(rouge_1) / len(rouge_1)
    
print(f"ROUGE-1:",  avg_rouge_1)



ROUGE-1: 0.2233361130154441


## Using topics

In [240]:
df.head()

Unnamed: 0,content,summary
0,Some 40 members of Congress have joined with ...,A coalition of members of Congress announced W...
1,"Multitudes of native peoples, tourists and sc...","Thousands of peole prayed, cheered, danced, be..."
2,Population experts say that little would chan...,If the two sides trying to force changes in th...
3,The unofficial tornado season runs from April...,Rumbling spring thunderstorms have announced t...
4,"William Gray, a hurricane expert, predicts mo...",A hurricane expert predicts a turbulent summer...


In [None]:
from sklearn.decomposition import LatentDirichletAllocation, NMF

df.dropna(inplace=True)
vectorizer = TfidfVectorizer(use_idf=False)

sparse_matrix = vectorizer.fit_transform(df['content'])
lda = LatentDirichletAllocation(n_components=30, random_state=42)
topics = lda.fit_transform(sparse_matrix)


In [246]:
topics = lda.fit_transform(sparse_matrix)

In [None]:
summaries = []
count = 0
for df_i, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Documents"):
    text = df.loc[df_i, 'content']
    sentences = sent_tokenize(text)
    doc_topic_distribution = topics[count]
    curr_sentence = []
    kls = []

    for sentence in sentences:
        curr_sentence.append(sentence)
        curr_sentence_joined = " ".join(curr_sentence)
        sparse_matrix = vectorizer.transform([curr_sentence_joined])
        sub_sen_topic_distribution = lda.transform(sparse_matrix)
        kl_score = np.sum(doc_topic_distribution * np.log(doc_topic_distribution/ sub_sen_topic_distribution))
        kls.append(kl_score)
    kls = np.array(kls)
    summaries.append(" ".join(sentences[0:np.argmin(kls) + 1]))
    count += 1

Processing Documents: 100%|██████████| 300/300 [00:01<00:00, 248.39it/s]


In [272]:
len(summaries)

300

In [None]:
from rouge_score import rouge_scorer


scorer = rouge_scorer.RougeScorer(['rouge1'])
rouge_1 = []
    
for expected, generated in zip(expected_summaries, summaries):
    scores = scorer.score(expected, generated)
    rouge_1.append(scores['rouge1'].fmeasure)

avg_rouge_1 = sum(rouge_1) / len(rouge_1)
    
print(f"ROUGE-1:",  avg_rouge_1)



ROUGE-1: 0.2521454000827042
