In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
import warnings 
warnings.filterwarnings('ignore')
import nltk
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
from gensim import parsing
from sklearn.datasets import fetch_20newsgroups
import re
import math
import glob
import os

In [2]:
import nltk
nltk.download('punkt')

def kl_summarize (doc_data, num_of_sentences):
  summaries = []
  for document, file_name in doc_data:
    vectorizer = TfidfVectorizer(sublinear_tf = True, stop_words = 'english')
    document_model = vectorizer.fit_transform([document])
    summary = []
    picked_sentences = set()
    for _ in range(num_of_sentences):
      this_sentence = ''
      this_sentence_score = float('-inf')
      this_sentence_index = 0
      for sentence in nltk.sent_tokenize(document):
        # print('sampling sentence: ', sentence, '\n')
        if this_sentence_index in picked_sentences: continue
        new_sentences = list(map(lambda x:x[0], summary))
        new_sentences.append(sentence)
        kl_score = kl_similarity(document_model.T.toarray(), vectorizer.transform([' '.join(new_sentences)]).T.toarray())
        if kl_score > this_sentence_score:
          this_sentence_score = kl_score
          this_sentence = (sentence, this_sentence_index)
        this_sentence_index += 1
      if this_sentence != '':
        summary.append(this_sentence)
        picked_sentences.add(this_sentence[1])

    summary = sorted(summary, key = lambda x: x[1]) 
    summaries.append(' '.join(list(map(lambda x: x[0], summary))))
  return summaries

[nltk_data] Downloading package punkt to
[nltk_data]     /home/patel.ayushj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import math
def kl_similarity(p, q) :
  kl = 0 
  lambda_param = 0.1
  for i in range(p.shape[0]): 
    p_i = p[i]
    q_i = q[i]
    kl += p_i * math.log( (p_i + lambda_param) / (q_i + ( lambda_param * p_i.shape[0] ) ) )
  return kl

In [4]:
ng_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')).data
ng_data = ng_data[27:29]

summaries = kl_summarize(zip(ng_data,ng_data), 2)
for i, summary in enumerate(summaries):
    print(f'Summary {i+1}: {summary}\n')

Summary 1: Using the VMODE command, all you need to do is type VMODE VESA at the dos
prompt. VMODE is included with the Speedstar 24.

Summary 2: Everything is less than a year old. Thanks.



In [5]:
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from rouge import Rouge

nlp = spacy.load('en_core_web_sm')

def preprocess_string(text, filters):
    for f in filters:
        text = f(text)
    return text.split()

def strip_multiple_whitespaces(text):
    return re.sub(r'\s+', ' ', text)

def transform_to_lower(text):
    return text.lower()

def remove_emails(text):
    return re.sub(r'\S+@\S+', '', text)

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOP_WORDS])

def remove_punctuations(text):
    return re.sub(r'[^\w\s]','',text)

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

def cleaningPipe(document):
    processed_words = preprocess_string(document, [
        remove_emails,
        strip_multiple_whitespaces, 
        transform_to_lower,
        lemmatize_text
    ])
    
    return processed_words

def joinList(processed_words):
    return ' '.join(processed_words)


2023-04-03 03:20:30.043724: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-03 03:20:31.844472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /shared/centos7/cuda/11.2/lib64:/shared/centos7/anaconda3/2022.05/lib:/shared/centos7/nodejs/14.15.4/lib:/home/patel.ayushj/.conda/envs/nlp-tf/lib/
2023-04-03 03:20:31.844653: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory;

In [6]:
path = 'DUC2001'

contents = []
summaries = []

data = { 'Article' : [] , 'Content' : [] , 'Summary' : [] }

for name in glob.glob(path + '/*'):
    
    filename  = os.path.basename(name)
    contents = ''
    summaries = ''

    try:
        if filename == 'annotations.txt' or filename in 'notes.txt':
            continue
            
        with open(path + '/Summaries/{}.txt'.format(filename.lower())) as file:
            f = file.read()
            abs = f.find('Abstract:')
            len_abs = len('Abstract:')
            intr = f.find('Introduction:')
            len_intr = len('Introduction:')
            
            summaries = f[(abs+len_abs):intr] 
            contents = f[(intr+len_intr):]
            
    except:
        continue
        
    data['Article'].append(filename)
    data['Summary'].append(summaries.strip().replace('\n', ' '))
    data['Content'].append(contents.strip().replace('\n', ' ').replace('    ', ' ').replace(' \x1a', ''))

In [7]:
ducDF = pd.DataFrame(data)

In [8]:
ducDF

Unnamed: 0,Article,Content,Summary
0,AP890314-0237,"Inside a small motor home, Joanne Pierluissi r...","San Antonio, Texas, with a 50% Hispanic popula..."
1,LA041889-0039,Out of the horn of Africa has emerged the most...,"A number of years ago, Ethiopian athletes came..."
2,FT923-5089,"THERE are growing signs that Hurricane Andrew,...","Hurricane Andrew, the costliest disaster to hi..."
3,AP880811-0299,An annual Agriculture Department survey confir...,President Reagan has signed a $3.9 billion dro...
4,AP900322-0192,A stone's throw from the smelly Smithfield mea...,The De Beers diamond cartel faces declining sa...
...,...,...,...
296,LA092189-0123,A French DC-10 jetliner with 171 people aboard...,A French DC-10 jetliner with 171 on board expl...
297,SJMN91-06193235,It's E-Day. But before you rush out to see whe...,"In San Jose, the eclipse will begin at 10:10 a..."
298,FT934-9116,THE FIGHT over the North American Free Trade A...,After Vice President Gore's debate victory ove...
299,AP900629-0260,It's been described as the largest current civ...,"The ""Chunnel"" between Britain and France is ha..."


In [9]:
ducDF["cleaned_articles"] = ducDF["Content"].apply(cleaningPipe).apply(joinList)

In [10]:
DUC_data = ducDF['cleaned_articles']

In [11]:
DUC_data[0]

"inside a small motor home , joanne pierluissi raise her sleeve as nurse mary perez insert a needle into the vein above her forearm , draw blood into a tube for a diabetes test . as her daughter watch , pierluissi , 24 , say it be for they , as much as for herself , that she agree to be test for the deadly killer of hispanic . twelve million americans have some form of diabetes , but it most prevalent among minority , especially native americans , black and hispanic . hispanic be three time as likely to develop diabetes as the general population , and 40 percent of the 700,000 victim in texas be mexican - american . more than 150,000 americans die from diabete each year ; another 150,000 death be diabetes - relate , accord to the american diabetes association . no one really know what spark it , but researcher believe hispanic could hold the key . san antonio , the nation 's ninth large city , with a population that be 50 percent hispanic , be become the base for diabetes study . san a

In [12]:
ducDF["Summary"][0]

"San Antonio, Texas, with a 50% Hispanic population, has become a center for the study of diabetes in Hispanics. Dr. Ralph DeFronzo left his diabetes research center at Yale University to relocate to the University of Texas at San Antonio to study the local population. His researchers are reaching out in a mobile home which roams the neighborhoods and tests inhabitants for the disease. Dr. DeFronzo's researchers believe that poor Hispanics' diet of cheap processed foods, their lack of exercise and infrequent medical attention are the root causes of their predisposition. They believe education of children may be critical."

In [13]:
kl_summaries = kl_summarize(zip(DUC_data[100:101], DUC_data[100:101]) , 8)

for i in kl_summaries:
  print (i, '\n\n')

cloudy weather saturday threaten to mar the show for thousand of finnish and foreign skygazer hope to glimpse a total solar eclipse in this land of the midnight sun . the solar eclipse in finland start at 4:03 a.m. sunday ( 9:03 p.m. edt saturday ) . at that time , the moon will begin gradually move between the earth and the sun . the total eclipse begin at 4:52 a.m. in helsinki and will last 83 second . the eclipse end at 5:45 a.m. in helsinki . some eclipse viewer will not have to worry about the cloud , because they will be above they . 




In [14]:
from rouge import Rouge

rouge = Rouge()

references = [ducDF['Summary'][100]] 

scores = rouge.get_scores(kl_summaries, references, avg = True)

print(scores)

{'rouge-1': {'f': 0.350710895474046, 'p': 0.3523809523809524, 'r': 0.3490566037735849}, 'rouge-2': {'f': 0.10526315289485155, 'p': 0.10576923076923077, 'r': 0.10476190476190476}, 'rouge-l': {'f': 0.2535211217962707, 'p': 0.27692307692307694, 'r': 0.23376623376623376}}


<b>Observation: </b> <br>
As we can see that the model performs well for unigram that is the rouge-1 is 0.35. But when it comes to bigram model (rouge-2) we have a lower f1-score, precision and recall. Thus we can infer that the KL_summarizer is only good for unigram models

# Section 2

In [15]:
def lda_summarize(doc_data, num_of_sentences, num_of_topics=1000, num_of_top_words=20):
  summaries = []
  for document in doc_data:
    # LDA model
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    document_model = vectorizer.fit_transform([document])
    
    lda = LatentDirichletAllocation(n_components=num_of_topics, max_iter=20, random_state=42)
    lda.fit(document_model)

    # Get the most probable words for each topic
    feature_names = vectorizer.get_feature_names()
    topic_words = []
    for topic in lda.components_:
        word_idx = np.argsort(topic)[::-1][:num_of_top_words]
        topic_words.append([feature_names[i] for i in word_idx])

    # Get the topic distribution for each sentence
    sentence_topics = []
    for sentence in nltk.sent_tokenize(document):
        sentence_model = vectorizer.transform([sentence])
        sentence_topics.append(lda.transform(sentence_model)[0])

    # Pick the top sentences based on the topic diversity
    summary = []
    picked_sentences = set()
    while len(summary) < num_of_sentences:
        best_sentence = None
        best_score = 0
        for i, sentence in enumerate(sentence_topics):
            if i in picked_sentences: continue
            sentence_score = sum([sentence[j]*sentence_topics[j][k] for j in range(len(sentence_topics)) for k in range(num_of_topics)])
            if sentence_score > best_score:
                best_sentence = i
                best_score = sentence_score
        if best_sentence is None: break
        summary.append((nltk.sent_tokenize(document)[best_sentence], best_sentence))
        picked_sentences.add(best_sentence)

    summary = sorted(summary, key=lambda x: x[1])
    summaries.append(' '.join(list(map(lambda x: x[0], summary))))
  return summaries


In [16]:
ng_data = fetch_20newsgroups(subset = 'all', remove = ('headers', 'footers', 'quotes')).data
ng_data = ng_data[27:28]

summaries = lda_summarize(ng_data, 2)
for i, summary in enumerate(summaries):
    print(f'Summary {i+1}: {summary}\n')

Summary 1: Using the VMODE command, all you need to do is type VMODE VESA at the dos
prompt. VMODE is included with the Speedstar 24.



In [36]:
summaries = lda_summarize(DUC_data[100:101], 8)
for i, summary in enumerate(summaries):
    print(f'Summary {i+1}: {summary}\n')

Summary 1: cloudy weather saturday threaten to mar the show for thousand of finnish and foreign skygazer hope to glimpse a total solar eclipse in this land of the midnight sun . the weather forecast take a turn for the bad in the evening , when the finnish meteorological service predict cloudy weather with a chance of shower for eastern finland on sunday . in the eastern town of joensuu , a television news broadcast late saturday show it be already cloudy there with a light drizzle fall . the solar eclipse in finland start at 4:03 a.m. sunday ( 9:03 p.m. edt saturday ) . at that time , the moon will begin gradually move between the earth and the sun . the total eclipse begin at 4:52 a.m. in helsinki and will last 83 second . after the total phase of the eclipse , the moon will move away , uncover more and more of the sun . the eclipse end at 5:45 a.m. in helsinki .



In [37]:
ducDF['Summary'][100]

'A total eclipse will arc from southeast Finland, across northeastern U.S.S.R to the Alaskan Aleutian Islands. The best place for observation will be in the Soviet Union. In Helsinki, the total eclipse begins at 4:52 a.m. and will last 83 seconds. Best viewing in Finland will be in Joensuu, where about 10,000 people, including 3000 foreigners are expected to converge. In Joensuu the sun will be 5 degrees above the horizon, compared with 1 degree above in Helsinki. Skies may be cloudy. Some viewers will be above the clouds. Finnair, the national airline, and some private companies are offering eclipse viewing flights.'

In [38]:
from rouge import Rouge

rouge = Rouge()

references = [ducDF['Summary'][100]] 

scores = rouge.get_scores(summaries, references, avg = True)

print(scores)

{'rouge-1': {'f': 0.2527880993137188, 'p': 0.2085889570552147, 'r': 0.32075471698113206}, 'rouge-2': {'f': 0.07490636226907407, 'p': 0.06172839506172839, 'r': 0.09523809523809523}, 'rouge-l': {'f': 0.20858895207045816, 'p': 0.19767441860465115, 'r': 0.22077922077922077}}
