In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
from gensim import parsing
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings 
warnings.filterwarnings('ignore')
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import NMF, LatentDirichletAllocation
import glob
from bs4 import BeautifulSoup

In [2]:
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from rouge import Rouge

# Load the spaCy 'en_core_web_sm' pipeline once at module level;
# lemmatize_text() below reuses this shared `nlp` object for every document.
nlp = spacy.load('en_core_web_sm')

def preprocess_string(text, filters):
    """Pipe ``text`` through every callable in ``filters`` and tokenize the result.

    Note: this definition replaces the ``preprocess_string`` imported from
    ``gensim.parsing`` earlier in the notebook.

    Args:
        text: Input string handed to the first filter.
        filters: Iterable of one-argument callables, applied in order; each
            one receives the value returned by the previous one.

    Returns:
        The whitespace-separated tokens of the fully filtered text, as a list.
    """
    filtered = text
    for apply_filter in filters:
        filtered = apply_filter(filtered)
    tokens = filtered.split()
    return tokens

def strip_multiple_whitespaces(text):
    """Replace every run of whitespace characters in ``text`` with one space.

    Note: this definition replaces the ``strip_multiple_whitespaces`` imported
    from ``gensim.parsing`` earlier in the notebook.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def transform_to_lower(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def remove_emails(text):
    """Delete every ``<non-space>@<non-space>`` token from ``text``.

    Only the matched token is removed; the whitespace around it is kept, so
    a removed address leaves its neighbouring spaces behind.
    """
    email_like = re.compile(r'\S+@\S+')
    return email_like.sub('', text)

def remove_stopwords(text):
    """Drop the whitespace-separated words of ``text`` that are in ``STOP_WORDS``.

    ``text`` is coerced with ``str()`` first. Membership is an exact match
    against ``STOP_WORDS``, so the words are compared as written (no case
    folding is done here). The surviving words are re-joined with single
    spaces.

    Note: this definition replaces the ``remove_stopwords`` imported from
    ``gensim.parsing`` earlier in the notebook.
    """
    kept_words = []
    for word in str(text).split():
        if word in STOP_WORDS:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

def remove_punctuations(text):
    """Remove every character that is neither a word character nor whitespace.

    Because ``\\w`` includes the underscore, underscores are kept.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

def remove_numbers(text):
    """Strip every run of digits from ``text``, leaving other characters as-is."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def lemmatize_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The text is parsed with the module-level spaCy pipeline ``nlp``; every
    token it yields contributes its ``lemma_`` to the output, in order.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

def cleaningPipe(document):
    """Clean one document and return its tokens.

    The steps run in this order: e-mail removal, whitespace collapsing,
    lower-casing, lemmatization. The result is then split on whitespace by
    ``preprocess_string``.

    Args:
        document: Raw document text.

    Returns:
        List of tokens produced by ``preprocess_string``.
    """
    cleaning_steps = [
        remove_emails,
        strip_multiple_whitespaces,
        transform_to_lower,
        lemmatize_text,
    ]
    return preprocess_string(document, cleaning_steps)

def joinList(processed_words):
    """Concatenate the strings in ``processed_words`` with single spaces."""
    separator = ' '
    return separator.join(processed_words)


2023-04-03 03:24:34.757101: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-03 03:24:36.284768: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /shared/centos7/cuda/11.2/lib64:/shared/centos7/anaconda3/2022.05/lib:/shared/centos7/nodejs/14.15.4/lib:/home/patel.ayushj/.conda/envs/nlp-tf/lib/
2023-04-03 03:24:36.284885: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory;

In [3]:
# Build the raw corpus table: for every entry in the corpus directory, read the
# matching file under `<path>/Summaries/` and split it into summary and body.
path = 'DUC2001'

# Markers that delimit the two sections inside each summary file.
ABSTRACT_MARKER = 'Abstract:'
INTRO_MARKER = 'Introduction:'

# Bookkeeping files in the corpus directory that are not articles.
# (The previous `filename in 'notes.txt'` was a substring test, so names such
# as 'notes' or 'txt' were skipped as well.)
NON_ARTICLE_FILES = ('annotations.txt', 'notes.txt')


def split_summary_file(text):
    """Split the text of one summary file into ``(summary, content)``.

    The summary is whatever lies between the first ``'Abstract:'`` marker and
    the first ``'Introduction:'`` marker; the content is everything after the
    ``'Introduction:'`` marker. Both pieces are returned unstripped.

    Args:
        text: Full text of a summary file.

    Returns:
        Tuple ``(summary, content)`` of raw substrings.

    Raises:
        ValueError: If either marker is absent. Without this check the ``-1``
            returned by ``str.find`` would be used as a slice index and
            silently yield a wrong slice.
    """
    abstract_at = text.find(ABSTRACT_MARKER)
    intro_at = text.find(INTRO_MARKER)
    if abstract_at == -1 or intro_at == -1:
        raise ValueError("missing 'Abstract:' or 'Introduction:' marker")
    summary = text[abstract_at + len(ABSTRACT_MARKER):intro_at]
    content = text[intro_at + len(INTRO_MARKER):]
    return summary, content


data = { 'Article' : [] , 'Content' : [] , 'Summary' : [] }

# (filename, reason) for every entry that could not be loaded, so failures can
# be inspected instead of disappearing.
skipped = []

for name in glob.glob(path + '/*'):

    filename = os.path.basename(name)

    if filename in NON_ARTICLE_FILES:
        continue

    try:
        # NOTE(review): the file is opened with the platform default encoding,
        # as before -- confirm the corpus encoding.
        with open(path + '/Summaries/{}.txt'.format(filename.lower())) as file:
            summaries, contents = split_summary_file(file.read())
    except (OSError, ValueError) as err:
        # OSError: the summary file is missing or unreadable.
        # ValueError: a marker is missing, or the bytes could not be decoded
        # (UnicodeDecodeError is a ValueError subclass).
        skipped.append((filename, repr(err)))
        continue

    data['Article'].append(filename)
    data['Summary'].append(summaries.strip().replace('\n', ' '))
    data['Content'].append(contents.strip().replace('\n', ' ').replace('    ', ' ').replace(' \x1a', ''))

In [4]:
# Turn the column lists collected in `data` into a DataFrame
# (columns: Article, Content, Summary).
DUC = pd.DataFrame(data)
# Bare last expression so the notebook renders the frame.
DUC

Unnamed: 0,Article,Content,Summary
0,AP890314-0237,"Inside a small motor home, Joanne Pierluissi r...","San Antonio, Texas, with a 50% Hispanic popula..."
1,LA041889-0039,Out of the horn of Africa has emerged the most...,"A number of years ago, Ethiopian athletes came..."
2,FT923-5089,"THERE are growing signs that Hurricane Andrew,...","Hurricane Andrew, the costliest disaster to hi..."
3,AP880811-0299,An annual Agriculture Department survey confir...,President Reagan has signed a $3.9 billion dro...
4,AP900322-0192,A stone's throw from the smelly Smithfield mea...,The De Beers diamond cartel faces declining sa...
...,...,...,...
296,LA092189-0123,A French DC-10 jetliner with 171 people aboard...,A French DC-10 jetliner with 171 on board expl...
297,SJMN91-06193235,It's E-Day. But before you rush out to see whe...,"In San Jose, the eclipse will begin at 10:10 a..."
298,FT934-9116,THE FIGHT over the North American Free Trade A...,After Vice President Gore's debate victory ove...
299,AP900629-0260,It's been described as the largest current civ...,"The ""Chunnel"" between Britain and France is ha..."


In [5]:
# Run every article body through cleaningPipe (which returns tokens) and glue
# the tokens back into one space-separated string per row.
DUC["Clean_Content"] = DUC["Content"].apply(
    lambda document: joinList(cleaningPipe(document))
)
# Bare last expression so the notebook renders the updated frame.
DUC

Unnamed: 0,Article,Content,Summary,Clean_Content
0,AP890314-0237,"Inside a small motor home, Joanne Pierluissi r...","San Antonio, Texas, with a 50% Hispanic popula...","inside a small motor home , joanne pierluissi ..."
1,LA041889-0039,Out of the horn of Africa has emerged the most...,"A number of years ago, Ethiopian athletes came...",out of the horn of africa have emerge the most...
2,FT923-5089,"THERE are growing signs that Hurricane Andrew,...","Hurricane Andrew, the costliest disaster to hi...","there be grow sign that hurricane andrew , unw..."
3,AP880811-0299,An annual Agriculture Department survey confir...,President Reagan has signed a $3.9 billion dro...,an annual agriculture department survey confir...
4,AP900322-0192,A stone's throw from the smelly Smithfield mea...,The De Beers diamond cartel faces declining sa...,a stone 's throw from the smelly smithfield me...
...,...,...,...,...
296,LA092189-0123,A French DC-10 jetliner with 171 people aboard...,A French DC-10 jetliner with 171 on board expl...,a french dc-10 jetliner with 171 people aboard...
297,SJMN91-06193235,It's E-Day. But before you rush out to see whe...,"In San Jose, the eclipse will begin at 10:10 a...",it be e - day . but before you rush out to see...
298,FT934-9116,THE FIGHT over the North American Free Trade A...,After Vice President Gore's debate victory ove...,the fight over the north american free trade a...
299,AP900629-0260,It's been described as the largest current civ...,"The ""Chunnel"" between Britain and France is ha...",it be be describe as the large current civil e...


In [6]:
# Upper bound on the number of documents (rows) kept for model fitting.
num_points = 1000

# TF-IDF weighting of the cleaned articles. Per the scikit-learn parameter
# semantics, min_df = 2 drops terms seen in fewer than two documents and
# max_df = 0.95 drops terms seen in more than 95% of them; English stop words
# are removed as well.
vectorizer = TfidfVectorizer(max_df = 0.95, min_df = 2, stop_words = "english")
duc_data = vectorizer.fit_transform(np.asarray(DUC["Clean_Content"]))

# Dense document-term frame, truncated to the first `num_points` rows.
X_train = pd.DataFrame(duc_data.toarray()).iloc[:num_points, :]

In [7]:
# Fit one online-learning LDA model per topic count K (10, then 20, then 50)
# on the same document-term matrix. Every model uses random_state = 42 and
# n_jobs = -1.
lda_10, lda_20, lda_50 = [
    LatentDirichletAllocation(
        n_components = n_topics,
        random_state = 42,
        learning_method = 'online',
        n_jobs = -1,
    ).fit(X_train)
    for n_topics in (10, 20, 50)
]

# Bare last expression: the cell still displays the fitted K = 50 estimator.
lda_50

LatentDirichletAllocation(learning_method='online', n_components=50, n_jobs=-1,
                          random_state=42)

In [8]:
# Print the 20 highest-weighted words of every topic in the lda_10 model.
# The number next to each word is its components_ entry divided by the topic's
# total, rounded to 5 decimals. Words are listed in ascending order, so the
# strongest word of a topic is printed last.

# The vocabulary does not change between topics: build it once instead of once
# per printed word.
feature_names = vectorizer.get_feature_names_out()

for i, topic in enumerate(lda_10.components_):
    print(f"TOPIC {i}:")
    top_indices = topic.argsort()[-20:]   # indices of the 20 largest entries
    topic_total = topic.sum()             # computed once per topic (was 20 Python-level sums)
    top_words = [feature_names[index] for index in top_indices]
    top_probs = [round((topic[index] / topic_total), 5) for index in top_indices]
    for word, prob in zip(top_words, top_probs):
        print(f"{word} ({prob})")
    print("\n")

TOPIC 0:
official (0.00017)
mile (0.00017)
guard (0.00017)
march (0.00017)
coast (0.00017)
barrel (0.00017)
company (0.00017)
sea (0.00017)
ship (0.00017)
sound (0.00017)
prince (0.00017)
wildlife (0.00018)
tanker (0.00019)
cleanup (0.00019)
say (0.00019)
alaska (0.0002)
valdez (0.00026)
spill (0.00029)
exxon (0.00029)
oil (0.00033)


TOPIC 1:
digging (0.00016)
beach (0.00016)
park (0.00016)
marathon (0.00016)
wednesday (0.00016)
sgt (0.00016)
hispanic (0.00016)
slovenia (0.00016)
nafta (0.00017)
tablerow (0.00017)
mr (0.00017)
jackson (0.00017)
rowrule (0.00017)
crash (0.00017)
police (0.00017)
say (0.00017)
chj (0.00019)
cvj (0.00019)
tablecell (0.00024)
cellrule (0.00026)


TOPIC 2:
new (0.00126)
taylor (0.00128)
term (0.00129)
official (0.00129)
storm (0.00129)
gun (0.00132)
mile (0.00133)
oil (0.00135)
diamond (0.00145)
crash (0.00148)
slovenia (0.00149)
tunnel (0.00154)
people (0.00156)
eclipse (0.00158)
state (0.00158)
police (0.00188)
mr (0.00191)
year (0.00221)
hurricane (0.00

In [9]:
# Print the 20 highest-weighted words of every topic in the lda_20 model.
# The number next to each word is its components_ entry divided by the topic's
# total, rounded to 5 decimals. Words are listed in ascending order, so the
# strongest word of a topic is printed last.

# The vocabulary does not change between topics: build it once instead of once
# per printed word.
feature_names = vectorizer.get_feature_names_out()

for i, topic in enumerate(lda_20.components_):
    print(f"TOPIC {i}:")
    top_indices = topic.argsort()[-20:]   # indices of the 20 largest entries
    topic_total = topic.sum()             # computed once per topic (was 20 Python-level sums)
    top_words = [feature_names[index] for index in top_indices]
    top_probs = [round((topic[index] / topic_total), 5) for index in top_indices]
    for word, prob in zip(top_words, top_probs):
        print(f"{word} ({prob})")
    print("\n")

TOPIC 0:
planning (0.00016)
individual (0.00016)
official (0.00016)
mexico (0.00016)
northwestern (0.00016)
frame (0.00016)
repeatedly (0.00016)
knight (0.00016)
steady (0.00016)
osman (0.00016)
immune (0.00016)
hahn (0.00016)
colosio (0.00016)
permit (0.00016)
funnel (0.00016)
geyser (0.00016)
dole (0.00016)
march (0.00017)
1993 (0.00017)
corazon (0.00017)


TOPIC 1:
700 (0.00017)
pilot (0.00017)
poverty (0.00017)
engine (0.00017)
leslie (0.00017)
gates (0.00017)
acre (0.00017)
plane (0.00017)
air (0.00017)
mr (0.00017)
digging (0.00017)
diamond (0.00017)
family (0.00017)
year (0.00017)
slovenia (0.00017)
say (0.00018)
marathon (0.00018)
reform (0.0002)
crash (0.00021)
welfare (0.00027)


TOPIC 2:
shadow (0.00017)
tablerow (0.00017)
baja (0.00017)
flight (0.00017)
hawaii (0.00017)
sky (0.00018)
rowrule (0.00018)
park (0.00018)
glass (0.00018)
say (0.00018)
earthquake (0.00018)
solar (0.00019)
telescope (0.00019)
cvj (0.00019)
moon (0.00019)
chj (0.00019)
sun (0.00023)
tablecell (0.000

In [10]:
# Print the 20 highest-weighted words of every topic in the lda_50 model.
# The number next to each word is its components_ entry divided by the topic's
# total, rounded to 5 decimals. Words are listed in ascending order, so the
# strongest word of a topic is printed last.

# The vocabulary does not change between topics: build it once instead of once
# per printed word.
feature_names = vectorizer.get_feature_names_out()

for i, topic in enumerate(lda_50.components_):
    print(f"TOPIC {i}:")
    top_indices = topic.argsort()[-20:]   # indices of the 20 largest entries
    topic_total = topic.sum()             # computed once per topic (was 20 Python-level sums)
    top_words = [feature_names[index] for index in top_indices]
    top_probs = [round((topic[index] / topic_total), 5) for index in top_indices]
    for word, prob in zip(top_words, top_probs):
        print(f"{word} ({prob})")
    print("\n")

TOPIC 0:
funnel (0.00017)
individual (0.00017)
geyser (0.00017)
doctor (0.00017)
secondary (0.00017)
tuberculosis (0.00017)
permit (0.00017)
dole (0.00018)
bacterial (0.00018)
la (0.00018)
viral (0.00018)
say (0.00018)
disease (0.00018)
march (0.00018)
1993 (0.00018)
corazon (0.00018)
health (0.00018)
pneumonia (0.00019)
colosio (0.0002)
taylor (0.0002)


TOPIC 1:
distinction (0.00017)
edge (0.00017)
merrill (0.00017)
moist (0.00017)
conclude (0.00017)
marlin (0.00017)
jacques (0.00017)
wednesday (0.00017)
hayes (0.00017)
minister (0.00017)
embrace (0.00017)
surprise (0.00017)
vigorously (0.00017)
allergy (0.00017)
alyeska (0.00017)
tech (0.00017)
700 (0.00018)
optimistic (0.00018)
leslie (0.00018)
digging (0.00019)


TOPIC 2:
broadly (0.00017)
virtually (0.00017)
wrong (0.00017)
furor (0.00017)
fishery (0.00017)
odd (0.00017)
dc (0.00017)
paratrooper (0.00017)
opportunity (0.00017)
assist (0.00017)
cordial (0.00017)
poultry (0.00017)
easternmost (0.00017)
treasury (0.00017)
contrast (

# NMF (Non-negative Matrix Factorization)

In [11]:
# Fit one NMF model per topic count K (10, then 20, then 50) on the same
# document-term matrix. Every model uses random_state = 42.
nmf_10, nmf_20, nmf_50 = [
    NMF(n_components = n_topics, random_state = 42).fit(X_train)
    for n_topics in (10, 20, 50)
]

# Bare last expression: the cell still displays the fitted K = 50 estimator.
nmf_50

NMF(n_components=50, random_state=42)

In [12]:
# Print the 20 highest-weighted words of every topic in the nmf_10 model.
# NMF components are non-negative weights rather than probabilities; the number
# next to each word is its share of the topic's total weight, rounded to
# 5 decimals. Words are listed in ascending order, so the strongest word of a
# topic is printed last.

# The vocabulary does not change between topics: build it once instead of once
# per printed word.
feature_names = vectorizer.get_feature_names_out()

for i, topic in enumerate(nmf_10.components_):
    print(f"TOPIC {i}:")
    top_indices = topic.argsort()[-20:]   # indices of the 20 largest entries
    topic_total = topic.sum()             # computed once per topic (was 20 Python-level sums)
    top_words = [feature_names[index] for index in top_indices]
    top_probs = [round((topic[index] / topic_total), 5) for index in top_indices]
    for word, prob in zip(top_words, top_probs):
        print(f"{word} ({prob})")
    print("\n")

TOPIC 0:
gun (0.0034)
vote (0.0036)
court (0.00392)
say (0.00394)
amendment (0.00394)
right (0.00414)
reform (0.00447)
illegal (0.00476)
congress (0.00485)
mr (0.00525)
house (0.0053)
state (0.00544)
alien (0.00574)
count (0.00579)
term (0.00631)
thomas (0.00659)
limit (0.00664)
census (0.00749)
nafta (0.00777)
welfare (0.01047)


TOPIC 1:
louisiana (0.00421)
damage (0.00425)
coast (0.00441)
satellite (0.0045)
miami (0.00505)
center (0.00516)
forecaster (0.00653)
season (0.0066)
hugo (0.00681)
gray (0.00697)
gilbert (0.00732)
say (0.00745)
florida (0.00748)
tropical (0.00788)
mph (0.00828)
atlantic (0.00875)
wind (0.00888)
sheet (0.00999)
storm (0.01779)
hurricane (0.05268)


TOPIC 2:
prince (0.00419)
company (0.00435)
reef (0.0044)
beach (0.00441)
guard (0.00445)
hazelwood (0.00448)
coast (0.00453)
gallon (0.00458)
million (0.00466)
mile (0.00504)
sound (0.00543)
ship (0.00583)
tanker (0.0082)
say (0.00839)
alaska (0.00931)
cleanup (0.00999)
valdez (0.01924)
spill (0.02508)
exxon (0.0

In [13]:
# Print the 20 highest-weighted words of every topic in the nmf_20 model.
# NMF components are non-negative weights rather than probabilities; the number
# next to each word is its share of the topic's total weight, rounded to
# 5 decimals. Words are listed in ascending order, so the strongest word of a
# topic is printed last.

# The vocabulary does not change between topics: build it once instead of once
# per printed word.
feature_names = vectorizer.get_feature_names_out()

for i, topic in enumerate(nmf_20.components_):
    print(f"TOPIC {i}:")
    top_indices = topic.argsort()[-20:]   # indices of the 20 largest entries
    topic_total = topic.sum()             # computed once per topic (was 20 Python-level sums)
    top_words = [feature_names[index] for index in top_indices]
    top_probs = [round((topic[index] / topic_total), 5) for index in top_indices]
    for word, prob in zip(top_words, top_probs):
        print(f"{word} ({prob})")
    print("\n")

TOPIC 0:
mph (0.00451)
mile (0.00457)
texas (0.00477)
car (0.00481)
county (0.00493)
damage (0.00505)
shelter (0.00533)
death (0.00546)
people (0.00556)
record (0.00557)
thunderstorm (0.00566)
injure (0.0058)
say (0.00584)
path (0.00647)
home (0.0068)
average (0.00689)
storm (0.00737)
weather (0.00778)
twister (0.012)
tornado (0.0632)


TOPIC 1:
louisiana (0.00425)
sahel (0.0043)
coast (0.00461)
satellite (0.00474)
center (0.00525)
miami (0.00532)
forecaster (0.00693)
season (0.00702)
hugo (0.00724)
gray (0.00748)
say (0.0076)
florida (0.00769)
gilbert (0.00784)
tropical (0.0084)
mph (0.0085)
wind (0.0093)
atlantic (0.00937)
sheet (0.01073)
storm (0.01833)
hurricane (0.05616)


TOPIC 2:
prince (0.00434)
company (0.00447)
reef (0.00457)
beach (0.00459)
guard (0.00462)
hazelwood (0.00465)
coast (0.00469)
gallon (0.00475)
million (0.00477)
mile (0.00503)
sound (0.00563)
ship (0.00605)
tanker (0.00851)
say (0.00856)
alaska (0.00965)
cleanup (0.01037)
valdez (0.01998)
spill (0.02604)
exxon 

In [14]:
# Print the 20 highest-weighted words of every topic in the nmf_50 model.
# NMF components are non-negative weights rather than probabilities; the number
# next to each word is its share of the topic's total weight, rounded to
# 5 decimals. Words are listed in ascending order, so the strongest word of a
# topic is printed last.

# The vocabulary does not change between topics: build it once instead of once
# per printed word.
feature_names = vectorizer.get_feature_names_out()

for i, topic in enumerate(nmf_50.components_):
    print(f"TOPIC {i}:")
    top_indices = topic.argsort()[-20:]   # indices of the 20 largest entries
    topic_total = topic.sum()             # computed once per topic (was 20 Python-level sums)
    top_words = [feature_names[index] for index in top_indices]
    top_probs = [round((topic[index] / topic_total), 5) for index in top_indices]
    for word, prob in zip(top_words, top_probs):
        print(f"{word} ({prob})")
    print("\n")

TOPIC 0:
saturday (0.00504)
wind (0.00511)
line (0.00561)
wyoming (0.00566)
monday (0.00604)
service (0.0062)
lightning (0.00625)
utah (0.00626)
crew (0.00704)
wilderness (0.00746)
rain (0.00752)
000 (0.00808)
contain (0.00852)
national (0.00985)
say (0.01107)
blaze (0.01349)
burn (0.01517)
firefighter (0.02096)
forest (0.03247)
acre (0.0345)


TOPIC 1:
coast (0.00512)
predict (0.00513)
caribbean (0.00522)
category (0.00533)
sahel (0.00541)
satellite (0.00607)
center (0.0064)
hugo (0.0083)
say (0.00861)
season (0.00881)
forecaster (0.00914)
gray (0.00957)
gilbert (0.0104)
tropical (0.0105)
mph (0.01088)
wind (0.01162)
atlantic (0.0121)
sheet (0.01428)
storm (0.02115)
hurricane (0.06724)


TOPIC 2:
company (0.00462)
prince (0.00464)
beach (0.00473)
million (0.00487)
guard (0.00489)
reef (0.0049)
coast (0.00492)
hazelwood (0.00497)
mile (0.00501)
gallon (0.00506)
sound (0.00599)
ship (0.0064)
say (0.00861)
tanker (0.00912)
alaska (0.01033)
cleanup (0.01109)
valdez (0.02138)
spill (0.0278