**Convention:** Code cells using referenced code begin with a comment `# REFERRED`. Code cells containing code I wrote begin with a comment `# MY CODE`. In cells that have both referenced and original code, the respective code parts are labelled.


# Load DATASET

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json

dataset_dir = '/content/drive/MyDrive/DATASET'
files = []
for (root,dirs,file) in os.walk(dataset_dir):
  files = file

json_files = []
text_files = []

for file in files:
  if '.json' in file:
    json_files.append(file)
  else:
    text_files.append(file)

print('json files:', len(json_files))
print('text files:', len(text_files))

json files: 48
text files: 48


# Pre-processing

In [3]:
import numpy as np
import pandas as pd

*Reference: https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf*

In [4]:
# REFERRED

## defining all utilty functions - needed for Data cleaning and processing

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
import nltk

# Contraction map
c_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you you will",
    "you'll've": "you you will have",
    "you're": "you are",
    "you've": "you have"
}

# Compiling the contraction dict
c_re = re.compile('(%s)' % '|'.join(c_dict.keys()))

# List of stop words
add_stop = ['said', 'say', '...', 'like', 'cnn', 'ad']
stop_words = ENGLISH_STOP_WORDS.union(add_stop)

# List of punctuation
punc = list(set(string.punctuation))


# Splits words on white spaces (leaves contractions intact) and splits out
# trailing punctuation
def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens


def expandContractions(text, c_re=c_re):
    def replace(match):
        return c_dict[match.group(0)]
    return c_re.sub(replace, text)


def process_text(text):
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    text = [re.sub('[0-9]+', '', each) for each in text]
    text = [expandContractions(each, c_re=c_re) for each in text]
    text = [SnowballStemmer('english').stem(each) for each in text]
    text = [w for w in text if w not in punc]
    text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return text


def top_words(topic, n_top_words):
    return topic.argsort()[:-n_top_words - 1:-1]


def topic_table(model, feature_names, n_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        t = (topic_idx)
        topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)]
    return pd.DataFrame(topics)


def whitespace_tokenizer(text):
    pattern = r"(?u)\b\w\w+\b"
    tokenizer_regex = RegexpTokenizer(pattern)
    tokens = tokenizer_regex.tokenize(text)
    return tokens


# Funtion to remove duplicate words
def unique_words(text):
    ulist = []
    [ulist.append(x) for x in text if x not in ulist]
    return ulist


def word_count(text):
    return len(str(text).split(' '))

In [5]:
# REFERRED

# Removing stemming step as it is not required for evaluating BERTopic

def process_text2(text):
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    text = [re.sub('[0-9]+', '', each) for each in text]
    text = [expandContractions(each, c_re=c_re) for each in text]
    # text = [SnowballStemmer('english').stem(each) for each in text]
    text = [w for w in text if w not in punc]
    text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return text

In [6]:
# REFERRED

# Removing punctuations for evaluating noun phrase approach

def process_text3(text):
    text = casual_tokenizer(text)
    # text = [each.lower() for each in text]
    # text = [re.sub('[0-9]+', '', each) for each in text]
    # text = [expandContractions(each, c_re=c_re) for each in text]
    # text = [SnowballStemmer('english').stem(each) for each in text]
    text = [w for w in text if w not in punc]
    # text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return text

# Noun-Phrase Approach

In [7]:
import spacy

# MY CODE
def get_noun_phrases(text):
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(text)
  noun_phrases = [chunk.text for chunk in doc.noun_chunks if chunk.root.pos_ != 'PRON']
  return noun_phrases

In [8]:
def tm_noun_phase(filename):
  # MY CODE
  with open(dataset_dir+'/'+filename,'r') as dataset:
    data = json.load(dataset)

  comments = data['comments']

  topic_list_noun_phrase = []
  for comment in comments:
    noun_phrases = get_noun_phrases(comment)
    topic_list_noun_phrase.append(' '.join(noun_phrases))

  with open(f'[NP_TOPICS_LIST]{filename}','w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(topic_list_noun_phrase,indent=4))

In [9]:
# MY CODE
# for file in json_files:
#   try:
#     tm_noun_phase(file)
#   except:
#     print(file)
#     continue

# Non-negative Matrix Factorization (NMF)

*Reference: https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf*

In [10]:
# !pip install gensim

In [67]:
from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel
from operator import itemgetter

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

def tm_nmf(filename):
  #------------
  # MY CODE
  with open(dataset_dir+'/'+filename,'r') as dataset:
    data = json.load(dataset)

  processed_comments = list(map(process_text,data['comments']))

  #-------------

  # REFERRED
  # Use Gensim's NMF to get the best num of topics via coherence score
  texts = processed_comments

  # Create a dictionary
  # In gensim a dictionary is a mapping between words and their integer id
  dictionary = Dictionary(texts)

  # Filter out extremes to limit the number of features
  dictionary.filter_extremes(
      no_below=3,
      no_above=0.85,
      keep_n=5000
  )

  # Create the bag-of-words format (list of (token_id, token_count))
  corpus = [dictionary.doc2bow(text) for text in texts]

  # Create a list of the topic numbers we want to try
  topic_nums = list(np.arange(5, 75 + 1, 5))

  # Run the nmf model and calculate the coherence score
  # for each number of topics
  coherence_scores = []

  for num in topic_nums:
      nmf = Nmf(
          corpus=corpus,
          num_topics=num,
          id2word=dictionary,
          chunksize=2000,
          passes=5,
          kappa=.1,
          minimum_probability=0.01,
          w_max_iter=300,
          w_stop_condition=0.0001,
          h_max_iter=100,
          h_stop_condition=0.001,
          eval_every=10,
          normalize=True,
          random_state=42
      )

      # Run the coherence model to get the score
      cm = CoherenceModel(
          model=nmf,
          texts=texts,
          dictionary=dictionary,
          coherence='c_npmi'    # ------------CHANGED-----------
      )

      coherence_scores.append(round(cm.get_coherence(), 5))

  # Get the number of topics with the highest coherence score
  scores = list(zip(topic_nums, coherence_scores))
  best_num_topics = sorted(scores, key=itemgetter(1), reverse=True)[0][0]

  # Now use the number of topics with the
  # highest coherence score to run the
  # sklearn nmf model

  texts = processed_comments

  # Create the tfidf weights
  tfidf_vectorizer = TfidfVectorizer(
      min_df=3,
      max_df=0.85,
      max_features=5000,
      ngram_range=(1, 2),
      preprocessor=' '.join
  )

  tfidf = tfidf_vectorizer.fit_transform(texts)

  # Save the feature names for later to create topic summaries
  tfidf_fn = tfidf_vectorizer.get_feature_names_out()

  # Run the nmf model
  nmf = NMF(
      n_components=best_num_topics,
      init='nndsvd',
      max_iter=500,
      l1_ratio=0.0,
      solver='cd',
      alpha_W=0.0,
      tol=1e-4,
      random_state=42
  ).fit(tfidf)

  # Use the top words for each cluster by tfidf weight
  # to create 'topics'

  # Getting a df with each topic by document
  docweights = nmf.transform(tfidf_vectorizer.transform(texts))

  n_top_words = 8

  topic_df = topic_table(
      nmf,
      tfidf_fn,
      n_top_words
  ).T

  # Cleaning up the top words to create topic summaries
  topic_df['topics'] = topic_df.apply(lambda x: [' '.join(x)], axis=1) # Joining each word into a list
  topic_df['topics'] = topic_df['topics'].str[0]  # Removing the list brackets
  topic_df['topics'] = topic_df['topics'].apply(lambda x: whitespace_tokenizer(x)) # tokenize
  topic_df['topics'] = topic_df['topics'].apply(lambda x: unique_words(x))  # Removing duplicate words
  topic_df['topics'] = topic_df['topics'].apply(lambda x: [' '.join(x)])  # Joining each word into a list
  topic_df['topics'] = topic_df['topics'].str[0]  # Removing the list brackets

  # Create a df with only the created topics and topic num
  topic_df = topic_df['topics'].reset_index()
  topic_df.columns = ['topic_num', 'topics']
  #-----------------

  # MY CODE
  topics_list_nmf = list(topic_df['topics'])

  with open(f'[NMF_TOPICS_LIST]{filename}','w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(topics_list_nmf,indent=4))

  with open(f'[NMF_COMMENTS_TOPICS]{filename}','w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(docweights.argmax(axis=1).tolist(),indent=4))

In [68]:
# MY CODE
# for file in json_files:
#   try:
#     tm_nmf(file)
#   except:
#     print(file)
#     continue

When There’s a Dearth of Good Information on Women’s Health, a Million Scams Bloom.json


# BERTopic

*Reference: https://maartengr.github.io/BERTopic/index.html#quick-start*

In [8]:
# !pip install bertopic

In [9]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

def tm_bertopic(filename):
  #------------
  # MY CODE
  with open(dataset_dir+'/'+filename,'r') as dataset:
    data = json.load(dataset)

  comments = data['comments']

  #-------------

  # REFERRED
  # Fine-tune your topic representations
  representation_model = KeyBERTInspired()
  topic_model = BERTopic(representation_model=representation_model)

  topics, probs = topic_model.fit_transform(comments)
  #-------------

  # MY CODE
  topics_dict = topic_model.get_topics()
  topics_list_bertopic = []
  for id,tups in topics_dict.items():
    tw = [a[0] for a in tups]
    topics_list_bertopic.append(' '.join(tw))

  assigned_topics = topic_model.get_document_info(comments)["Topic"].tolist()

  with open(f'[BERTOPIC_TOPICS_LIST]{filename}','w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(topics_list_bertopic,indent=4))

  with open(f'[BERTOPIC_COMMENTS_TOPICS]{filename}','w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(assigned_topics,indent=4))

  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


In [10]:
# MY CODE
# for file in json_files:
#   try:
#     tm_bertopic(file)
#   except:
#     print(file)
#     continue

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(


When There’s a Dearth of Good Information on Women’s Health, a Million Scams Bloom.json


# Evaluation

In [11]:
# MY CODE
from nltk import bigrams
import math

def npmi(w1,w2,corpus,preprocess_function=None,corpus_as_list=False):
  if preprocess_function:
    corpus_new = preprocess_function(corpus)
  elif corpus_as_list:
    corpus_new = corpus
  else:
    corpus_new = corpus.split(' ')

  all_bigrams = list(bigrams(corpus_new))

  total_bigram_count = len(all_bigrams)
  joint_count_w1_w2 = len([(a,b) for (a,b) in all_bigrams if a == w1 and b == w2])
  joint_count_w2_w1 = len([(a,b) for (a,b) in all_bigrams if b == w1 and a == w2])

  p_i_j = (joint_count_w1_w2/total_bigram_count) + (joint_count_w2_w1/total_bigram_count)
  p_i_star = len([(a,b) for (a,b) in all_bigrams if a == w1])/total_bigram_count
  p_star_j = len([(a,b) for (a,b) in all_bigrams if b == w2])/total_bigram_count

  if p_i_j == 0 or p_i_star == 0 or p_star_j == 0:
    pmi_w1_w2 = 0
    npmi_w1_w2 = 0
    return npmi_w1_w2

  pmi_w1_w2 = math.log(p_i_j/(p_i_star*p_star_j))

  npmi_w1_w2 = pmi_w1_w2/-math.log(p_i_j)

  return npmi_w1_w2

In [12]:
# MY CODE
from nltk import bigrams
from itertools import combinations

def npmi_topic(topic,corpus,preprocess_function=None,method='sum',corpus_as_list=False):
  if preprocess_function:
    corpus_new = preprocess_function(corpus)
  elif corpus_as_list:
    corpus_new = corpus
  else:
    corpus_new = corpus.split(' ')

  all_bigrams = list(bigrams(corpus_new))
  total_bigram_count = len(all_bigrams)

  topic_words = topic.split(' ')
  topic_pairs = list(combinations(topic_words,2))

  npmi_list = []
  for topic_pair in topic_pairs:
    npmi_tp = npmi(w1=topic_pair[0], w2= topic_pair[1],corpus=corpus_new,corpus_as_list=True)
    npmi_list.append(npmi_tp)

  if method == 'sum':
    return sum(npmi_list)
  elif method == 'avg':
    return sum(npmi_list)/len(npmi_list)
  else:
    return "method can only be 'sum' or 'avg'"

In [13]:
# MY CODE
from itertools import combinations

def npmi_from_bigrams(topic,all_bigrams,method='sum'):
  total_bigram_count = len(all_bigrams)

  topic_words = topic.split(' ')
  topic_pairs = list(combinations(topic_words,2))

  npmi_list = []
  for (w1,w2) in topic_pairs:
    joint_count_w1_w2 = len([(a,b) for (a,b) in all_bigrams if a == w1 and b == w2])
    joint_count_w2_w1 = len([(a,b) for (a,b) in all_bigrams if b == w1 and a == w2])
    p_i_j = (joint_count_w1_w2/total_bigram_count) + (joint_count_w2_w1/total_bigram_count)
    p_i_star = len([(a,b) for (a,b) in all_bigrams if a == w1])/total_bigram_count
    p_star_j = len([(a,b) for (a,b) in all_bigrams if b == w2])/total_bigram_count

    if p_i_j == 0 or p_i_star == 0 or p_star_j == 0:
      pmi_w1_w2 = 0
      npmi_w1_w2 = 0
      npmi_list.append(npmi_w1_w2)
      continue

    pmi_w1_w2 = math.log(p_i_j/(p_i_star*p_star_j))

    npmi_w1_w2 = pmi_w1_w2/-math.log(p_i_j)
    npmi_list.append(npmi_w1_w2)

  if method == 'sum':
    return sum(npmi_list)
  elif method == 'avg':
    return sum(npmi_list)/len(npmi_list)
  else:
    raise Exception("method can only be 'sum' or 'avg'")

In [27]:
# MY CODE
import json
import time
from nltk import bigrams

def eval_bertopic(text_filename):
  bertopic_topics_dir = '/content/drive/MyDrive/BERTOPIC_TOPICS'
  bertopic_assigned_topics_dir = '/content/drive/MyDrive/BERTOPIC_COMMENTS_TOPICS'
  with open(bertopic_topics_dir+'/[BERTOPIC_TOPICS_LIST]'+text_filename.replace('txt','json'),'r') as dataset:
    topics = json.load(dataset)

  with open(bertopic_assigned_topics_dir+'/[BERTOPIC_COMMENTS_TOPICS]'+text_filename.replace('txt','json'),'r') as dataset:
    assigned_topics = json.load(dataset)


  article = ''
  with open(dataset_dir+'/'+text_filename,'r') as file:
    article = file.read()

  with open(dataset_dir+'/'+text_filename.replace('txt','json'),'r') as dataset:
    data = json.load(dataset)

  comments = data['comments']

  processed_article = process_text2(article)
  all_bigrams = list(bigrams(processed_article))

  len_ratios_tc = []
  latency_ratios = []
  for at,comment in zip(assigned_topics,comments):
    #
    if at == -1:
      continue

    # Size Reduction eval
    comment_words_len = len(comment.split(' '))
    topic_words_len = len(topics[at].split(' '))
    len_ratios_tc.append(topic_words_len/comment_words_len)

    # Latency eval
    comment_words = comment.split(' ')
    topic_words = topics[at].split(' ')
    tw_appear = 0
    for topic_word in topic_words:
      tw_appear += int(topic_word in comment_words)
    latency_ratio = tw_appear/topic_words_len
    latency_ratios.append(latency_ratio)

  # Relatedness to article eval using npmi
  npmi_list =[]
  for topic in topics:
    npmi_t = npmi_from_bigrams(topic=topic,all_bigrams=all_bigrams)
    npmi_list.append(npmi_t)

  # Taking average for the eval measures
  if len(len_ratios_tc) == 0:
    len_ratio_avg = -100
  else:
    len_ratio_avg = sum(len_ratios_tc)/len(len_ratios_tc)

  if len(latency_ratios) == 0:
    latency_ratio_avg = -100
  else:
    latency_ratio_avg = sum(latency_ratios)/len(latency_ratios)
  if len(npmi_list):
    npmi_avg = sum(npmi_list)/len(npmi_list)
  else:
    npmi_avg = 0

  # Store results in a dictionary
  results = {
      "len_ratio_avg": len_ratio_avg,
      "latency_ratio_avg":latency_ratio_avg,
      "npmi_avg":npmi_avg
  }
  # print(results)
  with open('[RESULTS_BERTOPIC]'+text_filename.replace('txt','json'),'w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(results,indent=4))

In [29]:
# MY CODE
import time

for text_file in text_files[45:]:
  s = time.time()
  eval_bertopic(text_file)
  print(text_file,'|',(time.time() - s) * 1e3,'ms')

Why Are So Many Young Adults Getting Cancer_.txt | 890.1774883270264 ms
Why Ultraprocessed Foods Aren’t Always Bad.txt | 919.0127849578857 ms
Your Brain Has Tricked You Into Thinking Everything Is Worse.txt | 956.3846588134766 ms


In [12]:
# MY CODE
import json
import time
from nltk import bigrams

def eval_nmf(text_filename):
  nmf_topics_dir = '/content/drive/MyDrive/NMF_TOPICS'
  nmf_assigned_topics_dir = '/content/drive/MyDrive/NMF_COMMENTS_TOPICS'
  with open(nmf_topics_dir+'/[NMF_TOPICS_LIST]'+text_filename.replace('txt','json'),'r') as dataset:
    topics = json.load(dataset)

  with open(nmf_assigned_topics_dir+'/[NMF_COMMENTS_TOPICS]'+text_filename.replace('txt','json'),'r') as dataset:
    assigned_topics = json.load(dataset)


  article = ''
  with open(dataset_dir+'/'+text_filename,'r') as file:
    article = file.read()

  with open(dataset_dir+'/'+text_filename.replace('txt','json'),'r') as dataset:
    data = json.load(dataset)

  comments = data['comments']

  processed_article = process_text(article)
  all_bigrams = list(bigrams(processed_article))

  len_ratios_tc = []
  latency_ratios = []
  for at,comment in zip(assigned_topics,comments):
    # Size Reduction eval
    comment_words_len = len(comment.split(' '))
    topic_words_len = len(topics[at].split(' '))
    len_ratios_tc.append(topic_words_len/comment_words_len)

    # Latency eval
    comment_words = comment.split(' ')
    topic_words = topics[at].split(' ')
    tw_appear = 0
    for topic_word in topic_words:
      tw_appear += int(topic_word in comment_words)
    latency_ratio = tw_appear/topic_words_len
    latency_ratios.append(latency_ratio)

  # Relatedness to article eval using npmi
  npmi_list =[]
  for topic in topics:
    npmi_t = npmi_from_bigrams(topic=topic,all_bigrams=all_bigrams)
    npmi_list.append(npmi_t)

  # Taking average for the eval measures
  len_ratio_avg = sum(len_ratios_tc)/len(len_ratios_tc)
  latency_ratio_avg = sum(latency_ratios)/len(latency_ratios)
  if len(npmi_list):
    npmi_avg = sum(npmi_list)/len(npmi_list)
  else:
    npmi_avg = 0

  # Store results in a dictionary
  results = {
      "len_ratio_avg": len_ratio_avg,
      "latency_ratio_avg":latency_ratio_avg,
      "npmi_avg":npmi_avg
  }
  # print(results)
  with open('[RESULTS_NMF]'+text_filename.replace('txt','json'),'w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(results,indent=4))

In [16]:
# MY CODE
import time

for text_file in text_files[45:]:
  s = time.time()
  eval_nmf(text_file)
  print(text_file,'|',(time.time() - s) * 1e3,'ms')

Why Are So Many Young Adults Getting Cancer_.txt | 1722.9726314544678 ms
Why Ultraprocessed Foods Aren’t Always Bad.txt | 1605.7507991790771 ms
Your Brain Has Tricked You Into Thinking Everything Is Worse.txt | 1601.8147468566895 ms


In [36]:
# MY CODE
import json
import time
from nltk import bigrams

def eval_noun_phrase(text_filename):
  np_topics_dir = '/content/drive/MyDrive/NP_TOPICS'
  with open(np_topics_dir+'/[NP_TOPICS_LIST]'+text_filename.replace('txt','json'),'r') as dataset:
    topics = json.load(dataset)

  article = ''
  with open(dataset_dir+'/'+text_filename,'r') as file:
    article = file.read()

  with open(dataset_dir+'/'+text_filename.replace('txt','json'),'r') as dataset:
    data = json.load(dataset)

  comments = data['comments']

  processed_article = process_text3(article)
  all_bigrams = list(bigrams(processed_article))

  len_ratios_tc = []
  latency_ratios = []
  npmi_list =[]
  counter = 0
  for comment,topic in zip(comments,topics):
    s = time.time()

    # Size Reduction eval
    comment_words_len = len(comment.split(' '))
    topic_words_len = len(topic.split(' '))
    len_ratios_tc.append(topic_words_len/comment_words_len)

    # Latency eval
    comment_words = comment.split(' ')
    topic_words = topic.split(' ')
    tw_appear = 0
    for topic_word in topic_words:
      tw_appear += int(topic_word in comment_words)
    latency_ratio = tw_appear/topic_words_len
    latency_ratios.append(latency_ratio)

    # Relatedness to article eval using npmi
    npmi_t = npmi_from_bigrams(topic=topic,all_bigrams=all_bigrams)
    npmi_list.append(npmi_t)

    print('counter: ',counter,'|',(time.time() - s) * 1e3,'ms','|',npmi_t)
    counter += 1
  # Taking average for the eval measures
  len_ratio_avg = sum(len_ratios_tc)/len(len_ratios_tc)
  latency_ratio_avg = sum(latency_ratios)/len(latency_ratios)
  if len(npmi_list):
    npmi_avg = sum(npmi_list)/len(npmi_list)
  else:
    npmi_avg = 0

  # Store results in a dictionary
  results = {
      "len_ratio_avg": len_ratio_avg,
      "latency_ratio_avg":latency_ratio_avg,
      "npmi_avg":npmi_avg
  }

  print(results)
  with open('[RESULTS_NP]'+text_filename.replace('txt','json'),'w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(results,indent=4))

In [37]:
eval_noun_phrase(text_files[0])

counter:  0 | 523.5731601715088 ms | 8.247625601933546
counter:  1 | 124.16338920593262 ms | 1.660058429364082
counter:  2 | 9116.269826889038 ms | 60.53871110926426
counter:  3 | 76.77984237670898 ms | 0.26047051497192536
counter:  4 | 6175.531625747681 ms | 47.487818754102136
counter:  5 | 8.322715759277344 ms | 0
counter:  6 | 2551.9168376922607 ms | 30.298668689937063
counter:  7 | 28.209686279296875 ms | 0
counter:  8 | 219.68674659729004 ms | 5.193349249597192
counter:  9 | 339.2021656036377 ms | 2.6408838111602115
counter:  10 | 7084.176063537598 ms | 45.758148360221746
counter:  11 | 918.2825088500977 ms | 4.754936968641758
counter:  12 | 1999.622106552124 ms | 11.403943906377433
counter:  13 | 1618.7713146209717 ms | 14.499846264478348
counter:  14 | 1709.7206115722656 ms | 18.039127892792816
counter:  15 | 631.3624382019043 ms | 4.295769548022923
counter:  16 | 3080.40714263916 ms | 12.586152274434419
counter:  17 | 350.3758907318115 ms | 0.6425609798220773
counter:  18 | 481

# Results Compilation

In [37]:
# MY CODE
def get_article_result(json_filename):
  results_np_dir = '/content/drive/MyDrive/RESULTS_NP'
  with open(results_np_dir+'/[RESULTS_NP]'+json_filename,'r') as dataset:
    results_np = json.load(dataset)

  results_nmf_dir = '/content/drive/MyDrive/RESULTS_NMF'
  with open(results_nmf_dir+'/[RESULTS_NMF]'+json_filename,'r') as dataset:
    results_nmf = json.load(dataset)

  results_bertopic_dir = '/content/drive/MyDrive/RESULTS_BERTOPIC'
  with open(results_bertopic_dir+'/[RESULTS_BERTOPIC]'+json_filename,'r') as dataset:
    results_bertopic = json.load(dataset)

  final_results = {
      'np':results_np,
      'nmf':results_nmf,
      'bertopic':results_bertopic
  }

  return final_results

In [43]:
len(json_files)

46

In [51]:
get_article_result(json_files[1])

{'np': {'len_ratio_avg': 0.41841058312532325,
  'latency_ratio_avg': 0.8506203184083307,
  'npmi_avg': 14.740460032978548},
 'nmf': {'len_ratio_avg': 0.13796337343219348,
  'latency_ratio_avg': 0.1460974922513384,
  'npmi_avg': 2.261050159708133},
 'bertopic': {'len_ratio_avg': 0.1635177781281251,
  'latency_ratio_avg': 0.12793296089385475,
  'npmi_avg': 1.6043790603527}}

In [91]:
# MY CODE
import numpy as np
import pandas as pd

def compile_results():
  len_ratio_table = pd.DataFrame(columns=["Article","NP","NMF","BERTOPIC"])
  latency_ratio_table = pd.DataFrame(columns=["Article","NP","NMF","BERTOPIC"])
  npmi_table = pd.DataFrame(columns=["Article","NP","NMF","BERTOPIC"])

  for file in json_files:
    article_result = get_article_result(file)

    len_ratio_np = round(article_result['np']['len_ratio_avg'],2)
    len_ratio_nmf = round(article_result['nmf']['len_ratio_avg'],2)
    len_ratio_bertopic = round(article_result['bertopic']['len_ratio_avg'],2)

    latency_ratio_np = round(article_result['np']['latency_ratio_avg'],2)
    latency_ratio_nmf = round(article_result['nmf']['latency_ratio_avg'],2)
    latency_ratio_bertopic = round(article_result['bertopic']['latency_ratio_avg'],2)

    npmi_avg_np = round(article_result['np']['npmi_avg'],2)
    npmi_avg_nmf = round(article_result['nmf']['npmi_avg'],2)
    npmi_avg_bertopic = round(article_result['bertopic']['npmi_avg'],2)

    len_ratio_table.loc[len(len_ratio_table)] = [file.replace('.json','').replace('_','')[:50],len_ratio_np,len_ratio_nmf,len_ratio_bertopic]
    latency_ratio_table.loc[len(latency_ratio_table)] = [file.replace('.json','').replace('_','')[:50],latency_ratio_np,latency_ratio_nmf,latency_ratio_bertopic]
    npmi_table.loc[len(npmi_table)] = [file.replace('.json','').replace('_','')[:50],npmi_avg_np,npmi_avg_nmf,npmi_avg_bertopic]

  return len_ratio_table,latency_ratio_table,npmi_table

In [92]:
FINAL_RESULTS = compile_results()

In [86]:
ti = "48 Million Americans Live With Addiction. Here’s How to Get Them Help That Works."
ti[:50]

'48 Million Americans Live With Addiction. Here’s H'

In [103]:
LEN_RATIO_RESULTS = FINAL_RESULTS[0]
LATENCY_RATIO_RESULTS = FINAL_RESULTS[1]
NPMI_RESULTS = FINAL_RESULTS[2]

In [101]:
print(LEN_RATIO_RESULTS.head(10).to_latex(index=False).replace('0000',''))

\begin{tabular}{lrrr}
\toprule
Article & NP & NMF & BERTOPIC \\
\midrule
1,374 Days My Life With Long Covid & 0.40 & 0.18 & 0.24 \\
48 Million Americans Live With Addiction. Here’s H & 0.42 & 0.14 & 0.16 \\
A Year on Ozempic Taught Me We’re Thinking About O & 0.41 & 0.14 & 0.17 \\
Advice From a Psychotherapist on How to Cope Today & 0.36 & 0.21 & 0.26 \\
Alzheimer’s Can Be a World of Endless Second Chanc & 0.36 & 0.17 & 0.14 \\
Are Smartphones Driving Our Teens to Depression & 0.42 & 0.20 & 0.21 \\
Are We Thinking About Obesity All Wrong & 0.42 & 0.13 & 0.20 \\
Deep Inside Mountains, Work Is Getting Much More D & 0.45 & 0.26 & 0.32 \\
Doctors Need a Better Way to Treat Patients Withou & 0.43 & 0.14 & 0.19 \\
Does Gene Editing Have a Future in Reproductive Me & 0.41 & 0.15 & 0.20 \\
\bottomrule
\end{tabular}



In [102]:
print(LATENCY_RATIO_RESULTS.head(10).to_latex(index=False).replace('0000',''))

\begin{tabular}{lrrr}
\toprule
Article & NP & NMF & BERTOPIC \\
\midrule
1,374 Days My Life With Long Covid & 0.82 & 0.11 & 0.17 \\
48 Million Americans Live With Addiction. Here’s H & 0.85 & 0.15 & 0.13 \\
A Year on Ozempic Taught Me We’re Thinking About O & 0.84 & 0.18 & 0.16 \\
Advice From a Psychotherapist on How to Cope Today & 0.80 & 0.10 & 0.06 \\
Alzheimer’s Can Be a World of Endless Second Chanc & 0.81 & 0.15 & 0.12 \\
Are Smartphones Driving Our Teens to Depression & 0.82 & 0.16 & 0.13 \\
Are We Thinking About Obesity All Wrong & 0.83 & 0.13 & 0.11 \\
Deep Inside Mountains, Work Is Getting Much More D & 0.82 & 0.13 & 0.14 \\
Doctors Need a Better Way to Treat Patients Withou & 0.85 & 0.17 & 0.18 \\
Does Gene Editing Have a Future in Reproductive Me & 0.84 & 0.11 & 0.12 \\
\bottomrule
\end{tabular}



In [104]:
print(NPMI_RESULTS.head(10).to_latex(index=False).replace('0000',''))

\begin{tabular}{lrrr}
\toprule
Article & NP & NMF & BERTOPIC \\
\midrule
1,374 Days My Life With Long Covid & 7.02 & 0.63 & 0.70 \\
48 Million Americans Live With Addiction. Here’s H & 14.74 & 2.26 & 1.60 \\
A Year on Ozempic Taught Me We’re Thinking About O & 7.77 & 0.97 & 0.94 \\
Advice From a Psychotherapist on How to Cope Today & 3.24 & 0.52 & 0.00 \\
Alzheimer’s Can Be a World of Endless Second Chanc & 4.11 & 0.28 & 0.80 \\
Are Smartphones Driving Our Teens to Depression & 7.38 & 0.29 & 1.75 \\
Are We Thinking About Obesity All Wrong & 8.95 & 1.63 & 0.74 \\
Deep Inside Mountains, Work Is Getting Much More D & 5.97 & 1.13 & 1.14 \\
Doctors Need a Better Way to Treat Patients Withou & 11.08 & 1.05 & 1.31 \\
Does Gene Editing Have a Future in Reproductive Me & 6.52 & 1.13 & 0.78 \\
\bottomrule
\end{tabular}



In [106]:
print(LEN_RATIO_RESULTS.to_latex(index=True,longtable=True).replace('0000',''))

\begin{longtable}{llrrr}
\toprule
 & Article & NP & NMF & BERTOPIC \\
\midrule
\endfirsthead
\toprule
 & Article & NP & NMF & BERTOPIC \\
\midrule
\endhead
\midrule
\multicolumn{5}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
0 & 1,374 Days My Life With Long Covid & 0.40 & 0.18 & 0.24 \\
1 & 48 Million Americans Live With Addiction. Here’s H & 0.42 & 0.14 & 0.16 \\
2 & A Year on Ozempic Taught Me We’re Thinking About O & 0.41 & 0.14 & 0.17 \\
3 & Advice From a Psychotherapist on How to Cope Today & 0.36 & 0.21 & 0.26 \\
4 & Alzheimer’s Can Be a World of Endless Second Chanc & 0.36 & 0.17 & 0.14 \\
5 & Are Smartphones Driving Our Teens to Depression & 0.42 & 0.20 & 0.21 \\
6 & Are We Thinking About Obesity All Wrong & 0.42 & 0.13 & 0.20 \\
7 & Deep Inside Mountains, Work Is Getting Much More D & 0.45 & 0.26 & 0.32 \\
8 & Doctors Need a Better Way to Treat Patients Withou & 0.43 & 0.14 & 0.19 \\
9 & Does Gene Editing Have a Future in Reproductive Me & 0.41 & 0

In [107]:
print(LATENCY_RATIO_RESULTS.to_latex(index=True,longtable=True).replace('0000',''))

\begin{longtable}{llrrr}
\toprule
 & Article & NP & NMF & BERTOPIC \\
\midrule
\endfirsthead
\toprule
 & Article & NP & NMF & BERTOPIC \\
\midrule
\endhead
\midrule
\multicolumn{5}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
0 & 1,374 Days My Life With Long Covid & 0.82 & 0.11 & 0.17 \\
1 & 48 Million Americans Live With Addiction. Here’s H & 0.85 & 0.15 & 0.13 \\
2 & A Year on Ozempic Taught Me We’re Thinking About O & 0.84 & 0.18 & 0.16 \\
3 & Advice From a Psychotherapist on How to Cope Today & 0.80 & 0.10 & 0.06 \\
4 & Alzheimer’s Can Be a World of Endless Second Chanc & 0.81 & 0.15 & 0.12 \\
5 & Are Smartphones Driving Our Teens to Depression & 0.82 & 0.16 & 0.13 \\
6 & Are We Thinking About Obesity All Wrong & 0.83 & 0.13 & 0.11 \\
7 & Deep Inside Mountains, Work Is Getting Much More D & 0.82 & 0.13 & 0.14 \\
8 & Doctors Need a Better Way to Treat Patients Withou & 0.85 & 0.17 & 0.18 \\
9 & Does Gene Editing Have a Future in Reproductive Me & 0.84 & 0