**Convention:** Code cells using referenced code begin with a comment `# REFERRED`. Code cells containing code I wrote begin with a comment `# MY CODE`. In cells that have both referenced and original code, the respective code parts are labelled.


# Load DATASET

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# MY CODE
import os
import json

dataset_dir = '/content/drive/MyDrive/DATASET'
files = []
for (root,dirs,file) in os.walk(dataset_dir):
  files = file

json_files = []
text_files = []

for file in files:
  if '.json' in file:
    json_files.append(file)
  else:
    text_files.append(file)

print('json files:', len(json_files))
print('text files:', len(text_files))

json files: 48
text files: 48


# Pre-processing

In [4]:
import numpy as np
import pandas as pd

*Reference: https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf*

In [5]:
# REFERRED

## defining all utilty functions - needed for Data cleaning and processing

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
import nltk

# Contraction map
c_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you you will",
    "you'll've": "you you will have",
    "you're": "you are",
    "you've": "you have"
}

# Compiling the contraction dict
c_re = re.compile('(%s)' % '|'.join(c_dict.keys()))

# List of stop words
add_stop = ['said', 'say', '...', 'like', 'cnn', 'ad']
stop_words = ENGLISH_STOP_WORDS.union(add_stop)

# List of punctuation
punc = list(set(string.punctuation))


# Splits words on white spaces (leaves contractions intact) and splits out
# trailing punctuation
def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens


def expandContractions(text, c_re=c_re):
    def replace(match):
        return c_dict[match.group(0)]
    return c_re.sub(replace, text)


def process_text(text):
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    text = [re.sub('[0-9]+', '', each) for each in text]
    text = [expandContractions(each, c_re=c_re) for each in text]
    text = [SnowballStemmer('english').stem(each) for each in text]
    text = [w for w in text if w not in punc]
    text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return text


def top_words(topic, n_top_words):
    return topic.argsort()[:-n_top_words - 1:-1]


def topic_table(model, feature_names, n_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        t = (topic_idx)
        topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)]
    return pd.DataFrame(topics)


def whitespace_tokenizer(text):
    pattern = r"(?u)\b\w\w+\b"
    tokenizer_regex = RegexpTokenizer(pattern)
    tokens = tokenizer_regex.tokenize(text)
    return tokens


# Funtion to remove duplicate words
def unique_words(text):
    ulist = []
    [ulist.append(x) for x in text if x not in ulist]
    return ulist


def word_count(text):
    return len(str(text).split(' '))

In [6]:
# REFERRED

# Removing stemming step as it is not required for evaluating BERTopic

def process_text2(text):
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    text = [re.sub('[0-9]+', '', each) for each in text]
    text = [expandContractions(each, c_re=c_re) for each in text]
    # text = [SnowballStemmer('english').stem(each) for each in text]
    text = [w for w in text if w not in punc]
    text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return text

In [7]:
# REFERRED

# Removing punctuations for evaluating noun phrase approach

def process_text3(text):
    text = casual_tokenizer(text)
    # text = [each.lower() for each in text]
    # text = [re.sub('[0-9]+', '', each) for each in text]
    # text = [expandContractions(each, c_re=c_re) for each in text]
    # text = [SnowballStemmer('english').stem(each) for each in text]
    text = [w for w in text if w not in punc]
    # text = [w for w in text if w not in stop_words]
    text = [each for each in text if len(each) > 1]
    text = [each for each in text if ' ' not in each]
    return text

# Noun-Phrase Approach

In [8]:
import spacy

# MY CODE
def get_noun_phrases(text):
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(text)
  noun_phrases = [chunk.text for chunk in doc.noun_chunks if chunk.root.pos_ != 'PRON']
  return noun_phrases

In [9]:
def tm_noun_phase(filename):
  # MY CODE
  with open(dataset_dir+'/'+filename,'r') as dataset:
    data = json.load(dataset)

  comments = data['comments']

  topic_list_noun_phrase = []
  for comment in comments:
    noun_phrases = get_noun_phrases(comment)
    topic_list_noun_phrase.append(' '.join(noun_phrases))

  with open(f'[NP_TOPICS_LIST]{filename}','w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(topic_list_noun_phrase,indent=4))

In [10]:
# MY CODE
# for file in json_files:
#   try:
#     tm_noun_phase(file)
#   except:
#     print(file)
#     continue

# Evaluation

In [16]:
# MY CODE
from itertools import combinations
import math

def npmi_from_bigrams(topic,all_bigrams,method='sum'):
  total_bigram_count = len(all_bigrams)

  topic_words = topic.split(' ')
  topic_pairs = list(combinations(topic_words,2))

  npmi_list = []
  for (w1,w2) in topic_pairs:
    joint_count_w1_w2 = len([(a,b) for (a,b) in all_bigrams if a == w1 and b == w2])
    joint_count_w2_w1 = len([(a,b) for (a,b) in all_bigrams if b == w1 and a == w2])
    p_i_j = (joint_count_w1_w2/total_bigram_count) + (joint_count_w2_w1/total_bigram_count)
    p_i_star = len([(a,b) for (a,b) in all_bigrams if a == w1])/total_bigram_count
    p_star_j = len([(a,b) for (a,b) in all_bigrams if b == w2])/total_bigram_count

    if p_i_j == 0 or p_i_star == 0 or p_star_j == 0:
      pmi_w1_w2 = 0
      npmi_w1_w2 = 0
      npmi_list.append(npmi_w1_w2)
      continue

    pmi_w1_w2 = math.log(p_i_j/(p_i_star*p_star_j))

    npmi_w1_w2 = pmi_w1_w2/-math.log(p_i_j)
    npmi_list.append(npmi_w1_w2)

  if method == 'sum':
    return sum(npmi_list)
  elif method == 'avg':
    return sum(npmi_list)/len(npmi_list)
  else:
    raise Exception("method can only be 'sum' or 'avg'")

In [13]:
# MY CODE
import json
import time
from nltk import bigrams

def eval_noun_phrase(text_filename):
  np_topics_dir = '/content/drive/MyDrive/NP_TOPICS'
  with open(np_topics_dir+'/[NP_TOPICS_LIST]'+text_filename.replace('txt','json'),'r') as dataset:
    topics = json.load(dataset)

  article = ''
  with open(dataset_dir+'/'+text_filename,'r') as file:
    article = file.read()

  with open(dataset_dir+'/'+text_filename.replace('txt','json'),'r') as dataset:
    data = json.load(dataset)

  comments = data['comments']

  processed_article = process_text3(article)
  all_bigrams = list(bigrams(processed_article))

  len_ratios_tc = []
  latency_ratios = []
  npmi_list =[]
  for comment,topic in zip(comments,topics):


    # Size Reduction eval
    comment_words_len = len(comment.split(' '))
    topic_words_len = len(topic.split(' '))
    len_ratios_tc.append(topic_words_len/comment_words_len)

    # Latency eval
    comment_words = comment.split(' ')
    topic_words = topic.split(' ')
    tw_appear = 0
    for topic_word in topic_words:
      tw_appear += int(topic_word in comment_words)
    latency_ratio = tw_appear/topic_words_len
    latency_ratios.append(latency_ratio)

    # Relatedness to article eval using npmi
    npmi_t = npmi_from_bigrams(topic=topic,all_bigrams=all_bigrams)
    npmi_list.append(npmi_t)

  # Taking average for the eval measures
  len_ratio_avg = sum(len_ratios_tc)/len(len_ratios_tc)
  latency_ratio_avg = sum(latency_ratios)/len(latency_ratios)
  if len(npmi_list):
    npmi_avg = sum(npmi_list)/len(npmi_list)
  else:
    npmi_avg = 0

  # Store results in a dictionary
  results = {
      "len_ratio_avg": len_ratio_avg,
      "latency_ratio_avg":latency_ratio_avg,
      "npmi_avg":npmi_avg
  }

  with open('[RESULTS_NP]'+text_filename.replace('txt','json'),'w',encoding='utf-8') as output_file:
    output_file.write(json.dumps(results,indent=4))

In [18]:
import time

for text_file in text_files[1:]:
  s = time.time()
  eval_noun_phrase(text_file)
  print(text_file,(time.time() - s) * 1e3,'ms')

1,374 Days_ My Life With Long Covid.txt 85070.57905197144 ms
A Year on Ozempic Taught Me We’re Thinking About Obesity All Wrong.txt 128824.21660423279 ms
Advice From a Psychotherapist on How to Cope Today.txt 8946.769952774048 ms
Alzheimer’s Can Be a World of Endless Second Chances.txt 7657.370328903198 ms
Are Smartphones Driving Our Teens to Depression_.txt 52677.24823951721 ms
Are We Thinking About Obesity All Wrong_.txt 226161.64565086365 ms
Deep Inside Mountains, Work Is Getting Much More Dangerous.txt 14252.062559127808 ms
Doctors Need a Better Way to Treat Patients Without Their Consent.txt 45884.09781455994 ms
Does Gene Editing Have a Future in Reproductive Medicine_.txt 26966.88437461853 ms
Finding Light in Winter.txt 13076.559066772461 ms
How to Help Americans Eat Less Junk Food.txt 33583.510398864746 ms
How Virtual Appointments Taught Me to Be a Better Doctor.txt 20600.692749023438 ms
How Well Does Masking Work_ And Other Pandemic Questions We Need to Answer..txt 43526.013612

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/NP_TOPICS/[NP_TOPICS_LIST]The Monster Measles Outbreak in Europe Is a Warning.json'

In [20]:
for i,t in enumerate(text_files):
  print(i,t)

0 48 Million Americans Live With Addiction. Here’s How to Get Them Help That Works..txt
1 1,374 Days_ My Life With Long Covid.txt
2 A Year on Ozempic Taught Me We’re Thinking About Obesity All Wrong.txt
3 Advice From a Psychotherapist on How to Cope Today.txt
4 Alzheimer’s Can Be a World of Endless Second Chances.txt
5 Are Smartphones Driving Our Teens to Depression_.txt
6 Are We Thinking About Obesity All Wrong_.txt
7 Deep Inside Mountains, Work Is Getting Much More Dangerous.txt
8 Doctors Need a Better Way to Treat Patients Without Their Consent.txt
9 Does Gene Editing Have a Future in Reproductive Medicine_.txt
10 Finding Light in Winter.txt
11 How to Help Americans Eat Less Junk Food.txt
12 How Virtual Appointments Taught Me to Be a Better Doctor.txt
13 How Well Does Masking Work_ And Other Pandemic Questions We Need to Answer..txt
14 It’s Not Easy to Tell People You Have Cancer. As a Doctor, I See It All the Time..txt
15 It’s Not Your Imagination. Your Allergies Are Getting Worse.

In [21]:
import time

for text_file in text_files[31:]:
  s = time.time()
  eval_noun_phrase(text_file)
  print(text_file,'|',(time.time() - s) * 1e3,'ms')

The New Age of D.I.Y. Medicine.txt | 28484.822273254395 ms
The Problem Is With Men’s Sperm.txt | 32527.349710464478 ms
The Problem With Saying ‘Sex Assigned at Birth’.txt | 75716.21489524841 ms
The Way You Build Muscle Is the Way You Build a Life.txt | 3867.5150871276855 ms
There’s a Better Way to Talk About Fluoride, Vaccines and Raw Milk.txt | 74556.82635307312 ms
This Diet Buzzword Is Misleading.txt | 5069.25368309021 ms
This Is What It Takes to Get an Abortion in America.txt | 13932.663679122925 ms
We Now Have a Chance to Stop the Most Deadly Infectious Disease — if We Act.txt | 10099.504947662354 ms
We’re Relearning What Pandemics Do to a Society.txt | 8928.40027809143 ms
WeightWatchers Got One Thing Very Right.txt | 47820.4927444458 ms
What Having a Brother With Down Syndrome Has Taught Me About Everyone Else.txt | 1648.813009262085 ms
What We Lose When Pharmacists Are Forced to Act Like Cops.txt | 23281.232118606567 ms


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/NP_TOPICS/[NP_TOPICS_LIST]When There’s a Dearth of Good Information on Women’s Health, a Million Scams Bloom.json'

In [22]:
import time

for text_file in text_files[45:]:
  s = time.time()
  eval_noun_phrase(text_file)
  print(text_file,'|',(time.time() - s) * 1e3,'ms')

Why Are So Many Young Adults Getting Cancer_.txt | 18174.074172973633 ms
Why Ultraprocessed Foods Aren’t Always Bad.txt | 64793.638944625854 ms
Your Brain Has Tricked You Into Thinking Everything Is Worse.txt | 60338.059425354004 ms
