<a href="https://colab.research.google.com/github/polinauni/IntroToML/blob/main/04_LM_LDA_Topic_modeling_2024_Polina_Bogdanova.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Topic modeling with LDA**


In [12]:
import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [13]:
#This is a Hugging Face dataset with abstracts from arxiv papers
df_articles = pd.read_parquet("hf://datasets/bunkalab/arxiv_topic_modeling/data/train-00000-of-00001.parquet")

In [14]:
df_articles = df_articles[['Categories', 'Abstract']] #I will only need these 2 columns from the dataset

In [15]:
df_articles

Unnamed: 0,Categories,Abstract
0,cs.IR,Knowledge retrieval is one of the major challe...
1,"cs.CR, cs.AI, cs.CY, cs.LG",The increasing sophistication of cyber threats...
2,"cs.CL, cs.AI",The vast collection of Holocaust survivor test...
3,cs.CL,"Over the last decade, similar to other applica..."
4,cs.CL,Large language models (LLMs) with their strong...
...,...,...
1435,"cs.AI, cs.IR",Statistical topic models provide a general dat...
1436,"cs.IR, cs.AI, H.3.3","In this paper, we present an approach to learn..."
1437,"cs.IR, cs.DB, H.2.8; H.3.1; H.4",In this paper we present a novel framework for...
1438,stat.AP,Correction to Annals of Applied Statistics 1 (...


In [16]:
df_articles['Categories'].unique()

array(['cs.IR', 'cs.CR, cs.AI, cs.CY, cs.LG', 'cs.CL, cs.AI', 'cs.CL',
       'cs.SI, F.2.2, I.2.7', 'econ.GN, q-fin.EC', 'cs.SE',
       'cs.SI, cs.CL', 'cs.NE', 'cs.HC, cs.AI', 'cs.CL, cs.LG',
       'cs.CL, cs.AI, cs.CY, cs.LG, cs.SI', 'stat.CO, stat.ML',
       'cs.CL, cs.CE, cs.IR, cs.LG, q-fin.ST',
       'math.NA, cs.IR, cs.LG, cs.NA, eess.SP, stat.ML',
       'cs.CL, cs.SI, econ.GN, physics.soc-ph, q-fin.EC, I.2.7; J.4; H.4.0',
       'cs.CL, cs.AI, cs.LG, I.2.7', 'cs.SE, cs.HC',
       'cs.CL, cs.CY, cs.IR, cs.SI', 'cs.CR, cs.CL, cs.LG',
       'cs.LG, cs.CL', 'cs.AI', 'cs.CL, cs.AI, cs.LG',
       'cs.CL, Topic Modeling, Aviation Safety, Aviation Accident Reports, Machine\n  Learning, LDA, NMF',
       'physics.ed-ph, physics.data-an',
       'cs.CL, cs.AI, cs.HC, cs.LG, q-bio.NC', 'cs.CY, cs.CL, cs.LG',
       'cs.LG, cs.AI', 'cs.CL, cs.IR, cs.LG',
       'cs.IR, cs.AI, cs.LG, stat.CO, H.3.3',
       'cs.CL, cs.HC, cs.IR, cs.LG', 'stat.AP, cs.HC, cs.IR',
       'cs.CL, cs.CY

In [17]:
df_articles['Categories'] = df_articles['Categories'].astype('category') #Manualy converting to category

In [18]:
df_articles['Categories'].cat.categories #Dislaying all categories from the dataset

Index(['astro-ph.SR', 'cond-mat.other, cs.DL, cs.IR, physics.soc-ph',
       'cond-mat.quant-gas, cond-mat.mes-hall, quant-ph',
       'cond-mat.stat-mech, cond-mat.mtrl-sci, physics.comp-ph', 'cs.AI',
       'cs.AI, I.2.7', 'cs.AI, astro-ph.IM, physics.soc-ph', 'cs.AI, cs.CL',
       'cs.AI, cs.CL, cs.DL', 'cs.AI, cs.CL, cs.IR',
       ...
       'stat.ML, cs.LG, stat.AP', 'stat.ML, cs.LG, stat.AP, stat.CO, stat.ME',
       'stat.ML, cs.LG, stat.CO', 'stat.ML, cs.LG, stat.CO, stat.ME',
       'stat.ML, cs.LG, stat.ME', 'stat.ML, cs.SI',
       'stat.ML, stat.AP, stat.CO', 'stat.ML, stat.CO', 'stat.ML, stat.ME',
       'stat.OT'],
      dtype='object', length=505)

In [19]:
%pip install -U gensim --quiet

In [20]:
from pprint import pprint

from gensim import corpora, models
from gensim.utils import simple_preprocess


from gensim.parsing.preprocessing import STOPWORDS

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer #Not necessary as the data is in English
from nltk.stem.porter import *
#from nltk.stem import WordNetLemmatizer

import numpy as np

from random import choice

np.random.seed(1234)

In [24]:
stemmer = SnowballStemmer('english')
english_stop_words = set(stopwords.words('english'))

def stemming(text):
  return stemmer.stem(text)


def preprocess(text):
  result = [stemming(token)
            for token in simple_preprocess(text)
            if token not in english_stop_words and len(token) > 3]
  return result

In [25]:
all_articles = df_articles['Abstract'].to_list()
all_articles[:5]

['Knowledge retrieval is one of the major challenges in building a knowledge-grounded dialogue system. A common method is to use a neural retriever with a distributed approximate nearest-neighbor database to quickly find the relevant knowledge sentences. In this work, we propose an approach that utilizes topic modeling on the knowledge base to further improve retrieval accuracy and as a result, improve response generation. Additionally, we experiment with a large language model, ChatGPT, to take advantage of the improved retrieval performance to further improve the generation results. Experimental results on two datasets show that our approach can increase retrieval and generation performance. The results also indicate that ChatGPT is a better response generator for knowledge-grounded dialogue when relevant knowledge is provided.',
 'The increasing sophistication of cyber threats necessitates proactive measures to identify vulnerabilities and potential exploits. Underground hacking for

In [26]:
processed_docs = list(map(preprocess, all_articles)) #Application of the function defined earlier
processed_docs[:10]

[['knowledg',
  'retriev',
  'major',
  'challeng',
  'build',
  'knowledg',
  'ground',
  'dialogu',
  'system',
  'common',
  'method',
  'neural',
  'retriev',
  'distribut',
  'approxim',
  'nearest',
  'neighbor',
  'databas',
  'quick',
  'find',
  'relev',
  'knowledg',
  'sentenc',
  'work',
  'propos',
  'approach',
  'util',
  'topic',
  'model',
  'knowledg',
  'base',
  'improv',
  'retriev',
  'accuraci',
  'result',
  'improv',
  'respons',
  'generat',
  'addit',
  'experi',
  'larg',
  'languag',
  'model',
  'chatgpt',
  'take',
  'advantag',
  'improv',
  'retriev',
  'perform',
  'improv',
  'generat',
  'result',
  'experiment',
  'result',
  'dataset',
  'show',
  'approach',
  'increas',
  'retriev',
  'generat',
  'perform',
  'result',
  'also',
  'indic',
  'chatgpt',
  'better',
  'respons',
  'generat',
  'knowledg',
  'ground',
  'dialogu',
  'relev',
  'knowledg',
  'provid'],
 ['increas',
  'sophist',
  'cyber',
  'threat',
  'necessit',
  'proactiv',
  'm

In [27]:
dictionary = corpora.Dictionary(processed_docs) #Create a dictionary from tokenized docs

In [28]:
#To take a look

for idx, (k, v) in enumerate(dictionary.iteritems()):
    print(k, v)
    if idx >= 10:
        break

0 accuraci
1 addit
2 advantag
3 also
4 approach
5 approxim
6 base
7 better
8 build
9 challeng
10 chatgpt


In [29]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

Initializing a model with its hyperparameters

In [30]:
#Model hyperparams

filter_tokens_if_container_documents_are_less_than = 15
filter_tokens_if_appeared_percentage_more_than = 0.5
keep_the_first_n_tokens=100000


num_of_topics = 20

In [31]:
lda_model = models.LdaMulticore(bow_corpus,
                                num_topics=num_of_topics,
                                id2word=dictionary,
                                passes=10,
                                workers=2)

In [32]:
for idx, topic in lda_model.print_topics(num_of_topics):
    print(f'Topic: {idx} \t Words: {topic}')

Topic: 0 	 Words: 0.018*"topic" + 0.013*"media" + 0.010*"concept" + 0.009*"social" + 0.008*"model" + 0.008*"analysi" + 0.008*"learn" + 0.006*"discuss" + 0.006*"use" + 0.006*"news"
Topic: 1 	 Words: 0.028*"topic" + 0.026*"model" + 0.014*"document" + 0.011*"user" + 0.011*"inform" + 0.011*"network" + 0.009*"review" + 0.009*"method" + 0.009*"approach" + 0.008*"propos"
Topic: 2 	 Words: 0.028*"topic" + 0.019*"model" + 0.014*"social" + 0.013*"use" + 0.010*"analysi" + 0.010*"covid" + 0.009*"discuss" + 0.009*"tweet" + 0.009*"relat" + 0.009*"media"
Topic: 3 	 Words: 0.019*"polit" + 0.012*"topic" + 0.010*"model" + 0.010*"time" + 0.008*"human" + 0.008*"method" + 0.008*"cyber" + 0.008*"analysi" + 0.007*"speech" + 0.007*"control"
Topic: 4 	 Words: 0.040*"topic" + 0.035*"model" + 0.018*"word" + 0.018*"text" + 0.015*"document" + 0.013*"embed" + 0.012*"use" + 0.011*"base" + 0.011*"method" + 0.011*"task"
Topic: 5 	 Words: 0.012*"analysi" + 0.011*"emot" + 0.010*"theori" + 0.010*"model" + 0.009*"respons"

In [33]:
#Another one with TF/IDF

tfidf = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]
pprint(tfidf_corpus[0][:10])

[(0, 0.08138653976346368),
 (1, 0.06606178979728394),
 (2, 0.09403048382447626),
 (3, 0.04031769363536956),
 (4, 0.06226947849454815),
 (5, 0.0932271759792415),
 (6, 0.02562775898405267),
 (7, 0.06521233384160291),
 (8, 0.08644740027930713),
 (9, 0.058290083041509344)]


In [34]:
lda_model_tfidf = models.LdaMulticore(tfidf_corpus,
                                      num_topics=num_of_topics,
                                      id2word=dictionary,
                                      passes=10,
                                      workers=4)

In [35]:
for idx, topic in lda_model_tfidf.print_topics(num_of_topics):
    print(f'Topic: {idx} \t Word: {topic}')

Topic: 0 	 Word: 0.004*"narrat" + 0.002*"youtub" + 0.002*"video" + 0.002*"mpox" + 0.002*"hoard" + 0.002*"paradigm" + 0.002*"rescor" + 0.002*"suitabl" + 0.002*"music" + 0.002*"energet"
Topic: 1 	 Word: 0.005*"tensor" + 0.004*"spectral" + 0.003*"moment" + 0.003*"hull" + 0.002*"convers" + 0.002*"cours" + 0.002*"rerank" + 0.002*"utter" + 0.002*"diln" + 0.002*"vntm"
Topic: 2 	 Word: 0.006*"privaci" + 0.005*"llms" + 0.005*"educ" + 0.005*"interview" + 0.004*"layer" + 0.004*"concern" + 0.003*"advanc" + 0.003*"respons" + 0.003*"methodolog" + 0.003*"self"
Topic: 3 	 Word: 0.003*"climat" + 0.003*"patent" + 0.003*"technolog" + 0.002*"jldadmm" + 0.002*"vape" + 0.002*"vontss" + 0.002*"voynich" + 0.002*"contracept" + 0.002*"enstm" + 0.002*"authori"
Topic: 4 	 Word: 0.009*"imag" + 0.005*"item" + 0.004*"scene" + 0.003*"lcsd" + 0.003*"crisp" + 0.003*"microblog" + 0.003*"feder" + 0.003*"multimod" + 0.002*"partial" + 0.002*"chatgpt"
Topic: 5 	 Word: 0.003*"imag" + 0.003*"paragraph" + 0.002*"session" + 0.0

Inferencing

In [36]:
test_doc = choice(range(len(processed_docs)))
processed_docs[test_doc][:50]

['latent',
 'dirichlet',
 'alloc',
 'popular',
 'topic',
 'model',
 'techniqu',
 'academia',
 'less',
 'industri',
 'especi',
 'larg',
 'scale',
 'applic',
 'involv',
 'search',
 'engin',
 'onlin',
 'advertis',
 'system',
 'main',
 'under',
 'reason',
 'topic',
 'model',
 'use',
 'small',
 'scale',
 'use',
 'exampl',
 'largest',
 'model',
 'report',
 'literatur',
 'topic',
 'cover',
 'difficult',
 'long',
 'tail',
 'semant',
 'word',
 'set',
 'paper',
 'show',
 'number',
 'topic',
 'factor',
 'signific',
 'boost',
 'util']

For an original model:

In [37]:
for index, score in sorted(lda_model[bow_corpus[test_doc]], key=lambda tup: -1*tup[1]):
    print(f"Topic match score: {score} \nTopic: {lda_model.print_topic(index, num_of_topics)}")


Topic match score: 0.9391193389892578 
Topic: 0.050*"model" + 0.041*"topic" + 0.018*"data" + 0.012*"distribut" + 0.009*"text" + 0.009*"use" + 0.008*"latent" + 0.007*"dataset" + 0.007*"propos" + 0.007*"learn" + 0.006*"document" + 0.006*"perform" + 0.006*"dirichlet" + 0.006*"base" + 0.006*"process" + 0.006*"cluster" + 0.006*"method" + 0.006*"algorithm" + 0.006*"label" + 0.005*"show"
Topic match score: 0.05277075618505478 
Topic: 0.044*"user" + 0.040*"recommend" + 0.021*"model" + 0.020*"item" + 0.014*"predict" + 0.013*"prefer" + 0.012*"factor" + 0.012*"learn" + 0.012*"system" + 0.012*"intent" + 0.011*"rate" + 0.011*"latent" + 0.011*"aspect" + 0.009*"data" + 0.009*"rank" + 0.009*"propos" + 0.008*"approach" + 0.007*"topic" + 0.007*"base" + 0.007*"featur"


For a TF/IDF model:

In [38]:
for index, score in sorted(lda_model[bow_corpus[test_doc]], key=lambda tup: -1*tup[1]):
    print(f"Topic match score: {score} \nTopic: {lda_model.print_topic(index, num_of_topics)}")


Topic match score: 0.9391206502914429 
Topic: 0.050*"model" + 0.041*"topic" + 0.018*"data" + 0.012*"distribut" + 0.009*"text" + 0.009*"use" + 0.008*"latent" + 0.007*"dataset" + 0.007*"propos" + 0.007*"learn" + 0.006*"document" + 0.006*"perform" + 0.006*"dirichlet" + 0.006*"base" + 0.006*"process" + 0.006*"cluster" + 0.006*"method" + 0.006*"algorithm" + 0.006*"label" + 0.005*"show"
Topic match score: 0.05276940390467644 
Topic: 0.044*"user" + 0.040*"recommend" + 0.021*"model" + 0.020*"item" + 0.014*"predict" + 0.013*"prefer" + 0.012*"factor" + 0.012*"learn" + 0.012*"system" + 0.012*"intent" + 0.011*"rate" + 0.011*"latent" + 0.011*"aspect" + 0.009*"data" + 0.009*"rank" + 0.009*"propos" + 0.008*"approach" + 0.007*"topic" + 0.007*"base" + 0.007*"featur"


In [39]:
print('Perplexity: ', lda_model.log_perplexity(bow_corpus))
print('Perplexity TFIDF: ', lda_model_tfidf.log_perplexity(bow_corpus))

Perplexity:  -7.082879970867767
Perplexity TFIDF:  -7.913023734021282


Visualization

In [40]:
%pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [41]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

bow_lda_data = gensimvis.prepare(lda_model, bow_corpus, dictionary)

pyLDAvis.display(bow_lda_data)

In [42]:
tfidf_lda_data = gensimvis.prepare(lda_model_tfidf, bow_corpus, dictionary)

pyLDAvis.display(bow_lda_data)

  and should_run_async(code)


In [45]:
from gensim.models import CoherenceModel


coherence_bow = CoherenceModel(model=lda_model, corpus=bow_corpus, dictionary=dictionary, texts=processed_docs, coherence='c_v')
coherence_tfidf = CoherenceModel(model=lda_model_tfidf, corpus=tfidf_corpus, dictionary=dictionary, texts=processed_docs, coherence='c_v')

print('BoW Coherence Score:', coherence_bow.get_coherence())
print('TF-IDF Coherence Score:', coherence_tfidf.get_coherence())

  and should_run_async(code)


BoW Coherence Score: 0.3469149371212898
TF-IDF Coherence Score: 0.6376804928540147


As seen above, the two models are very similiar (even though I would expect the TF/IDF to perform better as it managed to extract topics better (code cell 35)). Therefore I wanted to display theit coherence score. Here it is visible that the semantic similiarity among words within one topic significantly better for TF/IDF model.

Trying out on a new document (abstract of a NeurIPS 2024 article):

In [47]:
unseen_document = """How (dis)similar are the learning trajectories of vision-language models and children? Recent modeling work has attempted to understand the gap between models' and humans' data efficiency by constructing models trained on less data, especially multimodal naturalistic data. However, such models are often evaluated on adult-level benchmarks, with limited breadth in language abilities tested, and without direct comparison to behavioral data. We introduce DevBench, a multimodal benchmark comprising seven language evaluation tasks spanning the domains of lexical, syntactic, and semantic ability, with behavioral data from both children and adults. We evaluate a set of vision-language models on these tasks, comparing models and humans not only on accuracy but on their response patterns. Across tasks, models exhibit variation in their closeness to human response patterns, and models that perform better on a task also more closely resemble human behavioral responses. We also examine the developmental trajectory of OpenCLIP over training, finding that greater training results in closer approximations to adult response patterns. DevBench thus provides a benchmark for comparing models to human language development. These comparisons highlight ways in which model and human language learning processes diverge, providing insight into entry points for improving language models."""

bow_vector = dictionary.doc2bow(preprocess(unseen_document))

print("The lda_model output:")
pprint(lda_model[bow_vector])

print("\n\nNicer output:")
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

print("The lda_model_tfidf output:")
pprint(lda_model_tfidf[bow_vector])

print("\n\nNicer output:")
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

The lda_model output:
[(1, 0.059288334),
 (2, 0.105493456),
 (4, 0.22348009),
 (5, 0.055570226),
 (8, 0.25711903),
 (11, 0.101436965),
 (16, 0.14412959),
 (17, 0.04875516)]


Nicer output:
Score: 0.2576430141925812	 Topic: 0.053*"topic" + 0.053*"model" + 0.016*"document" + 0.015*"algorithm" + 0.012*"infer"
Score: 0.22482475638389587	 Topic: 0.040*"topic" + 0.035*"model" + 0.018*"word" + 0.018*"text" + 0.015*"document"
Score: 0.14156310260295868	 Topic: 0.035*"topic" + 0.029*"model" + 0.013*"user" + 0.011*"research" + 0.010*"use"
Score: 0.10591089725494385	 Topic: 0.028*"topic" + 0.019*"model" + 0.014*"social" + 0.013*"use" + 0.010*"analysi"
Score: 0.10050294548273087	 Topic: 0.075*"topic" + 0.047*"model" + 0.013*"text" + 0.012*"document" + 0.011*"method"
Score: 0.06003628298640251	 Topic: 0.028*"topic" + 0.026*"model" + 0.014*"document" + 0.011*"user" + 0.011*"inform"
Score: 0.05603882297873497	 Topic: 0.012*"analysi" + 0.011*"emot" + 0.010*"theori" + 0.010*"model" + 0.009*"respons"
Sc

  and should_run_async(code)
