In [2]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.data_preparation import bert_embeddings_from_file
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [34]:
# Load data
data = pd.read_csv("./archive/abcnews-date-text.csv")
data = data[:100000]

# Prepare data
unpreprocessed_texts = data['headline_text'].tolist()

vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)

count = vectorizer.fit_transform(unpreprocessed_texts)
# processed_texts = vectorizer.transform(unpreprocessed_texts)

words = vectorizer.get_feature_names_out()
processed_texts_list = [' '.join(words[doc.nonzero()[1]]) for doc in count]



In [4]:

qt = TopicModelDataPreparation("all-mpnet-base-v2")
training_dataset = qt.fit(text_for_contextual=unpreprocessed_texts, text_for_bow=processed_texts_list)

Batches: 100%|██████████| 500/500 [10:06<00:00,  1.21s/it]


In [5]:
# CTM
n_topics = 10
ctm = CombinedTM(bow_size=len(qt.vocab), contextual_size=768, n_components=n_topics, num_epochs=2)

# Train CTM with the prepared dataset
ctm.fit(training_dataset)
ctm_top_words = ctm.get_topics(5)

Epoch: [2/2]	 Seen Samples: [199936/200000]	Train Loss: 48.45850321516948	Time: 0:01:11.062003: : 2it [02:33, 76.90s/it]
100%|██████████| 1563/1563 [00:41<00:00, 37.85it/s]


In [6]:
def print_ctm_top_words(topic_words, n_topics):
    for topic_idx in range(n_topics):
        print("Topic %d:" % topic_idx, end=' ')
        print(' '.join(topic_words[topic_idx]))
        # print('| # topic_count: %d' % ctm.get_topic_count()[topic_idx])

In [7]:
print("CTM Top Words:")
print_ctm_top_words(ctm_top_words, n_topics)

CTM Top Words:
Topic 0: govt pay act health vic
Topic 1: open cup world final tour
Topic 2: anderson beckham rann election crean
Topic 3: rates farmers drought rain trade
Topic 4: court man charged murder trial
Topic 5: police killed injured car missing
Topic 6: sars china road safety toll
Topic 7: iraq war iraqi troops bush
Topic 8: cadet unhealthy airlifted sierra somerset
Topic 9: council plan indigenous new group


In [8]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

tokenized_texts = [word_tokenize(text) for text in unpreprocessed_texts]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lmh23\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Dictionary and Coherence Model
def get_coherence_score(documents, topics):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    cm = CoherenceModel(topics=topics, texts=documents, dictionary=dictionary, coherence='c_v')
    return cm.get_coherence()

def get_ctm_topics(model, n_top_words):
    topics = []
    topic_word_matrix = model.get_topic_lists(n_top_words)
    for topic_words in topic_word_matrix:
        topics.append(" ".join(topic_words))
    return topics


# Example usage:
print(get_ctm_topics(ctm, 5))
print("Coherence Score:", get_coherence_score(tokenized_texts, get_ctm_topics(ctm, 5)))


['govt pay act health vic', 'open cup world final tour', 'anderson beckham rann election crean', 'rates farmers drought rain trade', 'court man charged murder trial', 'police killed injured car missing', 'sars china road safety toll', 'iraq war iraqi troops bush', 'cadet unhealthy airlifted sierra somerset', 'council plan indigenous new group']
Coherence Score: 0.350212131425678


In [36]:
# coverage for CTM
topic_distribution = ctm.get_thetas(training_dataset)

topic_presence = (topic_distribution > 0.1).sum(axis=0)
coverage = topic_presence / len(training_dataset)

print("Coverage for CTM:",coverage)


100%|██████████| 1563/1563 [00:39<00:00, 39.87it/s] 

Coverage for CTM: [0.2068  0.18991 0.3686  0.19378 0.16729 0.18565 0.21838 0.19559 0.24956
 0.22758]





In [37]:
print(coverage.mean())

0.22031399999999998
