In [1]:
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
import nltk 

nltk.download('stopwords')  # stopwords to be removed during preprocessing

# The file that contains our documents (one document per line)
text_file = "D:/Research/UBC/covid19/topic_modeling/EU_UK_clean_11.txt"

# Load the documents inside a context manager so the file handle is closed
# as soon as reading finishes instead of being left open.
with open(text_file, encoding="utf-8") as fr:
    documents = [line.strip() for line in fr]

# Simple preprocessing that removes stopwords and punctuation
sp = WhiteSpacePreprocessing(documents, stopwords_language='english')

# preprocess() returns the preprocessed and the unpreprocessed documents, a vocab
# with the most frequent 2K tokens (these tokens are used to represent the topics),
# and the indices of the documents that were retained.
preprocessed_documents, unpreprocessed_documents, vocab, retained_indices = sp.preprocess()

C:\Users\ahmed.aburaed\Anaconda3\envs\longformer\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\ahmed.aburaed\Anaconda3\envs\longformer\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmed.aburaed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

# Instantiate the data-preparation helper around a contextualized embedding model
tp = TopicModelDataPreparation("distiluse-base-multilingual-cased")

# Build the training set: the unpreprocessed text feeds the contextual embeddings,
# the preprocessed text feeds the bag-of-words representation.
training_dataset = tp.fit(
    text_for_contextual=unpreprocessed_documents,
    text_for_bow=preprocessed_documents,
)

# Model hyper-parameters:
#   bow_size        - size of our vocabulary
#   contextual_size - size of the embedding that comes from the contextualized model
#   n_components    - number of topics we are going to get from the model
#   num_epochs      - number of epochs we are going to train our model for
ctm = ZeroShotTM(
    bow_size=len(tp.vocab),
    contextual_size=512,
    n_components=5,
    num_epochs=100,
)
ctm.fit(training_dataset)  # run the training

Batches:   0%|          | 0/199 [00:00<?, ?it/s]

Epoch: [100/100]	 Seen Samples: [3961600/3964600]	Train Loss: 80.7786277314958	Time: 0:00:46.960798: : 100it [1:21:09, 48.69s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 620/620 [00:46<00:00, 13.44it/s]


In [3]:
# Display the word list of every learned topic (30 words requested per topic).
ctm.get_topic_lists(30)

[['diagnosed',
  'sleep',
  'fearnley',
  'time',
  '12mths',
  'neurological',
  'counting',
  'word',
  'little',
  'really',
  'think',
  'get',
  'people',
  'long',
  'walking',
  'covid',
  'babyrelentless',
  'dr2nisreenalwan',
  'stairs',
  'positive',
  'winded',
  'conception',
  'thinking',
  'everyone',
  'ruin',
  'testing',
  'plea',
  'terrible',
  '000'],
 ['lockdown',
  'double',
  '15',
  'single',
  'tests',
  'person',
  'clinics',
  'community',
  'live',
  'broken',
  'small',
  'thanks',
  'spent',
  'daughter',
  'gp',
  'asking',
  'hours',
  'advice',
  'diagnosis',
  '2022',
  'treatments',
  'personal',
  'hopefully',
  'microclots',
  'tired',
  'gets',
  'referred',
  'son',
  'pots',
  'skypanda476'],
 ['bad',
  'yates',
  'imagine',
  'taking',
  'maths',
  'kit',
  'reducing',
  'mitigations',
  'immunity',
  'end',
  'herd',
  'return',
  'telling',
  'told',
  'airborne',
  'implementing',
  'vaccines',
  'sars',
  'achieving',
  'delta',
  'amalgamqu

In [4]:
import os

# Export the 30-word list of every topic to a text file as a nested-list literal,
# one word per line, e.g. [['w1',\n'w2'],\n['x1',\n'x2']].
topics_path = './EU_UK-december-2021/topics.txt'
os.makedirs(os.path.dirname(topics_path), exist_ok=True)  # open() does not create missing folders

topic_lists = ctm.get_topic_lists(30)

# repr() puts BOTH quotes around each word; the previous hand-built string dropped the
# opening quote of every word after the first and left a dangling "['" at the end.
topic_blocks = [
    '[' + ',\n'.join(repr(str(word)) for word in topic) + ']'
    for topic in topic_lists
]

# Explicit UTF-8 so non-ASCII tokens do not depend on the platform default encoding.
with open(topics_path, 'wt', encoding='utf-8') as f:
    f.write('[' + ',\n'.join(topic_blocks) + ']')

In [5]:
# Persist the trained model under the given models directory.
ctm.save(models_dir="./EU_UK-december-2021")



In [6]:
# Reload a saved checkpoint from the models directory used by the save call above.
# NOTE(review): the sub-folder name appears to encode the hyper-parameters (e.g. nc_5),
# and epoch=99 presumably selects the last of the 100 training epochs (zero-based) - confirm.
ctm.load("./EU_UK-december-2021/contextualized_topic_model_nc_5_tpm_0.0_tpv_0.8_hs_prodLDA_ac_(100, 100)_do_softplus_lr_0.2_mo_0.002_rp_0.99", epoch=99)



In [7]:
# A single unseen document (a short Spanish greeting) to run through the trained model
testing_text_for_contextual = ["hola, bienvenido"]

# Only the contextual text is supplied when transforming the unseen document
testing_dataset = tp.transform(text_for_contextual=testing_text_for_contextual)

# n_samples: how many times to sample the distribution (see the doc)
ctm.get_doc_topic_distribution(testing_dataset, n_samples=5)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.41s/it]


array([[0.06424941, 0.5361956 , 0.26879704, 0.08240427, 0.04835371]],
      dtype=float32)

In [8]:
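# Disabled: topic_id=19 is out of range for the 5-topic model trained above (valid ids are presumably 0-4).
#ctm.get_wordcloud(topic_id=19, n_words=20)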

In [9]:
from contextualized_topic_models.evaluation.measures import CoherenceNPMI

# Re-read the corpus from the same `text_file` defined in the first cell (instead of
# repeating the absolute path) and split every line on whitespace for NPMI.
# NOTE(review): these are the raw file tokens, not `preprocessed_documents`; confirm the
# reference corpus matches the vocabulary the topic words were drawn from.
with open(text_file, "r", encoding="utf8") as fr:
    texts = [doc.split() for doc in fr.read().splitlines()]  # one token list per line

# NPMI coherence computed over the top-5 words of each topic
npmi = CoherenceNPMI(texts=texts, topics=ctm.get_topic_lists(5))
npmi.score(topk=5)

-0.18295186963533733

In [10]:
from contextualized_topic_models.evaluation.measures import CoherenceCV

# Re-read the corpus from the same `text_file` defined in the first cell (instead of
# repeating the absolute path) and split every line on whitespace for CoherenceCV.
# NOTE(review): these are the raw file tokens, not `preprocessed_documents`; confirm the
# reference corpus matches the vocabulary the topic words were drawn from.
with open(text_file, "r", encoding="utf8") as fr:
    texts = [doc.split() for doc in fr.read().splitlines()]  # one token list per line

# CV coherence computed over the top-5 words of each topic
cv = CoherenceCV(texts=texts, topics=ctm.get_topic_lists(5))
cv.score(topk=5)

0.44063373762603186

In [11]:
from contextualized_topic_models.evaluation.measures import CoherenceWordEmbeddings

# Word-embedding coherence measure, scored on the top-5 words of each topic.
# NOTE(review): presumably relies on pretrained word embeddings loaded by the library - confirm.
we = CoherenceWordEmbeddings(topics=ctm.get_topic_lists(5))
we.score(topk=5)

0.109532535

In [12]:
from contextualized_topic_models.evaluation.measures import TopicDiversity

# Topic diversity measure, scored on the top-5 words of each topic.
td = TopicDiversity(topics=ctm.get_topic_lists(5))
td.score(topk=5)

1.0

In [13]:
from contextualized_topic_models.evaluation.measures import InvertedRBO

# Inverted RBO measure, scored on the top-5 words of each topic.
irbo = InvertedRBO(topics=ctm.get_topic_lists(5))
irbo.score(topk=5)

1.0

In [14]:
import pyLDAvis as vis

# Ask the trained model for its data in the form pyLDAvis expects; the result is
# unpacked below as keyword arguments to vis.prepare().
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=5)

# Build the pyLDAvis visualisation object and show it as this cell's output.
ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

100%|████████████████████████████████████████████████████████████████████████████████| 620/620 [01:06<00:00,  9.37it/s]


In [15]:
# Write the prepared visualisation to an HTML file in the same folder used for the saved model.
vis.save_html(ctm_pd, './EU_UK-december-2021/EU_UK-december-2021ctm5t100e.html')

  vis.save_html(ctm_pd, './EU_UK-december-2021/EU_UK-december-2021ctm5t100e.html')
