# Embedded Topic Model (ETM)

In [1]:
from octis.models.ETM import ETM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
import os

In [2]:

current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
os.chdir(parent_directory)

In [3]:
from utils.embeddings import *

In [4]:
dataset_path = 'data/raw/cleaned_train_lyrics.csv'
df = pd.read_csv(dataset_path)
df = df.drop(columns = ['Unnamed: 0'])
df = df.rename(columns = {'Lyric':'lyrics'})
df = df.sample(frac=0.001).reset_index(drop=True) # Uncomment this line to sample a fraction of the dataset

## Embeddings

In [5]:
embeddings_file = 'data/input/embeddings.pkl'

In [6]:
embeddings = create_embeddings(df, embeddings_file, force_creation=True, batch_size=64)

Embeddings not found, generating...


config.json:   0%|          | 0.00/756 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

cpu
Embeddings saved to data/input/embeddings.pkl


In [7]:
dataset = Dataset()  
dataset.fetch_dataset("BBC_News")
#dataset.load_custom_dataset_from_folder("data/processed") # Our custom preprocessed dataset


In [8]:
dataset.get_metadata()

{'total_documents': 2225,
 'words_document_mean': 120.12,
 'vocabulary_length': 2949,
 'last-training-doc': 1557,
 'last-validation-doc': 1891,
 'preprocessing-info': 'Steps:\n  remove_punctuation\n  lemmatization\n  remove_stopwords\n  filter_words\n  remove_docs\nParameters:\n  removed words with less than 0.005 or more than 0.35 documents with an occurrence of the word in corpus\n  removed documents with less than 5 words',
 'info': {'name': 'BBC_News'},
 'labels': ['business', 'entertainment', 'politics', 'sport', 'tech'],
 'total_labels': 5}

In [9]:
#model = ETM(num_topics = 10, vocab_size=3000, t_hidden_size=800, theta_act = 'relu', embeddings = None, train_embeddings = True, enc_drop = 0.0, rho_size= 5, emb_size= 10)7
model = ETM(num_topics= 20)

In [10]:
output = model.train_model(dataset)

model: ETM(
  (t_drop): Dropout(p=0.5, inplace=False)
  (theta_act): ReLU()
  (rho): Linear(in_features=300, out_features=2949, bias=False)
  (alphas): Linear(in_features=300, out_features=20, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=2949, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=20, bias=True)
  (logsigma_q_theta): Linear(in_features=800, out_features=20, bias=True)
)
****************************************************************************************************
Epoch----->1 .. LR: 0.005 .. KL_theta: 0.06 .. Rec_loss: 929.83 .. NELBO: 929.89
****************************************************************************************************
****************************************************************************************************
VALIDATION .. LR: 0.005 .. KL_theta: 0.01 .. Rec_loss: 209.53 .. NELBO: 209.54
**

In [11]:
td, ch = TopicDiversity(topk=10), Coherence(topk=10, measure = 'c_v') # Initialize metric

In [12]:
print("Coherence: ", ch.score(output))
print("Topic Diversity: ", td.score(output))

Coherence:  0.48960463965790246
Topic Diversity:  0.28
