# Embedded Topic Model (ETM)

In [1]:
import spacy
from octis.preprocessing.preprocessing import Preprocessing
from octis.models.ETM import ETM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real
import torch
import numpy as np
import pandas as pd
import os

In [2]:

current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
os.chdir(parent_directory)

In [3]:
from utils.embeddings import *
from preprocessing.clean_text import *

In [None]:
# Setting up the preprocessing 
nlp = spacy.load('en_core_web_sm')


In [None]:
extract_corpus_and_labels_from_songs_csv(csv_input_path = 'data/raw/cleaned_train_lyrics.csv', output_path = 'data/input')

In [None]:
stopwords_list = list(nlp.Defaults.stop_words)
stopwords_list.extend(CUSTOM_STOPWORDS)

In [None]:
preprocessor = Preprocessing(lowercase=True,
                             min_df = 10,
                             max_df = 0.85,
                             remove_punctuation=True,
                             punctuation=string.punctuation,
                             remove_numbers=True,
                             lemmatize= True,
                             stopword_list=stopwords_list,
                             min_chars=3,
                             min_words_docs=10,
                             language='english',
                             split=True,
                             verbose=True)

In [None]:

# Checks if dataset folder exists in processed, if not it processes the dataset. Otherwise it loads it
if not os.path.exists('data/processed/dataset'):
    dataset = preprocessor.preprocess_dataset(documents_path = 'data/input/corpus.txt',labels_path = 'data/input/labels.txt')
    dataset.save('data/processed/dataset')
else:
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder('data/processed/dataset')

## Embeddings

In [5]:
embeddings_file = 'data/input/embeddings.pkl'

In [7]:
embeddings = create_embeddings(df, embeddings_file, force_creation=True, batch_size=64)

Embeddings not found, generating...


pytorch_model.bin:  91%|######### | 398M/438M [00:00<?, ?B/s]

cpu


KeyboardInterrupt: 

In [8]:
dataset.get_metadata()

{'total_documents': 2225,
 'words_document_mean': 120.12,
 'vocabulary_length': 2949,
 'last-training-doc': 1557,
 'last-validation-doc': 1891,
 'preprocessing-info': 'Steps:\n  remove_punctuation\n  lemmatization\n  remove_stopwords\n  filter_words\n  remove_docs\nParameters:\n  removed words with less than 0.005 or more than 0.35 documents with an occurrence of the word in corpus\n  removed documents with less than 5 words',
 'info': {'name': 'BBC_News'},
 'labels': ['business', 'entertainment', 'politics', 'sport', 'tech'],
 'total_labels': 5}

## ETM model without optimization

In [None]:
N_TOPICS = 5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
#model = ETM(num_topics = 10, vocab_size=3000, t_hidden_size=800, theta_act = 'relu', embeddings = None, train_embeddings = True, enc_drop = 0.0, rho_size= 5, emb_size= 10)7


In [9]:
model = ETM(num_topics= N_TOPICS,  
        num_epochs=100, 
        t_hidden_size=800, 
        rho_size=384, 
        embedding_size=384, 
        activation='relu', 
        dropout=0.5, 
        lr=0.005, 
        optimizer='adam', 
        batch_size=128, 
        clip=0.0, 
        wdecay=1.2e-6, 
        bow_norm=1, 
        device=device, 
        train_embeddings=False, 
        embeddings_path=embeddings_file, 
        embeddings_type='pickle', 
        binary_embeddings=True, 
        headerless_embeddings=False, 
        use_partitions=True)
            

In [10]:
output = model.train_model(dataset, top_words=20)

model: ETM(
  (t_drop): Dropout(p=0.5, inplace=False)
  (theta_act): ReLU()
  (rho): Linear(in_features=300, out_features=2949, bias=False)
  (alphas): Linear(in_features=300, out_features=20, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=2949, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=20, bias=True)
  (logsigma_q_theta): Linear(in_features=800, out_features=20, bias=True)
)
****************************************************************************************************
Epoch----->1 .. LR: 0.005 .. KL_theta: 0.06 .. Rec_loss: 929.83 .. NELBO: 929.89
****************************************************************************************************
****************************************************************************************************
VALIDATION .. LR: 0.005 .. KL_theta: 0.01 .. Rec_loss: 209.53 .. NELBO: 209.54
**

In [11]:
td, ch = TopicDiversity(topk=10), Coherence(topk=20, measure = 'c_v') # Initialize metric

In [12]:
print("Coherence: ", ch.score(output))
print("Topic Diversity: ", td.score(output))

Coherence:  0.48960463965790246
Topic Diversity:  0.28
