# Embedded Topic Model (ETM)

In [1]:
import spacy
import string
from octis.preprocessing.preprocessing import Preprocessing
from octis.models.ETM import ETM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real
import torch
import numpy as np
import pandas as pd
import os
from wordcloud import WordCloud
from gensim.models import KeyedVectors

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Move the working directory one level up from where the kernel started; the
# relative 'data/...' and 'model/...' paths used in later cells are resolved
# against it (presumably the project root — confirm against the repo layout).
#
# The starting directory is remembered in `_notebook_start_directory`, so
# re-running this cell in the same kernel is idempotent instead of climbing one
# more level on every execution.
_notebook_start_directory = globals().get('_notebook_start_directory', os.getcwd())

current_directory = _notebook_start_directory
parent_directory = os.path.dirname(current_directory)
os.chdir(parent_directory)

In [3]:
# Toggle between the reduced dataset (True) and the full dataset (False).
SMALL = False

# Path sets for both dataset variants, keyed by the SMALL flag.
# Order: (data_path, corpus_path, label_path, embs_path, proc_path)
_PATHS_BY_SIZE = {
    True: (
        'data/input_small',
        'data/input_small/corpus.txt',
        'data/input_small/labels.txt',
        'data/input_small/embeddings.pkl',
        'data/processed_small/dataset',
    ),
    False: (
        'data/input',
        'data/input/corpus.txt',
        'data/input/labels.txt',
        'data/input/embeddings.pkl',
        'data/processed/dataset',
    ),
}

data_path, corpus_path, label_path, embs_path, proc_path = _PATHS_BY_SIZE[bool(SMALL)]

In [4]:
# Project-local helpers.
# NOTE(review): wildcard imports hide where names come from. The visible cells
# below use load_vectors, extract_corpus_and_labels_from_songs_csv,
# compute_embeddings, load_pickle and CUSTOM_STOPWORDS without defining them,
# so they presumably come from these two modules — TODO confirm which module
# provides which name and import them explicitly.
from utils.embeddings import *
from preprocessing.clean_text import *

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [5]:
# NOTE(review): `test` is not referenced by any later cell visible here, and the
# recorded output shows this load stopping at 17% of 2,000,000 vectors —
# presumably a leftover experiment; confirm whether this cell is still needed.
test = load_vectors('model/cc.en.300.vec/cc.en.300.vec')

Loading vectors:  17%|█▋        | 345675/2000000 [00:18<01:00, 27435.91it/s]

: 

## Data setup

In [None]:
# Build the corpus/labels inputs under `data_path` from the raw lyrics CSV.
# `frac=0.25` is presumably the fraction of rows sampled — TODO confirm in the helper.
extract_corpus_and_labels_from_songs_csv(
    csv_input_path='data/raw/cleaned_train_lyrics.csv',
    output_path=data_path,
    frac=0.25,
)

## Soft preprocessing & Embeddings

In [None]:
# Compute word embeddings for the corpus and write them to `embs_path`.
# dim=100 has to stay in sync with rho_size / embedding_size given to ETM below.
compute_embeddings(
    corpus_path,
    embs_path,
    dim=100,
)

In [5]:
# Load the stored embeddings from `embs_path`.
# The next cell iterates over this object and calls `.split()` on every item, so
# it is presumably an iterable of text lines shaped "<word> <v1> <v2> ..." — TODO confirm.
embeddings = load_pickle(embs_path)

In [None]:
# Parse every embedding entry into a word -> float vector mapping.
# Each entry is split on whitespace: first token is the word, the rest are the
# vector components.
vectors = {}

for entry in embeddings:
    tokens = entry.split()
    vectors[tokens[0]] = np.array(tokens[1:]).astype(float)

# Sanity check: show the dimensionality of one vector.
print(len(vectors['the']))

## Preprocessing


In [7]:
# OCTIS corpus-cleaning pipeline. Options are grouped by purpose; see the OCTIS
# Preprocessing documentation for the exact semantics of each one.
preprocessor = Preprocessing(
    # --- text normalisation ---
    lowercase=True,
    remove_punctuation=True,
    punctuation=string.punctuation,
    remove_numbers=True,
    lemmatize=True,
    language='english',
    # --- vocabulary filtering ---
    stopword_list=CUSTOM_STOPWORDS,
    min_chars=3,
    min_df=20,
    max_df=0.80,
    # --- document filtering / partitioning ---
    min_words_docs=10,
    split=True,
    # --- logging ---
    verbose=True,
)

In [8]:

# Reuse the processed dataset when `proc_path` already exists on disk; otherwise
# run the preprocessing pipeline once and save the result there for later runs.
if os.path.exists(proc_path):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(proc_path)
else:
    dataset = preprocessor.preprocess_dataset(
        documents_path=corpus_path,
        labels_path=label_path,
    )
    dataset.save(proc_path)

In [None]:
# Display the dataset's metadata as the cell output (quick check that the
# load/preprocess step above produced a usable dataset).
dataset.get_metadata()

## ETM model without optimization

In [None]:
# Number of topics the ETM model is trained with.
N_TOPICS = 10

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Only query CUDA device details when CUDA is actually in use: calling
# torch.cuda.current_device() without CUDA raises, which would make this cell
# fail on CPU-only machines despite the fallback above.
if device.type == 'cuda':
    current_device = torch.cuda.current_device()
    print(f"Currently using GPU: {current_device}")
    print(f"GPU Name: {torch.cuda.get_device_name(current_device)}")
else:
    current_device = None
    print("CUDA is not available; running on CPU.")

In [12]:
# Baseline ETM configuration (no hyper-parameter search).
model = ETM(
    # --- model size ---
    num_topics=N_TOPICS,
    t_hidden_size=400,
    rho_size=100,        # kept equal to the dim=100 used when computing embeddings
    embedding_size=100,  # kept equal to the dim=100 used when computing embeddings
    activation='relu',
    dropout=0.5,
    # --- optimisation ---
    num_epochs=10,
    lr=0.005,
    optimizer='adam',
    batch_size=64,
    clip=0.0,
    wdecay=1.2e-6,
    bow_norm=1,
    device=device,
    # --- word embeddings: loaded from `embs_path`, not trained ---
    train_embeddings=False,
    embeddings_path=embs_path,
    embeddings_type='pickle',
    binary_embeddings=True,
    headerless_embeddings=False,
    # --- data partitioning ---
    use_partitions=True,
)
            

In [None]:
# Train the model on the dataset. top_words=20 matches the topk=20 passed to the
# Coherence metric below; presumably the output must hold at least that many
# words per topic — confirm against the OCTIS docs.
output = model.train_model(dataset, top_words=20)

In [14]:
# Initialize the evaluation metrics.
# Topic diversity is computed with topk=10.
td = TopicDiversity(topk=10)
# Coherence uses the 'c_npmi' measure against the dataset corpus, with topk=20.
ch = Coherence(
    texts=dataset.get_corpus(),
    topk=20,
    measure='c_npmi',
)

In [None]:
# Report both metrics for the trained model, coherence first.
for label, metric in (("Coherence: ", ch), ("Topic Diversity: ", td)):
    print(label, metric.score(output))