# Embedded Topic Model (ETM)

In [1]:
import spacy
import string
from octis.preprocessing.preprocessing import Preprocessing
from octis.models.ETM import ETM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real
import torch
import numpy as np
import pandas as pd
import os
from wordcloud import WordCloud

In [2]:

# Step the process working directory up one level (presumably to the project
# root, so that the relative `data/...` paths and local packages used below
# resolve — TODO confirm).
# NOTE: every execution of this cell climbs one more level, so re-running it
# moves the working directory further up.
current_directory = os.getcwd()
parent_directory = os.path.split(current_directory)[0]
os.chdir(parent_directory)

In [3]:
from utils.embeddings import *
from preprocessing.clean_text import *

In [None]:
# Load the raw lyrics CSV, drop the 'Unnamed: 0' column, rename 'Lyric' to
# 'lyrics', and keep a random 1% sample with a fresh 0..n-1 index.
# NOTE: no random_state is passed to sample(), so each run draws a different
# subset.
file_path = "data/raw/cleaned_train_lyrics.csv"
df = (
    pd.read_csv(file_path)
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"Lyric": "lyrics"})
    .sample(frac=0.01)
    .reset_index(drop=True)
)

## Embeddings

In [9]:
# Pickle file holding the embeddings; passed to create_embeddings() and later
# handed to the ETM model as `embeddings_path`.
embeddings_file = "data/input/embeddings.pkl"

In [None]:
# Display the dataset metadata.
# NOTE(review): in file order `dataset` is only assigned in the Preprocessing
# section further down, so that cell has to be executed before this one.
dataset.get_metadata()

In [None]:
# Length of the corpus returned by the dataset (presumably the number of
# documents — TODO confirm).
# NOTE(review): relies on `dataset`, which is only assigned in the
# Preprocessing section further down.
len(dataset.get_corpus())

In [12]:
# Get the first value of the metadata dict, which contains the number of documents


In [None]:
# Build the embeddings for the sampled lyrics DataFrame and store them at
# `embeddings_file`; force_creation=True presumably regenerates them even if
# the file already exists — TODO confirm against utils.embeddings.
embeddings = create_embeddings(
    df,
    embeddings_file,
    batch_size=64,
    force_creation=True,
)

## Preprocessing


In [4]:
# Load the small English spaCy pipeline; its default stop-word set seeds the
# stop-word list built for preprocessing.
nlp = spacy.load("en_core_web_sm")


In [None]:
# Extract corpus and labels from a 0.01% sample of the songs CSV into
# data/input (presumably the corpus.txt / labels.txt files consumed by the
# preprocessing step — TODO confirm).
extract_corpus_and_labels_from_songs_csv(
    csv_input_path="data/raw/cleaned_train_lyrics.csv",
    output_path="data/input",
    frac=0.0001,
)

In [6]:
# spaCy's default English stop words followed by the project's custom ones.
stopwords_list = [*nlp.Defaults.stop_words, *CUSTOM_STOPWORDS]

In [7]:
# OCTIS preprocessing pipeline: lowercasing, punctuation/number removal,
# lemmatization and stop-word filtering with the combined stop-word list.
preprocessor = Preprocessing(
    lowercase=True,
    # Frequency bounds for vocabulary terms (presumably document frequency:
    # absolute count for min_df, proportion for max_df — TODO confirm).
    min_df=10,
    max_df=0.85,
    remove_punctuation=True,
    punctuation=string.punctuation,
    remove_numbers=True,
    lemmatize=True,
    stopword_list=stopwords_list,
    min_chars=3,
    min_words_docs=10,
    language='english',
    split=True,
    verbose=True,
)

In [8]:

# Reuse the processed dataset when its folder already exists; otherwise
# preprocess the corpus/labels files and save the result for later runs.
# NOTE: the cache is keyed only on the folder's existence, so delete
# data/processed/dataset after changing the input files or the preprocessor.
if os.path.exists('data/processed/dataset'):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder('data/processed/dataset')
else:
    dataset = preprocessor.preprocess_dataset(
        documents_path='data/input/corpus.txt',
        labels_path='data/input/labels.txt',
    )
    dataset.save('data/processed/dataset')

In [None]:
# Display the metadata of the dataset that was just preprocessed or loaded.
dataset.get_metadata()

## ETM model without optimization

In [None]:
# Number of topics the ETM model is asked to learn.
N_TOPICS = 5

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [None]:
#model = ETM(num_topics = 10, vocab_size=3000, t_hidden_size=800, theta_act = 'relu', embeddings = None, train_embeddings = True, enc_drop = 0.0, rho_size= 5, emb_size= 10)


In [9]:
# ETM configured with N_TOPICS topics and fixed (non-trainable) embeddings
# read from the pickle file at `embeddings_file`.
model = ETM(
    # Topic count and training schedule.
    num_topics=N_TOPICS,
    num_epochs=100,
    batch_size=128,
    # Network sizes; 384 presumably matches the dimensionality of the vectors
    # stored in `embeddings_file` — TODO confirm.
    t_hidden_size=800,
    rho_size=384,
    embedding_size=384,
    activation='relu',
    dropout=0.5,
    # Optimisation settings.
    optimizer='adam',
    lr=0.005,
    wdecay=1.2e-6,
    clip=0.0,
    bow_norm=1,
    device=device,
    # Embedding source.
    embeddings_path=embeddings_file,
    embeddings_type='pickle',
    train_embeddings=False,
    binary_embeddings=True,
    headerless_embeddings=False,
    use_partitions=True,
)
            

In [None]:
# Train the ETM on the dataset. top_words=20 presumably sets how many words
# are kept per topic in `output` (TODO confirm); the coherence metric in this
# notebook is configured with topk=20.
output = model.train_model(dataset, top_words=20)

In [11]:
# Evaluation metrics.
# Topic diversity over the top 10 words of each topic.
td = TopicDiversity(topk=10)
# C_v coherence over the top 20 words of each topic. When `texts` is omitted,
# OCTIS Coherence falls back to its default reference corpus, so coherence
# would be measured against unrelated documents. Pass the lyrics corpus so the
# score reflects the data the model was trained on.
ch = Coherence(texts=dataset.get_corpus(), topk=20, measure='c_v')

In [None]:
# Score the trained model's output with both metrics and report the results.
coherence_score = ch.score(output)
print("Coherence: ", coherence_score)
diversity_score = td.score(output)
print("Topic Diversity: ", diversity_score)