# Notes

1. Figure out which BERTopic call returns the (word, weight) pairs for each topic, and extract the same per-word weights for LDA and CTM

2. Why are the topics so monolingual? This really needs to be answered. One option is to change the embedding model; another is to take the sentence embeddings for each language and subtract that language's mean sentence embedding from each of them


Steps for item 2 above:
1. Embed each of my sentences (so we have an embedding for each of the sentences)
2. Partition by language
3. Take the average embedding for each language (centroid)
4. When we retrieve the embedding for a given sentence, subtract off the average for that language
5. Use this embedding for the clustering instead

# Installations

In [23]:
# !pip install contextualized-topic-models==2.3.0
# !pip install datasets
# !pip install pyldavis
# !pip install bertopic

In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords as stop_words
from datasets import load_dataset
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import string
from gensim.utils import deaccent
import warnings
from sentence_transformers import SentenceTransformer
import scipy.sparse
from contextualized_topic_models.datasets.dataset import CTMDataset
from sklearn.preprocessing import OneHotEncoder
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.models.ctm import ZeroShotTM
from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import common_texts
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, InvertedRBO
from bertopic import BERTopic

# Variables Changing

In [136]:
# Switch between the English-only setup (True) and the all-languages setup (False).
is_combined = True
perc = 0.10 # 10% of dataset

# Fetch the NLTK stopword corpus that the stopword list below is read from.
nltk.download('stopwords')

# The dataset configuration name, the stopword list and the sentence-embedding
# model name are all derived from is_combined.
language = 'en' if is_combined else 'all_languages'
stopwords = list(stop_words.words('english')) if is_combined else list(stop_words.words(stop_words.fileids())) # from every language
embedding_model = 'paraphrase-distilroberta-base-v2' if is_combined else 'paraphrase-multilingual-mpnet-base-v2'

num_epochs_ctm = 20  # passed as num_epochs when the CTM model is built
num_topics = 50  # number of topics requested from every topic model
num_topics_word = 700  # number of words kept per topic

[nltk_data] Downloading package stopwords to /home/arnav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Dataset

In [3]:
# Load the amazon_reviews_multi dataset for the configuration chosen in `language`.
dataset = load_dataset('amazon_reviews_multi', language)

Found cached dataset amazon_reviews_multi (/home/arnav/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def data(part):
    """Return one split of the loaded dataset together with its review texts.

    :param part: key of the split to read from the module-level ``dataset``
        (the notebook passes 'train', 'test' and 'validation')
    :return: tuple ``(frame, reviews)`` - the split converted with ``to_pandas()``
        and a list built from its ``review_body`` column
    """
    frame = dataset[part].to_pandas()
    return frame, list(frame['review_body'])

In [5]:
# Materialise every split as (DataFrame, list of review texts).
train, review_body = data('train')
test, review_body_test = data('test')
validation, review_body_validation = data('validation')

In [6]:
def make_review_body(data, perc):
    """Draw a reproducible shuffled sample of ``data`` and rename it for downstream use.

    The row positions are shuffled with a fixed seed (``random_state=42``) and the
    first ``int(len(data) * perc)`` of them are kept.

    :param data: DataFrame with a ``review_body`` column
    :param perc: fraction of rows to keep (1 keeps every row, still shuffled)
    :return: tuple ``(sample, review_body)`` - the sampled frame, with a new
        ``message_id`` column holding the 0-based position in the sample and
        ``review_body`` renamed to ``message``, plus the list of sampled texts
    """
    n_rows = len(data)
    shuffled_positions = sklearn.utils.shuffle(np.arange(n_rows), random_state=42)
    kept_positions = shuffled_positions[0:int(n_rows * perc)]

    sample = data.iloc[kept_positions].reset_index(drop=True)
    review_body = list(sample['review_body'])

    # reset_index() turns the fresh 0..n-1 index into an 'index' column, which
    # then becomes the message id.
    sample = sample.reset_index().rename(columns={'index': 'message_id', 'review_body': 'message'})
    return sample, review_body

In [7]:
# Training keeps a `perc` fraction of the shuffled rows; test and validation keep
# every row (perc=1) but are still shuffled by make_review_body.
temp_train, review_body = make_review_body(train, perc)
temp_test, test_review_body = make_review_body(test, 1)
temp_validation, validation_review_body = make_review_body(validation, 1)

## SQL to run DLATK Mallet LDA

In [79]:
# temp_train = pd.concat([temp_train, temp_test])
# review_body = pd.concat([pd.Series(review_body), pd.Series(test_review_body)])

# temp_train_to_save = pd.concat([temp_train, temp_test])[['message_id', 'message']]
# temp_test_to_save = temp_test[['message_id', 'message']]
# temp_validation_to_save = temp_validation[['message_id', 'message']]

In [80]:
# temp_train_to_save.to_csv('amazon_multi_train_ten.csv')
# temp_test_to_save.to_csv('amazon_multi_test_ten.csv')
# temp_validation_to_save.to_csv('amazon_multi_validation_ten.csv')

# temp_train_to_save.to_csv('amazon_eng_train_ten.csv')
# temp_test_to_save.to_csv('amazon_eng_test_ten.csv')
# temp_validation_to_save.to_csv('amazon_eng_validation_ten.csv')

In [3]:
# import pandas as pd
# import sqlalchemy

# temp_train_from_save = pd.read_csv('amazon_eng_train_ten.csv')
# temp_train_from_save = temp_train_from_save[['message_id', 'message']]

# # temp_test_from_save = pd.read_csv('amazon_eng_test_ten.csv')
# # temp_test_from_save = temp_test_from_save[['message_id', 'message']]

# # temp_validation_from_save = pd.read_csv('amazon_eng_validation_ten.csv')
# # temp_validation_from_save = temp_validation_from_save[['message_id', 'message']]

# db = sqlalchemy.engine.url.URL(drivername='mysql', host='127.0.0.1', database='arnav', query={'read_default_file': '~/.my.cnf', 'charset':'utf8mb4'})
# engine = sqlalchemy.create_engine(db)

# temp_train_from_save.to_sql('amazon_eng_train_ten', engine, if_exists='replace', index=False) 
# # temp_test_from_save.to_sql('amazon_eng_test_ten', engine, if_exists='replace', index=False) 
# # temp_validation_from_save.to_sql('amazon_eng_validation_ten', engine, if_exists='replace', index=False) 

# Preprocessing


In [8]:
class WhiteSpacePreprocessingStopwords():
    """
    Provides a very simple preprocessing script that filters infrequent tokens from text
    """

    def __init__(self, documents, stopwords_list=None, vocabulary_size=2000, max_df=1.0, min_words=1,
                 remove_numbers=True):
        """

        :param documents: list of strings
        :param stopwords_list: list of the stopwords to remove
        :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents
        :param max_df : float or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float in range [0.0, 1.0], the parameter represents a proportion of
        documents, integer absolute counts.
        This parameter is ignored if vocabulary is not None.
        :param min_words: int, default=1. Documents with less words than the parameter
        will be removed
        :param remove_numbers: bool, default=True. If true, numbers are removed from docs
        """
        self.documents = documents
        # Always a set, so membership tests are O(1) and the attribute has one type.
        self.stopwords = set(stopwords_list) if stopwords_list is not None else set()

        self.vocabulary_size = vocabulary_size
        self.max_df = max_df
        self.min_words = min_words
        self.remove_numbers = remove_numbers

    def preprocess(self):
        """
        Note that if after filtering some documents do not contain words we remove them. That is why we return also the
        list of unpreprocessed documents.

        Steps: lower-case and de-accent, replace punctuation (and optionally digits) with spaces, drop stopwords,
        keep only the tokens retained by a CountVectorizer limited to ``vocabulary_size`` features, then drop
        documents that are empty or have fewer than ``min_words`` remaining words.

        :return: preprocessed documents, unpreprocessed documents, the vocabulary list and the indices (into the
            original ``documents``) of the documents that were retained
        """
        # Build the translation tables once instead of once per document.
        punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        digits_to_space = str.maketrans(string.digits, ' ' * len(string.digits))

        cleaned_docs = []
        for doc in self.documents:
            doc = deaccent(doc.lower()).translate(punctuation_to_space)
            if self.remove_numbers:
                doc = doc.translate(digits_to_space)
            cleaned_docs.append(' '.join(w for w in doc.split() if w not in self.stopwords))

        # The vectorizer is only used to pick the vocabulary (most frequent terms, subject to max_df).
        vectorizer = CountVectorizer(max_features=self.vocabulary_size, max_df=self.max_df)
        vectorizer.fit(cleaned_docs)
        kept_vocabulary = set(vectorizer.get_feature_names_out())

        cleaned_docs = [' '.join(w for w in doc.split() if w in kept_vocabulary)
                        for doc in cleaned_docs]

        preprocessed_docs, unpreprocessed_docs, retained_indices = [], [], []
        for i, doc in enumerate(cleaned_docs):
            # Count words, not characters: min_words is documented as a word threshold.
            n_words = len(doc.split())
            if n_words > 0 and n_words >= self.min_words:
                preprocessed_docs.append(doc)
                unpreprocessed_docs.append(self.documents[i])
                retained_indices.append(i)

        # Sorted so the returned vocabulary has a reproducible order.
        vocabulary = sorted({item for doc in preprocessed_docs for item in doc.split()})

        return preprocessed_docs, unpreprocessed_docs, vocabulary, retained_indices


# TopicModelDataPreparation

In [9]:
def get_bag_of_words(data, min_length):
    """
    Creates the bag of words

    Every row of ``data`` is stripped of its ``None`` entries; rows whose remaining
    values sum to zero are skipped, the others are turned into a count vector with
    ``np.bincount`` (at least ``min_length`` long). The count vectors are returned
    stacked as a scipy CSR matrix.
    """
    count_rows = []
    for row in data:
        present = row[row != np.array(None)]
        if np.sum(present) != 0:
            count_rows.append(np.bincount(present.astype('int'), minlength=min_length))

    return scipy.sparse.csr_matrix(count_rows)


def bert_embeddings_from_file(text_file, sbert_model_to_load, batch_size=200, max_seq_length=None):
    """
    Creates SBERT Embeddings from an input file, assumes one document per line

    :param text_file: path of a UTF-8 text file, one document per line
    :param sbert_model_to_load: model name or path handed to ``SentenceTransformer``
    :param batch_size: batch size used when encoding
    :param max_seq_length: if given, overrides the model's ``max_seq_length`` and
        triggers the document-length warning check; ``None`` keeps the model default
    :return: numpy array with the encoded documents
    """

    model = SentenceTransformer(sbert_model_to_load)

    with open(text_file, encoding="utf-8") as filino:
        texts = filino.readlines()

    # Only configure and check the limit when one was requested; comparing a
    # word count against None would raise a TypeError.
    if max_seq_length is not None:
        model.max_seq_length = max_seq_length
        check_max_local_length(max_seq_length, texts)

    return np.array(model.encode(texts, show_progress_bar=True, batch_size=batch_size))


def bert_embeddings_from_list(texts, sbert_model_to_load, batch_size=200, max_seq_length=None):
    """
    Creates SBERT Embeddings from a list

    :param texts: list of documents (strings) to encode
    :param sbert_model_to_load: model name or path handed to ``SentenceTransformer``
    :param batch_size: batch size used when encoding
    :param max_seq_length: if given, overrides the model's ``max_seq_length`` and
        triggers the document-length warning check; ``None`` keeps the model default
    :return: numpy array with the encoded documents
    """
    model = SentenceTransformer(sbert_model_to_load)

    # Only configure and check the limit when one was requested; comparing a
    # word count against None would raise a TypeError.
    if max_seq_length is not None:
        model.max_seq_length = max_seq_length
        check_max_local_length(max_seq_length, texts)

    return np.array(model.encode(texts, show_progress_bar=True, batch_size=batch_size))


def check_max_local_length(max_seq_length, texts):
    """
    Warn when the longest document has more whitespace-separated words than ``max_seq_length``.

    :param max_seq_length: the length limit to compare against; ``None`` means no
        limit was requested and the check is skipped
    :param texts: iterable of documents (strings); may be empty
    """
    if max_seq_length is None:
        return

    # default=0 keeps an empty collection from raising.
    max_local_length = max((len(t.split()) for t in texts), default=0)
    if max_local_length > max_seq_length:
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(f"the longest document in your collection has {max_local_length} words, the model instead "
                      f"truncates to {max_seq_length} tokens.")


class TopicModelDataPreparation:
    """
    Builds ``CTMDataset`` objects by pairing a bag-of-words matrix (from a
    ``CountVectorizer`` fitted in :meth:`fit`) with contextualized embeddings that are
    either computed by ``bert_embeddings_from_list`` or supplied by the caller.
    """

    def __init__(self, contextualized_model=None, show_warning=True, max_seq_length=128):
        """
        :param contextualized_model: name of the sentence-embedding model used when no custom embeddings are given
        :param show_warning: if True, :meth:`transform` warns when it is called without ``text_for_bow``
        :param max_seq_length: maximum sequence length forwarded to the embedding helper
        """
        self.contextualized_model = contextualized_model
        self.vocab = []
        self.id2token = {}
        self.vectorizer = None
        self.label_encoder = None
        self.show_warning = show_warning
        self.max_seq_length = max_seq_length

    def load(self, contextualized_embeddings, bow_embeddings, id2token, labels=None):
        """
        Wrap already computed embeddings, bag-of-words and vocabulary mapping in a ``CTMDataset``.
        """
        return CTMDataset(
            X_contextual=contextualized_embeddings, X_bow=bow_embeddings, idx2token=id2token, labels=labels)

    def fit(self, text_for_contextual, text_for_bow, labels=None, custom_embeddings=None):
        """
        This method fits the vectorizer and gets the embeddings from the contextual model

        :param text_for_contextual: list of unpreprocessed documents to generate the contextualized embeddings
        :param text_for_bow: list of preprocessed documents for creating the bag-of-words
        :param custom_embeddings: np.ndarray type object to use custom embeddings (optional).
        :param labels: list of labels associated with each document (optional).
        """

        if custom_embeddings is not None:
            assert len(text_for_contextual) == len(custom_embeddings)

            if text_for_bow is not None:
                assert len(custom_embeddings) == len(text_for_bow)

            if not isinstance(custom_embeddings, np.ndarray):
                raise TypeError("contextualized_embeddings must be a numpy.ndarray type object")

        if text_for_bow is not None:
            assert len(text_for_contextual) == len(text_for_bow)

        if self.contextualized_model is None and custom_embeddings is None:
            raise Exception("A contextualized model or contextualized embeddings must be defined")

        # TODO: this count vectorizer removes tokens that have len = 1, might be unexpected for the users
        self.vectorizer = CountVectorizer()

        train_bow_embeddings = self.vectorizer.fit_transform(text_for_bow)

        # if the user is passing custom embeddings we don't need to create the embeddings using the model

        if custom_embeddings is None:
            train_contextualized_embeddings = bert_embeddings_from_list(
                text_for_contextual, sbert_model_to_load=self.contextualized_model, max_seq_length=self.max_seq_length)
        else:
            train_contextualized_embeddings = custom_embeddings
        self.vocab = self.vectorizer.get_feature_names_out()
        self.id2token = dict(enumerate(self.vocab))

        # Explicit None/length test: plain truthiness raises for pandas Series and
        # multi-element numpy arrays.
        if labels is not None and len(labels) > 0:
            self.label_encoder = OneHotEncoder()
            encoded_labels = self.label_encoder.fit_transform(np.array([labels]).reshape(-1, 1))
        else:
            encoded_labels = None
        return CTMDataset(
            X_contextual=train_contextualized_embeddings, X_bow=train_bow_embeddings,
            idx2token=self.id2token, labels=encoded_labels)

    def transform(self, text_for_contextual, text_for_bow=None, custom_embeddings=None, labels=None):
        """
        This method create the input for the prediction. Essentially, it creates the embeddings with the contextualized
        model of choice and with trained vectorizer.

        If text_for_bow is missing, it should be because we are using ZeroShotTM

        :param text_for_contextual: list of unpreprocessed documents to generate the contextualized embeddings
        :param text_for_bow: list of preprocessed documents for creating the bag-of-words
        :param custom_embeddings: np.ndarray type object to use custom embeddings (optional).
        :param labels: list of labels associated with each document (optional).
        """

        if custom_embeddings is not None:
            assert len(text_for_contextual) == len(custom_embeddings)

            if text_for_bow is not None:
                assert len(custom_embeddings) == len(text_for_bow)

        if text_for_bow is not None:
            assert len(text_for_contextual) == len(text_for_bow)

        # A model is only needed when the embeddings still have to be computed,
        # which mirrors the check done in fit().
        if self.contextualized_model is None and custom_embeddings is None:
            raise Exception("You should define a contextualized model if you want to create the embeddings")

        if text_for_bow is not None:
            test_bow_embeddings = self.vectorizer.transform(text_for_bow)
        else:
            # dummy matrix
            if self.show_warning:
                warnings.simplefilter('always', DeprecationWarning)
                warnings.warn(
                    "The method did not have in input the text_for_bow parameter. This IS EXPECTED if you "
                    "are using ZeroShotTM in a cross-lingual setting")

            # we just need an object that is matrix-like so that pytorch does not complain
            test_bow_embeddings = scipy.sparse.csr_matrix(np.zeros((len(text_for_contextual), 1)))

        if custom_embeddings is None:
            test_contextualized_embeddings = bert_embeddings_from_list(
                text_for_contextual, sbert_model_to_load=self.contextualized_model, max_seq_length=self.max_seq_length)
        else:
            test_contextualized_embeddings = custom_embeddings

        # Same explicit test as in fit(); the encoder fitted there is reused.
        if labels is not None and len(labels) > 0:
            encoded_labels = self.label_encoder.transform(np.array([labels]).reshape(-1, 1))
        else:
            encoded_labels = None

        return CTMDataset(X_contextual=test_contextualized_embeddings, X_bow=test_bow_embeddings,
                          idx2token=self.id2token, labels=encoded_labels)


# Modeling

In [10]:
# Strip surrounding whitespace from every raw review (training sample and test set).
documents = [line.strip() for line in review_body]
test_documents = [line.strip() for line in test_review_body]

# Preprocess the training reviews; retained_indices are the positions (in `documents`)
# of the reviews that survived the filtering.
sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()
# Star ratings of the retained training reviews only.
labels = temp_train['stars'][retained_indices]

In [None]:
def test_ready(data):
    """Preprocess the reviews of ``data`` and return them with their matching labels.

    :param data: testing dataset; a DataFrame with ``review_body`` and ``stars`` columns
    :return: tuple ``(preprocessed_documents, labels)`` - the documents after the
        preprocessing steps and the ``stars`` values of exactly those documents
    """
    documents = [line.strip() for line in data['review_body']]
    sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
    preprocessed_documents, _, _, retained_indices = sp.preprocess()
    # retained_indices are positions in `documents`, i.e. in `data`, so the labels
    # must come from `data` as well. Reading them from the separately shuffled
    # temp_test would pair every document with another review's rating.
    labels = data['stars'].iloc[retained_indices]
    return preprocessed_documents, labels

test_preprocessed_documents, test_labels = test_ready(test)

## BERTopic

In [137]:
# BERTopic picks its default embedding model from `language`. Without an explicit
# value it stays on the English default, so the all-languages run has to ask for
# the multilingual model explicitly.
bert_topic_model = BERTopic(
    language='english' if language == 'en' else 'multilingual',
    top_n_words=num_topics_word,
    nr_topics=num_topics,
).fit(documents + test_documents)

In [138]:
# One {word: score} dict per topic that BERTopic actually produced, ordered by topic id
# (the outlier topic -1, when present, comes first). Iterating over the existing ids
# avoids calling get_topic() with an id that does not exist, which does not return a
# list of pairs.
bert_topic_loglik = [dict(word_scores) for _, word_scores in sorted(bert_topic_model.get_topics().items())]
bert_topic_topics = [list(i.keys()) for i in bert_topic_loglik]

In [139]:
len(bert_topic_topics[0])

700

## CTM

In [None]:
# Build the CTM training set: contextual embeddings come from the raw retained reviews,
# the bag-of-words from their preprocessed counterparts.
tp = TopicModelDataPreparation(embedding_model)
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

In [None]:
# Silences every warning from this point on (process-wide filter).
warnings.filterwarnings('ignore')

# ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_topics, num_epochs=num_epochs_ctm) if is_combined else ZeroShotTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_topics, num_epochs=num_epochs_ctm)
# NOTE(review): contextual_size=768 is hard-coded; it presumably has to match the output
# size of `embedding_model` - confirm before switching models.
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=num_topics, num_epochs=num_epochs_ctm) # testing something out
ctm.fit(training_dataset)

Epoch: [21/50]	 Seen Samples: [1667967/3971350]	Train Loss: 126.76303104005511	Time: 0:02:18.294483: : 21it [45:21, 129.36s/it]

In [None]:
# Word lists for every CTM topic, num_topics_word words each.
ctm_topics = ctm.get_topic_lists(num_topics_word)
# NOTE: this rebinds `documents` (the raw stripped reviews until now) to the preprocessed,
# filtered documents, so cells run afterwards see fewer rows than temp_train has.
documents = preprocessed_documents
labels = temp_train['stars'][retained_indices]

In [None]:
ctm_topics

## LDA (GS)

In [None]:
# Tokenise the preprocessed documents on whitespace, build the gensim token dictionary
# and convert every document to its bag-of-words representation.
split_preprocessed_documents = [d.split() for d in preprocessed_documents]
dictionary = Dictionary(split_preprocessed_documents)
corpus = [dictionary.doc2bow(text) for text in split_preprocessed_documents]

# No id2word is passed here; word ids are translated back through `dictionary` when the
# topic words are read out.
lda = LdaModel(corpus, num_topics=num_topics, iterations=500, random_state=42, minimum_probability = 0)

In [None]:
def get_topics_lda(topk=10):
    """Return the top words of every gensim LDA topic.

    Reads the module-level ``lda``, ``dictionary`` and ``num_topics``.

    :param topk: number of terms requested per topic
    :return: list with one list of words per topic, in topic-id order
    """
    return [
        [dictionary[word_id] for word_id, _ in lda.get_topic_terms(topic_id, topk)]
        for topic_id in range(num_topics)
    ]

lda_topics = get_topics_lda(num_topics_word)

In [None]:
lda_topics

## Mallet LDA

In [39]:
# the actual code for Mallet LDA is run in the terminal using the following 2 commands:

# ./dlatkInterface.py -d arnav -t amazon_eng_train_ten -c message_id --add_ngrams -n 1
# ./dlatkInterface.py -d arnav -t amazon_eng_train_ten -c message_id -f 'feat$1gram$amazon_eng_train_ten$message_id' --estimate_lda_topics --lda_lexicon_name mtm_50 --mallet_path /home/sharath/mallet-2.0.8/bin/mallet --num_stopwords 100 --num_topics 50 --language en --save_lda_files amazon_eng

# Parse the DLATK/Mallet export by hand into one list of fields per line.
lda_loglik = []
with open("dlatk/amazon_eng/lda.topicGivenWord.csv") as f:
  for line in f:
    # Splitting on double quotes puts the quoted parts at the odd positions.
    temp = line.strip().split('"')
    if len(temp) > 1:
      for i in range(len(temp)):
        if i % 2 != 0:
          # Remove commas inside a quoted field so the comma split below keeps it in one piece.
          temp[i] = ("".join(temp[i].split(",")))
      temp = ["".join(temp)]
    # Split into fields and drop the first one.
    lda_loglik.append(temp[0].split(",")[1:])

# Drop the first parsed line (presumably the CSV header - confirm against the file).
lda_loglik = lda_loglik[1:]

# Fields at even positions are kept as the topic's words
# (assumes each row alternates word, weight - TODO confirm against the DLATK output format).
mallet_lda_topics = [[x for i, x in enumerate(j) if i % 2 == 0] for j in lda_loglik]

# Pair every even-position field with the following field, converted to float: {word: weight}.
mallet_lda_loglik = []
for k in lda_loglik:
    mallet_lda_loglik.append({k[j]: float(k[j+1]) for j in range(0, len(k) - 1, 2)})

# Evaluating

In [140]:
def preprocess(documents, topics, weights):
    """Turn documents into a (document x topic) matrix of length-normalised word weights.

    :param documents: list of documents (each element is a string) that the topics were extracted from
    :param topics: list of list of topic words from the model of choice; only its length
        is used, to size the columns of the result
    :param weights: list of dictionary mappings (word: weight), one per topic
    :return: topic distribution, a numpy array of shape (len(documents), len(topics))
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)  # removes periods, commas, etc
    distribution = np.zeros((len(documents), len(topics)))

    for row, raw_text in enumerate(documents):
        # Split on single spaces, drop empty pieces, lower-case (all topic words are lower case).
        tokens = [token.lower() for token in raw_text.translate(strip_punctuation).split(' ') if token]

        for col, word_weights in enumerate(weights):
            # A word contributes its weight when the topic lists it, otherwise 0.
            distribution[row][col] = np.sum(
                [word_weights[token] if token in word_weights else 0 for token in tokens])

        # Normalise by document length; the +2 offset is kept exactly as in the original code.
        distribution[row] /= (len(tokens) + 2)

    return distribution


def train_regression_model(X, y):
    """Fit a linear regression on the given features.

    :param X: distribution from preprocess()
    :param y: stars ('labels')
    :return: the fitted LinearRegression model
    """
    # model = LogisticRegression()
    return LinearRegression().fit(X, y)


def evaluate_model(model, X_test, y_test):
    """Score a fitted regression model on one data set.

    :param model: model trained from train_regression_model()
    :param X_test: distribution you want to make predictions on
    :param y_test: the true labels for the X_test passed in
    :return: tuple ``(mse, rmse, r2, mae)`` of the (X_test, y_test) data inputted
    """
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    # RMSE is derived from the MSE instead of passing ``squared=False``: that keyword
    # is deprecated and later removed in newer scikit-learn releases.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true=y_test, y_pred=y_pred)
    mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

    return mse, rmse, r2, mae


def results(topics, weights=None, flag=False):
    """Fit a linear regression on topic features and report train/test metrics.

    Reads the module-level ``review_body``, ``test_review_body``, ``temp_train`` and
    ``temp_test``; prints the metrics and returns them.

    :param topics: list of list of topic words
    :param weights: list of dictionary mappings (word: weight), one per topic. When
        omitted, every listed topic word gets weight 1.0.
    :param flag: True if baseline (default is False). The baseline ignores the topics
        and uses a single constant feature.
    :return: tuple ``(r2_train, mae_train, mse_train, rmse_train,
        r2_test, mae_test, mse_test, rmse_test)``
    """
    # Older cells call results(topics, True) for the baseline; treat a boolean in the
    # weights position as the flag it was meant to be.
    if isinstance(weights, bool):
        flag, weights = weights, None

    y = temp_train['stars']
    y_test = temp_test['stars']

    if flag:
        # One constant column, sized to the targets actually used for fitting and
        # scoring so the row counts always agree. A constant feature carries no
        # information, so the exact value does not matter; the training mean is
        # used for both sets to keep test labels out of the features.
        X = np.full((len(y), 1), y.mean())
        X_test = np.full((len(y_test), 1), y.mean())
    else:
        if weights is None:
            weights = [dict.fromkeys(topic, 1.0) for topic in topics]
        X = preprocess([line.strip() for line in review_body], topics, weights)
        X_test = preprocess([line.strip() for line in test_review_body], topics, weights)

    model = train_regression_model(X, y)

    mse_test, rmse_test, r2_test, mae_test = evaluate_model(model, X_test, y_test)
    mse_train, rmse_train, r2_train, mae_train = evaluate_model(model, X, y)

    print("Train")
    print("R2:", r2_train)
    print("MAE:", mae_train)
    print("MSE:", mse_train)
    print("RMSE:", rmse_train)

    print()

    print("Test")
    print("R2:", r2_test)
    print("MAE:", mae_test)
    print("MSE:", mse_test)
    print("RMSE:", rmse_test)

    return r2_train, mae_train, mse_train, rmse_train, r2_test, mae_test, mse_test, rmse_test

## Mallet LDA

In [128]:
mallet_r2_train, mallet_mae_train, mallet_mse_train, mallet_rmse_train, mallet_r2_test, mallet_mae_test, mallet_mse_test, mallet_rmse_test = results(mallet_lda_topics, mallet_lda_loglik)

Train
R2: 0.28878903518138765
MAE: 0.9945251291336966
MSE: 1.4338984404898703
RMSE: 1.1974549847446752

Test
R2: 0.2709251968971472
MAE: 1.0147344275351098
MSE: 1.4581496062057056
RMSE: 1.2075386561951984


In [69]:
pd.concat([pd.Series([mallet_r2_train, mallet_mae_train, mallet_mse_train, mallet_rmse_train, mallet_r2_test, mallet_mae_test, mallet_mse_test, mallet_rmse_test])], axis = 1).T

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.288789,0.994525,1.433898,1.197455,0.270925,1.014734,1.45815,1.207539


## Baseline

In [None]:
# The baseline ignores topic weights; pass the flag by keyword so it is not taken as `weights`.
b_r2_train, b_mae_train, b_mse_train, b_rmse_train, b_r2_test, b_mae_test, b_mse_test, b_rmse_test = results(lda_topics, None, flag=True)

## LDA (GS)

In [None]:
# results() needs one {word: weight} dict per topic; take the probabilities from the
# same gensim call that produced lda_topics.
lda_topic_weights = [
    {dictionary[word_id]: float(prob) for word_id, prob in lda.get_topic_terms(topic_id, num_topics_word)}
    for topic_id in range(num_topics)
]
lda_r2_train, lda_mae_train, lda_mse_train, lda_rmse_train, lda_r2_test, lda_mae_test, lda_mse_test, lda_rmse_test = results(lda_topics, lda_topic_weights)

In [None]:
pd.concat([pd.Series([lda_r2_train, lda_mae_train, lda_mse_train, lda_rmse_train, lda_r2_test, lda_mae_test, lda_mse_test, lda_rmse_test])], axis = 1).T

## CTM

In [None]:
# results() needs one {word: weight} dict per topic. Only the word lists are available
# for CTM here, so every listed word gets weight 1.0.
# TODO: replace with the model's per-word topic weights once they are extracted.
ctm_topic_weights = [dict.fromkeys(topic, 1.0) for topic in ctm_topics]
ctm_r2_train, ctm_mae_train, ctm_mse_train, ctm_rmse_train, ctm_r2_test, ctm_mae_test, ctm_mse_test, ctm_rmse_test = results(ctm_topics, ctm_topic_weights)

In [None]:
pd.concat([pd.Series([ctm_r2_train, ctm_mae_train, ctm_mse_train, ctm_rmse_train, ctm_r2_test, ctm_mae_test, ctm_mse_test, ctm_rmse_test])], axis = 1).T

Using paraphrase-multilingual-mpnet-base-v2, 200 topics, 50 epochs (best one so far):

* MSE: 1.0463393882784946
* RMSE: 1.0229073214512128

## BERTopic

In [141]:
bert_r2_train, bert_mae_train, bert_mse_train, bert_rmse_train, bert_r2_test, bert_mae_test, bert_mse_test, bert_rmse_test = results(bert_topic_topics, bert_topic_loglik)

Train
R2: 0.2930184852329968
MAE: 0.9970628795033796
MSE: 1.4253712915381116
RMSE: 1.1938891454143101

Test
R2: 0.2929389647548669
MAE: 0.9925682984577142
MSE: 1.4141220704902662
RMSE: 1.1891686467823923


In [142]:
pd.concat([pd.Series([bert_r2_train, bert_mae_train, bert_mse_train, bert_rmse_train, bert_r2_test, bert_mae_test, bert_mse_test, bert_rmse_test])], axis = 1).T

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.293018,0.997063,1.425371,1.193889,0.292939,0.992568,1.414122,1.189169


# Printing out topics and LR coefficients

In [43]:
def topic_LR_coef(model, topics):
    """Put every topic's words next to its regression coefficient.

    :param model: fitted model exposing ``coef_``
    :param topics: list of list of topic words
    :return: DataFrame whose column 0 holds each topic's words joined with ", " and
        whose column 1 holds the values of ``model.coef_``
    """
    joined_topics = pd.Series([", ".join(topic) for topic in topics])
    coefficients = pd.Series(model.coef_)
    return pd.concat([joined_topics, coefficients], axis=1)

In [143]:
# lda_model = train_regression_model(preprocess(documents, lda_topics), labels)
# ctm_model = train_regression_model(preprocess(documents, ctm_topics), labels)
# mallet_lda_model = train_regression_model(preprocess(documents, mallet_lda_topics, mallet_lda_loglik), temp_train['stars'])
# Build the features from review_body (one row per temp_train row) rather than from the
# notebook-global `documents`, which the CTM cell rebinds to the filtered documents and
# which then no longer lines up with temp_train['stars'].
bert_model = train_regression_model(preprocess([line.strip() for line in review_body], bert_topic_topics, bert_topic_loglik), temp_train['stars'])

In [144]:
# lda_LR_coef = topic_LR_coef(lda_model, lda_topics)
# ctm_LR_coef = topic_LR_coef(ctm_model, ctm_topics)
# mallet_LR_coef = topic_LR_coef(mallet_lda_model, mallet_lda_topics)
bert_LR_coef = topic_LR_coef(bert_model, bert_topic_topics)

In [None]:
# Print the word lists (column 0) of topics 0-24.
# NOTE(review): mallet_LR_coef is only assigned in a commented-out line of an earlier
# cell, so this needs that line re-enabled first.
for i in range(0, 25):
  print(mallet_LR_coef[0][i])    

In [None]:
# Print the word lists (column 0) of topics 25-49.
# NOTE(review): mallet_LR_coef is only assigned in a commented-out line of an earlier
# cell, so this needs that line re-enabled first.
for i in range(25, 50):
  print(mallet_LR_coef[0][i])   

In [None]:
# lda_LR_coef

In [None]:
# ctm_LR_coef

In [145]:
for i in range(0, 25):
  print(bert_LR_coef[0][i])   

the, it, to, and, for, is, not, this, of, in, but, my, was, on, that, with, have, they, so, you, very, as, product, one, are, be, use, great, them, like, had, when, would, just, all, good, out, if, can, these, up, at, will, well, or, get, after, no, work, too, only, off, from, me, what, time, do, don, than, bought, were, because, quality, fit, easy, used, we, works, more, really, case, little, does, phone, did, small, love, didn, has, back, there, your, got, an, other, size, nice, even, doesn, made, about, came, box, much, buy, also, first, put, now, which, still, looks, two, some, better, price, using, way, am, item, how, need, could, ordered, received, over, return, bag, ve, enough, water, perfect, been, look, worked, disappointed, charge, however, broke, money, then, arrived, day, pretty, again, bit, long, by, screen, make, sure, into, cute, any, months, right, big, its, while, new, battery, last, see, order, fine, keep, thing, recommend, before, plastic, set, side, few, never, shou

In [146]:
for i in range(25, 50):
  print(bert_LR_coef[0][i])   

hose, filter, pump, water, tank, the, filters, suction, to, it, faucet, and, is, of, this, not, that, tube, in, but, one, with, was, be, have, leak, leaks, so, on, for, works, we, has, amazon, as, my, you, gallon, or, product, out, after, like, good, pressure, when, air, do, at, had, would, more, use, up, months, well, replacement, time, great, from, fish, hoses, than, off, only, flow, bought, quality, no, very, part, what, if, top, work, doesn, first, psi, just, new, sink, can, still, used, because, does, working, will, these, return, could, our, pipe, other, gauge, about, worked, over, even, tubes, small, which, company, they, tried, carbon, last, replace, nozzle, are, need, now, customer, piece, kink, brita, head, unit, cup, came, did, change, two, perfectly, shower, brand, much, bottom, hard, there, little, make, leaking, all, leaked, cups, garden, both, easy, made, also, buy, several, days, item, point, gauges, end, don, by, too, who, purchased, attached, plastic, strainer, tanks,

In [147]:
bert_LR_coef

Unnamed: 0,0,1
0,"the, it, to, and, for, is, not, this, of, in, ...",-211.408904
1,"book, story, read, of, the, this, and, was, ch...",184.106869
2,"never, quality, received, product, price, item...",-75.775534
3,"dress, size, small, shirt, it, the, but, and, ...",-63.615768
4,"work, works, it, working, to, not, the, worked...",-215.211418
5,"pillow, very, soft, it, sheets, and, not, the,...",-187.852159
6,"ring, loves, my, for, it, gift, and, she, the,...",201.001476
7,"they, these, shoes, are, them, size, shoe, fee...",-62.124271
8,"taste, smell, flavor, scent, it, like, the, sm...",9.228646
9,"hair, skin, my, it, this, and, to, wig, produc...",-90.855721
