# Installations

In [114]:
# !pip install datasets
# !pip install pyldavis
# !pip install bertopic

import nltk
import pandas as pd
from nltk.corpus import stopwords as stop_words
from datasets import load_dataset
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import string
from gensim.utils import deaccent
import warnings
from sentence_transformers import SentenceTransformer
import scipy.sparse
from sklearn.preprocessing import OneHotEncoder
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from bertopic import BERTopic

# Configuration Variables

In [115]:
# Switch between the multilingual run (True) and the English-only run (False).
is_multi = False
# "Combined" is simply the opposite of the multilingual setting.
is_combined = not is_multi
perc = 0.10 # fraction of the training split sampled by make_review_body (0.10 = 10%)

# Make sure the NLTK stopword corpus is available locally before reading it below.
nltk.download('stopwords')

# Dataset configuration name passed to load_dataset.
language = 'en' if is_combined else 'all_languages'
# English stopwords for the English run; otherwise the union of every NLTK stopword file.
stopwords = list(stop_words.words('english')) if is_combined else list(stop_words.words(stop_words.fileids())) # from every language
# NOTE(review): embedding_model is not referenced by any later cell in the visible notebook — confirm it is still needed.
embedding_model = 'paraphrase-distilroberta-base-v2' if is_combined else 'paraphrase-multilingual-mpnet-base-v2'

# NOTE(review): num_epochs_ctm is not referenced by any later cell in the visible notebook.
num_epochs_ctm = 20
# Passed to BERTopic as nr_topics; also the number of topic ids read back with get_topic.
num_topics = 400
# Passed to BERTopic as top_n_words (words kept per topic).
num_topics_word = 500

[nltk_data] Downloading package stopwords to /Users/Arnav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Dataset

In [116]:
# Load the 'amazon_reviews_multi' dataset for the configured subset ('en' or 'all_languages').
dataset = load_dataset('amazon_reviews_multi', language)

Found cached dataset amazon_reviews_multi (/Users/Arnav/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


  0%|          | 0/3 [00:00<?, ?it/s]

In [117]:
def data(part):
  """
  Return one split of the global `dataset` as a DataFrame together with its review texts.

  :param part: key of the split inside `dataset` (used below with 'train', 'test' and 'validation')
  :return: tuple (DataFrame of the split, list of the values of its 'review_body' column)
  """
  frame = dataset[part].to_pandas()
  texts = list(frame['review_body'])
  return frame, texts

# Materialise each split as a DataFrame plus the list of its raw review texts.
# `review_body` (full training texts) is reassigned further down by make_review_body(train, perc).
train, review_body = data('train')
test, review_body_test = data('test')
validation, review_body_validation = data('validation')

def make_review_body(data, perc):
  """
  Draw a reproducible random sample of `data` and return it with its review texts.

  Row positions are shuffled with a fixed seed (random_state=42) and the first
  int(len(data) * perc) of them are kept, so perc=1 returns every row in shuffled order.

  :param data: DataFrame with a 'review_body' column
  :param perc: fraction of rows to keep
  :return: tuple (sampled DataFrame with a fresh 0..n-1 index and 'review_body' renamed
           to 'message', list of the sampled review texts in the same order)
  """
  sample_size = int(len(data) * perc)
  positions = sklearn.utils.shuffle(np.arange(len(data)), random_state=42)[0:sample_size]
  sampled = data.iloc[positions].reset_index(drop=True)
  texts = list(sampled['review_body'])
  # The 'index' -> 'message_id' entry only has an effect if `data` itself carries a column
  # named 'index'; reset_index(drop=True) above does not create one.
  sampled = sampled.rename(columns={'index': 'message_id', 'review_body': 'message'})
  return sampled, texts

# Keep a `perc` fraction of the training split; test and validation are kept whole
# (perc=1), only shuffled with the fixed seed used inside make_review_body.
temp_train, review_body = make_review_body(train, perc)
temp_test, test_review_body = make_review_body(test, 1)
temp_validation, validation_review_body = make_review_body(validation, 1)

# make_review_body renamed 'review_body' to 'message'; put a 'review_body' column back on
# the test frame. `test` and `temp_test` are two names for the same DataFrame object, so a
# single assignment is visible through both.
test = temp_test
temp_test['review_body'] = test_review_body

# Preprocessing


In [118]:
class WhiteSpacePreprocessingStopwords():
    """
    Simple whitespace-based preprocessing: documents are lower-cased and de-accented,
    punctuation (and optionally digits) is replaced by spaces, stopwords are removed and
    only the most frequent vocabulary terms are kept.
    """

    def __init__(self, documents, stopwords_list=None, vocabulary_size=2000, max_df=1.0, min_words=1,
                 remove_numbers=True):
        """

        :param documents: list of strings
        :param stopwords_list: list of the stopwords to remove
        :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents
        :param max_df : float or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float in range [0.0, 1.0], the parameter represents a proportion of
        documents, integer absolute counts.
        This parameter is ignored if vocabulary is not None.
        :param min_words: int, default=1. Documents with less words than the parameter
        will be removed
        :param remove_numbers: bool, default=True. If true, numbers are removed from docs
        """
        self.documents = documents
        # Always a set: constant-time membership tests in preprocess() and a single attribute type.
        self.stopwords = set(stopwords_list) if stopwords_list is not None else set()

        self.vocabulary_size = vocabulary_size
        self.max_df = max_df
        self.min_words = min_words
        self.remove_numbers = remove_numbers

    def preprocess(self):
        """
        Note that if after filtering some documents do not contain enough words we remove them. That is why we
        return also the list of unpreprocessed documents and the positions of the documents that were kept.

        :return: tuple of (preprocessed documents, matching unpreprocessed documents, vocabulary list,
                 indices into ``self.documents`` of the retained documents)
        """
        # Translation tables are loop-invariant, so build them once instead of once per document.
        punctuation_table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        digit_table = str.maketrans("0123456789", ' ' * len("0123456789"))

        cleaned_docs = []
        for doc in self.documents:
            doc = deaccent(doc.lower()).translate(punctuation_table)
            if self.remove_numbers:
                doc = doc.translate(digit_table)
            cleaned_docs.append(' '.join(w for w in doc.split() if w not in self.stopwords))

        # Only the learned vocabulary is needed, so fitting is enough (no document-term matrix).
        vectorizer = CountVectorizer(max_features=self.vocabulary_size, max_df=self.max_df)
        vectorizer.fit(cleaned_docs)
        temp_vocabulary = set(vectorizer.get_feature_names_out())

        cleaned_docs = [' '.join(w for w in doc.split() if w in temp_vocabulary)
                        for doc in cleaned_docs]

        preprocessed_docs, unpreprocessed_docs, retained_indices = [], [], []
        for i, doc in enumerate(cleaned_docs):
            # min_words is documented as a number of words; the previous check compared it with
            # the character length of the document. Results are unchanged for the default min_words=1.
            word_count = len(doc.split())
            if word_count > 0 and word_count >= self.min_words:
                preprocessed_docs.append(doc)
                unpreprocessed_docs.append(self.documents[i])
                retained_indices.append(i)

        vocabulary = list(set([item for doc in preprocessed_docs for item in doc.split()]))

        return preprocessed_docs, unpreprocessed_docs, vocabulary, retained_indices


# TopicModelDataPreparation

In [119]:
def get_bag_of_words(data, min_length):
    """
    Creates the bag of words.

    For every row of `data` the entries that are not None are kept; rows whose remaining
    entries sum to zero are skipped. Each kept row is turned into occurrence counts with
    np.bincount, padded to at least `min_length` bins.

    :param data: iterable of numpy arrays of integer ids (None entries are ignored)
    :param min_length: minimum number of bins per row
    :return: scipy.sparse.csr_matrix with one row of counts per kept input row
    """
    none_marker = np.array(None)
    count_rows = []
    for ids in data:
        present = ids[ids != none_marker]
        if np.sum(present) == 0:
            continue
        count_rows.append(np.bincount(present.astype('int'), minlength=min_length))

    return scipy.sparse.csr_matrix(count_rows)


def bert_embeddings_from_file(text_file, sbert_model_to_load, batch_size=200, max_seq_length=None):
    """
    Creates SBERT Embeddings from an input file, assumes one document per line.

    :param text_file: path of a UTF-8 text file
    :param sbert_model_to_load: model name or path handed to SentenceTransformer
    :param batch_size: batch size forwarded to the encoder
    :param max_seq_length: if not None, assigned to the model's max_seq_length before encoding
    :return: numpy array built from the encoder output
    """
    encoder = SentenceTransformer(sbert_model_to_load)
    if max_seq_length is not None:
        encoder.max_seq_length = max_seq_length

    # readlines() keeps the trailing newline of each line, exactly as before.
    with open(text_file, encoding="utf-8") as handle:
        lines = handle.readlines()

    check_max_local_length(max_seq_length, lines)

    embeddings = encoder.encode(lines, show_progress_bar=True, batch_size=batch_size)
    return np.array(embeddings)


def bert_embeddings_from_list(texts, sbert_model_to_load, batch_size=200, max_seq_length=None):
    """
    Creates SBERT Embeddings from a list.

    :param texts: list of documents (strings) to encode
    :param sbert_model_to_load: model name or path handed to SentenceTransformer
    :param batch_size: batch size forwarded to the encoder
    :param max_seq_length: if not None, assigned to the model's max_seq_length before encoding
    :return: numpy array built from the encoder output
    """
    encoder = SentenceTransformer(sbert_model_to_load)
    if max_seq_length is not None:
        encoder.max_seq_length = max_seq_length

    check_max_local_length(max_seq_length, texts)

    embeddings = encoder.encode(texts, show_progress_bar=True, batch_size=batch_size)
    return np.array(embeddings)


def check_max_local_length(max_seq_length, texts):
    """
    Warn when the longest document has more whitespace-separated words than ``max_seq_length``.

    :param max_seq_length: int limit, or None when no limit is configured (then nothing is checked)
    :param texts: iterable of strings; may be empty
    :return: None
    """
    # The embedding helpers default max_seq_length to None; comparing an int with None raises
    # TypeError, so there is nothing to check in that case.
    if max_seq_length is None:
        return

    # default=0 keeps an empty collection from raising.
    max_local_length = max((len(t.split()) for t in texts), default=0)
    if max_local_length > max_seq_length:
        # Kept from the original code. The filter targets DeprecationWarning, while warn() below
        # is called without a category and therefore emits a UserWarning.
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(f"the longest document in your collection has {max_local_length} words, the model instead "
                      f"truncates to {max_seq_length} tokens.")


class TopicModelDataPreparation:
    """
    Prepares the inputs of a contextualized topic model: a bag-of-words matrix built from
    preprocessed text and contextualized embeddings, either computed with a SentenceTransformer
    model or supplied by the caller.

    NOTE(review): load/fit/transform all return a ``CTMDataset``, but that name is neither defined
    nor imported in the visible part of this notebook. It has to be brought into scope before
    these methods are called, otherwise they end in a NameError.
    """

    def __init__(self, contextualized_model=None, show_warning=True, max_seq_length=128):
        """
        :param contextualized_model: SentenceTransformer model name/path used to embed documents (optional
                                     when custom embeddings are always supplied)
        :param show_warning: if True, transform() warns when it is called without text_for_bow
        :param max_seq_length: maximum sequence length forwarded to the embedding helper
        """
        self.contextualized_model = contextualized_model
        self.vocab = []
        self.id2token = {}
        self.vectorizer = None
        self.label_encoder = None
        self.show_warning = show_warning
        self.max_seq_length = max_seq_length

    def load(self, contextualized_embeddings, bow_embeddings, id2token, labels=None):
        """
        Wrap already computed embeddings, bag-of-words matrix and vocabulary mapping in a CTMDataset.
        """
        return CTMDataset(
            X_contextual=contextualized_embeddings, X_bow=bow_embeddings, idx2token=id2token, labels=labels)

    def fit(self, text_for_contextual, text_for_bow, labels=None, custom_embeddings=None):
        """
        This method fits the vectorizer and gets the embeddings from the contextual model

        :param text_for_contextual: list of unpreprocessed documents to generate the contextualized embeddings
        :param text_for_bow: list of preprocessed documents for creating the bag-of-words
        :param custom_embeddings: np.ndarray type object to use custom embeddings (optional).
        :param labels: list (or array-like) of labels associated with each document (optional).
        :return: CTMDataset holding embeddings, bag-of-words, vocabulary mapping and encoded labels
        :raises TypeError: if custom_embeddings is given but is not a numpy.ndarray
        :raises Exception: if neither a contextualized model nor custom embeddings are available
        """

        if custom_embeddings is not None:
            assert len(text_for_contextual) == len(custom_embeddings)

            if text_for_bow is not None:
                assert len(custom_embeddings) == len(text_for_bow)

            # isinstance instead of inspecting type(...).__module__, which depends on where numpy
            # happens to define the array class.
            if not isinstance(custom_embeddings, np.ndarray):
                raise TypeError("contextualized_embeddings must be a numpy.ndarray type object")

        if text_for_bow is not None:
            assert len(text_for_contextual) == len(text_for_bow)

        if self.contextualized_model is None and custom_embeddings is None:
            raise Exception("A contextualized model or contextualized embeddings must be defined")

        # TODO: this count vectorizer removes tokens that have len = 1, might be unexpected for the users
        self.vectorizer = CountVectorizer()

        train_bow_embeddings = self.vectorizer.fit_transform(text_for_bow)

        # if the user is passing custom embeddings we don't need to create the embeddings using the model

        if custom_embeddings is None:
            train_contextualized_embeddings = bert_embeddings_from_list(
                text_for_contextual, sbert_model_to_load=self.contextualized_model, max_seq_length=self.max_seq_length)
        else:
            train_contextualized_embeddings = custom_embeddings
        self.vocab = self.vectorizer.get_feature_names_out()
        self.id2token = {k: v for k, v in zip(range(0, len(self.vocab)), self.vocab)}

        # `if labels:` raises for numpy arrays / pandas Series (ambiguous truth value); an explicit
        # None-and-length test treats None and empty containers the same way as before.
        if labels is not None and len(labels) > 0:
            self.label_encoder = OneHotEncoder()
            encoded_labels = self.label_encoder.fit_transform(np.array([labels]).reshape(-1, 1))
        else:
            encoded_labels = None
        return CTMDataset(
            X_contextual=train_contextualized_embeddings, X_bow=train_bow_embeddings,
            idx2token=self.id2token, labels=encoded_labels)

    def transform(self, text_for_contextual, text_for_bow=None, custom_embeddings=None, labels=None):
        """
        This method create the input for the prediction. Essentially, it creates the embeddings with the contextualized
        model of choice and with trained vectorizer.

        If text_for_bow is missing, it should be because we are using ZeroShotTM

        :param text_for_contextual: list of unpreprocessed documents to generate the contextualized embeddings
        :param text_for_bow: list of preprocessed documents for creating the bag-of-words
        :param custom_embeddings: np.ndarray type object to use custom embeddings (optional).
        :param labels: list (or array-like) of labels associated with each document (optional).
        :return: CTMDataset holding embeddings, bag-of-words, vocabulary mapping and encoded labels
        :raises Exception: if neither a contextualized model nor custom embeddings are available
        """

        if custom_embeddings is not None:
            assert len(text_for_contextual) == len(custom_embeddings)

            if text_for_bow is not None:
                assert len(custom_embeddings) == len(text_for_bow)

        if text_for_bow is not None:
            assert len(text_for_contextual) == len(text_for_bow)

        # A model is only required when the embeddings have to be computed here; this mirrors the
        # check in fit(), which accepts custom embeddings without a model.
        if self.contextualized_model is None and custom_embeddings is None:
            raise Exception("You should define a contextualized model if you want to create the embeddings")

        if text_for_bow is not None:
            test_bow_embeddings = self.vectorizer.transform(text_for_bow)
        else:
            # dummy matrix
            if self.show_warning:
                warnings.simplefilter('always', DeprecationWarning)
                warnings.warn(
                    "The method did not have in input the text_for_bow parameter. This IS EXPECTED if you "
                    "are using ZeroShotTM in a cross-lingual setting")

            # we just need an object that is matrix-like so that pytorch does not complain
            test_bow_embeddings = scipy.sparse.csr_matrix(np.zeros((len(text_for_contextual), 1)))

        if custom_embeddings is None:
            test_contextualized_embeddings = bert_embeddings_from_list(
                text_for_contextual, sbert_model_to_load=self.contextualized_model, max_seq_length=self.max_seq_length)
        else:
            test_contextualized_embeddings = custom_embeddings

        # Same array-safe emptiness test as in fit(); uses the encoder fitted there.
        if labels is not None and len(labels) > 0:
            encoded_labels = self.label_encoder.transform(np.array([labels]).reshape(-1, 1))
        else:
            encoded_labels = None

        return CTMDataset(X_contextual=test_contextualized_embeddings, X_bow=test_bow_embeddings,
                          idx2token=self.id2token, labels=encoded_labels)


# Modeling

In [120]:
# Corpus for the topic model: the sampled training reviews followed by every test review,
# each stripped of leading/trailing whitespace.
documents = [text.strip() for text in review_body + test_review_body]
test_documents = [text.strip() for text in test_review_body]

sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

# Star ratings in the same order as `documents` (train rows, then test rows), restricted to
# the documents that survived preprocessing.
labels = pd.concat([temp_train, temp_test], ignore_index=True)['stars'][retained_indices]
temp_test.reset_index(drop=True, inplace=True)

In [121]:
# PARAMS
# data: testing dataset

# RETURN
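# unpreprocessed_corpus: original text of the documents that survived preprocessing
# preprocessed_documents: the same documents after the preprocessing steps
# labels: the labels corresponding to the retained documents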

def test_ready(data):
  """
  Preprocess the reviews of `data` and return them with their aligned star labels.

  :param data: DataFrame with a 'review_body' column (texts) and a 'stars' column (labels)
  :return: tuple (unpreprocessed documents that survived preprocessing,
           their preprocessed versions, the matching 'stars' values)
  """
  documents = [line.strip() for line in data['review_body']]
  sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords)
  preprocessed_documents, unpreprocessed_corpus, _, retained_indices = sp.preprocess()
  # Take the labels from the frame that was passed in (previously the global temp_test was read
  # regardless of the argument). retained_indices are positions into `documents`, hence .iloc.
  labels = data['stars'].iloc[retained_indices]
  return unpreprocessed_corpus, preprocessed_documents, labels

# Preprocess the test reviews on their own and collect the star labels of the retained ones.
test_unpreprocessed_documents, test_preprocessed_documents, test_labels = test_ready(temp_test)

## BERTopic

In [122]:
# BERTopic picks its default sentence-embedding model from `language`. The multilingual run
# previously left it at the library default (English); 'multilingual' selects a multilingual
# embedding model instead. The English run is configured exactly as before.
# NOTE(review): `documents` already ends with the test reviews, so `documents + test_documents`
# feeds every test review to the model twice — confirm this weighting is intended.
# NOTE(review): no random seed is set here, so fitted topics may differ between runs — confirm.
bert_topic_model = BERTopic(
    language='English' if language == 'en' else 'multilingual',
    top_n_words=num_topics_word,
    nr_topics=num_topics,
).fit(documents + test_documents)

# English takes 4 mins
# Multi takes 53 mins

In [123]:
# Read the stopword file that matches the current configuration, one stopword per line.
mallet_stopwords = []
loc3 = "dlatk/amazon_eng/9f822b_stopwords" if is_combined else "dlatk/amazon_multi/ac0935_stopwords"
with open(loc3) as f:
  mallet_stopwords.extend(line.strip() for line in f)

In [124]:
# One {word: weight} dict per topic id -1 .. num_topics-2, with stopwords removed.
# get_topic() returns False for a topic id the fitted model does not contain; `or []` turns that
# into an empty topic instead of a TypeError, keeping the number of topics fixed at num_topics.
# The set gives constant-time stopword lookups (the list itself is still used below).
_stopword_lookup = set(mallet_stopwords)
bert_topic_loglik = [
    {word: weight
     for (word, weight) in (bert_topic_model.get_topic(topic_id) or [])
     if word not in _stopword_lookup}
    for topic_id in range(-1, num_topics - 1)
]
bert_topic_topics = [list(topic_weights.keys()) for topic_weights in bert_topic_loglik]

# Extra final "topic" made of the stopwords themselves, each with weight 1.0.
bert_topic_loglik.append({word: 1.0 for word in mallet_stopwords})
bert_topic_topics.append(mallet_stopwords)

In [125]:
# Print every topic on its own line, leaving out any word that is in the stopword list.
for sublist in bert_topic_topics:
  kept_words = (word for word in sublist if word not in mallet_stopwords)
  print(" ".join(kept_words))

don box doesn made didn came ordered some light looks worked still now which also better first perfect need item two return phone case am working put day way any charge using order then how over right keep battery could enough ve cute months been money disappointed received again however fine big water into purchased our old amazon thing arrived long around bag though pretty make its sure hard cheap come different hold able broke something by side while look tried new before purchase going wear recommend year bit see set never won package go open think turn since another few super should needed happy top last seems couple large want far times take broken try material re easily unit already usb plastic expected reviews know picture their looking lot part worth problem both definitely down getting days together stay makes plug same clean once he charger screen batteries returned where thought being say lights weeks longer design wish month issue color piece give gift stopped charging thr

# Evaluating

In [126]:
# PARAMS
# documents: list of documents (each element in the list is a string) that the topics were extracted from
# topics: list of list of topics from the model of choice
# weights: list of dictionary mappings (word: weight)

# RETURN
# topic distribution

def preprocess(documents, topics, weights):
    """
    Build a document-by-topic weight matrix.

    Each document has its punctuation deleted, is split on single spaces and lower-cased.
    Entry (i, j) is the sum of weights[j][word] over the words of document i (words missing
    from the dict count as 0), divided by (number of words in the document + 2).

    :param documents: list of strings
    :param topics: list of topics; only its length is used (number of columns)
    :param weights: list of {word: weight} dicts, one per topic
    :return: numpy array of shape (len(documents), len(topics))
    """
    # Punctuation is removed outright rather than replaced by a space, so "don't" becomes "dont".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    distribution = np.zeros((len(documents), len(topics)))

    for row, text in enumerate(documents):
        tokens = [token.lower() for token in text.translate(strip_punctuation).split(' ') if token]

        for col, topic_weights in enumerate(weights):
            distribution[row][col] = np.sum([topic_weights.get(token, 0) for token in tokens])

        # Length normalisation; the origin of the "+ 2" is not explained in the original code.
        distribution[row] /= (len(tokens) + 2)

    return distribution


# PARAMS
# X: distribution from preprocess() above
# y: stars ('labels')

# RETURN
# LR model

def train_regression_model(X, y):
    """
    Fit a LinearRegression on the given features and targets.

    :param X: feature matrix (the topic distribution produced by preprocess())
    :param y: target values (star labels)
    :return: the fitted LinearRegression estimator
    """
    return LinearRegression().fit(X, y)


# PARAMS
# model: model trained from train_regression_model()
# X_test: distribution you want to make predictions on
# y_test: the true labels for the X_test passed in

# RETURN
# MSE: MSE of the (X_test, y_test) data inputted
# RMSE: RMSE of the (X_test, y_test) data inputted
# R2: R2 of the (X_test, y_test) data inputted
# MAE: MAE of the (X_test, y_test) data inputted

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted regression model on the given data.

    :param model: fitted estimator exposing predict()
    :param X_test: feature matrix to predict on
    :param y_test: true labels for X_test
    :return: tuple (MSE, RMSE, R2, MAE)
    """
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    # RMSE is derived from the MSE instead of passing `squared=False`: that keyword is deprecated
    # and later removed in recent scikit-learn releases. For the single-target case used in this
    # notebook the value is the same; NOTE(review): re-check if multi-output targets are ever used.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true=y_test, y_pred=y_pred)
    mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

    return mse, rmse, r2, mae


# PARAMS
# topics: list of list of topics
# weights: list of dictionary mappings (word: weight)
# flag: True to evaluate the constant baseline instead of the topic features (default is False)

# RETURN
# r2_train: R2 on train set
# mae_train: MAE on train set
# mse_train: MSE on train set
# rmse_train: RMSE on train set
# r2_test: R2 on test set
# mae_test: MAE on test set
# mse_test: MSE on test set
# rmse_test: RMSE on test set

def results(topics, weights, flag=False):
  """
  Train a linear regression of star ratings on topic features and report train/test metrics.

  Reads the notebook globals temp_train, temp_test, review_body and test_review_body.

  :param topics: list of list of topic words (forwarded to preprocess())
  :param weights: list of {word: weight} dicts, one per topic (forwarded to preprocess())
  :param flag: if True, ignore the topics and evaluate a constant baseline feature instead
  :return: tuple (r2_train, mae_train, mse_train, rmse_train, r2_test, mae_test, mse_test, rmse_test)
  """
  y = temp_train['stars']
  y_test = temp_test['stars']

  if flag:
    # Constant feature with one row per target. The previous version sized X from `labels`
    # (train + test documents) and X_test from `test_labels`, which do not match the lengths of
    # y / y_test, and it used the test-set mean for X_test. The training mean is used for both.
    baseline_value = y.mean()
    X = np.full((len(y), 1), baseline_value)
    X_test = np.full((len(y_test), 1), baseline_value)

  else:
    X = preprocess([line.strip() for line in review_body], topics, weights)
    X_test = preprocess([line.strip() for line in test_review_body], topics, weights)

  model = train_regression_model(X, y)

  mse_test, rmse_test, r2_test, mae_test = evaluate_model(model, X_test, y_test)
  mse_train, rmse_train, r2_train, mae_train = evaluate_model(model, X, y)

  print("Train")
  print("R2:", r2_train)
  print("MAE:", mae_train)
  print("MSE:", mse_train)
  print("RMSE:", rmse_train)

  print()

  print("Test")
  print("R2:", r2_test)
  print("MAE:", mae_test)
  print("MSE:", mse_test)
  print("RMSE:", rmse_test)

  return r2_train, mae_train, mse_train, rmse_train, r2_test, mae_test, mse_test, rmse_test

## BERTopic

In [127]:
# Topic-weight matrix for the combined train + test reviews.
# NOTE(review): this variable is not read by any later cell in the visible notebook (results()
# builds its own train and test matrices) — confirm it is still needed.
bert_topic_distribution_train = preprocess([line.strip() for line in review_body + test_review_body], bert_topic_topics, bert_topic_loglik)

In [128]:
# Fit the star-rating regression on the BERTopic features; prints and returns train/test metrics.
bert_r2_train, bert_mae_train, bert_mse_train, bert_rmse_train, bert_r2_test, bert_mae_test, bert_mse_test, bert_rmse_test = results(bert_topic_topics, bert_topic_loglik)

Train
R2: 0.34514832227857084
MAE: 0.9541980408746576
MSE: 1.3202704202914155
RMSE: 1.1490302086069868

Test
R2: 0.30292655413296565
MAE: 0.9828384184342304
MSE: 1.3941468917340687
RMSE: 1.1807399763428308


In [129]:
# One-row summary table: columns 0-3 hold train R2 / MAE / MSE / RMSE, columns 4-7 the same
# metrics on the test set.
pd.Series([
    bert_r2_train, bert_mae_train, bert_mse_train, bert_rmse_train,
    bert_r2_test, bert_mae_test, bert_mse_test, bert_rmse_test,
]).to_frame().T

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.345148,0.954198,1.32027,1.14903,0.302927,0.982838,1.394147,1.18074
