# Evaluating Model Performance 

In [2]:
# Imports 

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import torch
import umap
import hdbscan
import pandas as pd
import numpy as np
import json
from gensim.corpora import Dictionary

from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score



from gensim.models.ldamodel import LdaModel
from gensim import corpora
from scipy.sparse import load_npz

import joblib


## Topic Coherence  

### LDA

In [None]:
lda_topics = []
current_topic = []

with open("../data/lda_results/lda_topics.txt", "r") as f:
    for line in f:
        line = line.strip()
        
        if not line:
            continue

        # Where the topic starts 
        if line.startswith("Topic"):
            if current_topic:
                lda_topics.append(current_topic)
                current_topic = []
            continue

        #skipping the dashed line in txt
        if line.startswith("-"):
            continue

        # extracting the word the first item before each the tab
        word = line.split("\t")[0]
        current_topic.append(word)

#appending the the final topic
if current_topic:
    lda_topics.append(current_topic)

print("First topic sample:", lda_topics[0][:10])  #checks out


First topic sample: ['hair', 'fine', 'iron', 'day', 'place', 'work', 'clips', 'stay', 'flat', 'dryer']


In [5]:
# Tokenizing & creating gensim dictionary

df_5k = pd.read_csv("../data/bert_results/bertopic_5k_reviews_with_topics.csv")  # or your saved CSV
documents = df_5k["text"].astype(str).tolist()
tokenized_docs = [doc.split() for doc in documents]

dictionary = Dictionary(tokenized_docs)

In [6]:
# Getting the topic coherence 

cm_lda = CoherenceModel(
    topics=lda_topics,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'
)

lda_coherence = cm_lda.get_coherence()
print("LDA Coherence (c_v):", lda_coherence)


LDA Coherence (c_v): 0.2999875790610845


The LDA model got a coherence score of 0.2999, which is pretty low compared to the other models below. However, this isn't surprising, since LDA relies mostly on word counts and doesn’t really capture the meaning behind what people are saying in the reviews. With beauty products, consumers mostly use varied and subjective language when describing products, so LDA ends up forming topics that are broader and introduces some noise. Although LDA still captured themes, the low coherence score did have challenges with consistently interpretable topics. 

### LSA

In [7]:
#Extracting the topic words again for LSA similar code as LDA 


lsa_topics = []
current_topic = []

with open("../data/lsa_results/lsa_topics.txt", "r") as f:
    for line in f:
        line = line.strip()

        if line.startswith("### LSA Topic"):
            if current_topic:
                lsa_topics.append(current_topic)
                current_topic = []
            continue

        if not line:
            continue

        parts = line.split()
        word = parts[0]
        current_topic.append(word)

if current_topic:
    lsa_topics.append(current_topic)

print("Sample topic words:", lsa_topics[0][:10])


# Computing the coherence for LSA using the dictionary and tokenized documents as above of the 5k review 


cm_lsa = CoherenceModel(
    topics=lsa_topics,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'
)

lsa_coherence = cm_lsa.get_coherence()
print("LSA Coherence (c_v):", lsa_coherence)




Sample topic words: ['hair', 'skin', 'work', 'brush', 'color', 'long', 'quality', 'price', 'face', 'dry']
LSA Coherence (c_v): 0.3057626280257974


The LSA model reached a coherence score of 0.3057, which is actually solid since the model is simpler than the other methods that compared the embedding-based approach. Since LSA models capture both frequency and word relationships, even if words they don’t fully belong to the same real-world theme, so this leads to topics feeling a bit broader or slightly noisier. 

Although the score is low, for LSA its a not as bad since it was still able to pick up on some meaningful structure in the reviews, especially around strong, repeated product descriptors like “hair,” “skin,” and “smell.” The overall performance was better than expected but it didn't outperform as well or reach interpretability as the BERTopic did. 

### BERT

In [8]:
# Topic Coherence 

#Topic Coherence will be evaluating:
#How meaningful the top words in each topic are
#If the words tend to co-occur in the same context
#And +  interpretable the topic is to humans.... We want higher = better (.35 - .55) 

#Initial assumption: I believe that Bertopic will outperform LDA/LSA here

# Loading our previous results 

# Truncated 5k review subset. 
df_5k = pd.read_csv("../data/bert_results/bertopic_5k_reviews_with_topics.csv")  # or your saved CSV
documents = df_5k["text"].astype(str).tolist()

#Bert model 
topic_model = BERTopic.load("../data/bert_results/bertopic_model")
print("Loaded BERTopic model.")

#Topic assignments 
topics = df_5k["topic"].tolist()
print("Loaded topics:", len(topics))


Loaded BERTopic model.
Loaded topics: 5000


In [None]:



#Top words for ->  each topic
topics_words = [
    [word for word, _ in topic_model.get_topic(t)]
    for t in set(topics)
]

#Tokenization of the docs. 

tokenized_docs = [doc.split() for doc in documents]
dictionary = Dictionary(tokenized_docs)


#Getting the c_v coherence -> and will later put this into a table to compare. 
cm = CoherenceModel(
    topics=topics_words,
    texts=tokenized_docs,
    dictionary=dictionary,  
    coherence='c_v'
)

coherence_score = cm.get_coherence()
print("BERTopic Coherence for Reviews (c_v):", coherence_score)


BERTopic Coherence for Reviews (c_v): 0.4204005495131143


The BERTopic model achieved a c_v coherence score of **0.4204**, this indicates an above avagera level of semantic consistency with topics that it extracted. In the context of consumer product reviews, where language can highly subjective, and contain a lot of noise in the data, the modeal was able to capture the topics well and with even with text tha can often be informal. Since coherence scores that range between the 0.35–0.55 range are considered good, this result suggests that BERTopic successfully identified interpretable and meaningful themes across our Amazon Beauty reviews. 

This scores and the visuals obtaine earlier show that  the top words in each topic shows strong co-occurrence patterns and shared contextual meaning. Overall, the coherence performance confirms that a transformer-based, embedding-driven approach produces well-structured topics that align closely with human interpretations of review content.    ---> Pair these results with the overview

## Topic Diversity 

### LDA

In [8]:
# Unique Topic Words Ratio
unique_words = set()
total_words = 0
# correction: use lda_topics
for topic_words in lda_topics:
    top_words = topic_words[:10] 
    total_words += len(top_words)
    unique_words.update(top_words)

# Printing the results
lda_topic_diversity = len(unique_words) / total_words
print("LDA Topic Diversity:", lda_topic_diversity)

LDA Topic Diversity: 0.83


The LDA model achieved a really high topic diversity score of 0.83. This shows that the top words across topics are mostly unique and do not repeat very often. This means that LDA is spreading its vocab widely instead of just clustering around the same or similar terms. This can be positive because  the model is capturing  a wide range of themes in the reviews, although this was a hindrance in the performance of earlier in the metrics, this was a sign that the topics are a bit too broad or loosely defined, which lines up with the lower coherence we saw earlier. So while LDA is diverse in and it covers a lot of different words, that doesn’t always translate into cleaner or more meaningful topics compared to the embedding approach like the BERTtopic below.

### LSA 

In [9]:
#Topic Diversity for LSA. 

for topic_words in lsa_topics:
    top_words = topic_words[:10]   
    total_words += len(top_words)
    unique_words.update(top_words)

lsa_topic_diversity = len(unique_words) / total_words
print("LSA Topic Diversity:", lsa_topic_diversity)


LSA Topic Diversity: 0.47


The LSA model reached a topic diversity score of 0.47; this score is in the moderate range, since methods like LSA often end up reusing strong descriptive words and do so across multiple topics, so its themes naturally overlap more than those from LDA. A score such as 0.47 suggests that even though LSA does capture different angles of beauty review data, many of the topics share a similar vocabulary, so this can make them feel less distinct. From the score, we can see that LSA struggled a bit to produce clearly separated and interpretable topics compared to the LDA model.

### BERT

In [9]:
# Topic Diversity 

#High topic diversity = the model is capturing many different themes rather than repeating the same idea in multiple topics.

#Typical ranges: 0.50 & below poor and topic usually repeats Above .70 is good the middle is moderate

#.70–0.90 → High diversity (excellent)
# Initial assumption: I believe that BERTopic will outperform LDA/LSA here as well.

# Using the top 10 words per  -> (This is uniform) 
u_words = set()
t_words = 0

for topic_id in set(topics):
    top_words = [word for word, _ in topic_model.get_topic(topic_id)[:10]]
    t_words += len(top_words)
    u_words.update(top_words)

topic_dei = len(u_words) / t_words
print("BERTopic Topic Diversity:", topic_dei)


BERTopic Topic Diversity: 0.6383561643835617


The model achieved a topic diversity score of  0.638 , which suggest that the topics are fairly distinct from one another and not overly repetitive. Even though it’s not extremely high, it still showcases the model's ability to capture a wide range of themes from the reviews. This level of diversity means users can get more varied insights across topics, even if a few of them slightly overlap in meaning.  ---> Pair these results with the tree

## Silhouette Score (Clustering Performance)

### LDA 

In [None]:
# Using the LDA model 
lda_model = joblib.load("../data/lda_results/lda_model.joblib")
# loading count verctors again 
count_vectors = load_npz("../data/lda_results/count_vectors.npz")

#Converting -> array
X = count_vectors.toarray()

lda_vectors = lda_model.transform(X)   # shape: (n_docs, n_topics)

#Assigning the each doc top strongest topic 
lda_labels = np.argmax(lda_vectors, axis=1)

#Computing the score. 
sil_score = silhouette_score(lda_vectors, lda_labels)

print("LDA Silhouette Score:", sil_score)


LDA Silhouette Score: 0.4563104671340692


The LDA model ended up with a silhouette score of 0.456; the score is actually much higher than we had anticipated. This suggests that the LDA topics were quite well-separated and the documents mostly clustered pretty consistently around their dominant topic. Sometimes LDA can struggle with overlapping language, in this case, it managed to form groups that were distinct enough for the silhouette score to reflect meaningful structure in the reviews. 

### LSA

In [15]:


# Load LSA document-topic vectors
lsa_vectors = load_npz("../data/lsa_results/vectors.npz").toarray()

# Assign each document to the topic with highest weight
lsa_labels = np.argmax(lsa_vectors, axis=1)

# Compute Silhouette Score
lsa_silhouette = silhouette_score(lsa_vectors, lsa_labels)
print("LSA Silhouette Score:", lsa_silhouette)


LSA Silhouette Score: -0.026518421452788203


The LSA model achieved a remarkably low silhouette score of -0.026. At first, it was alarming, but LSA tends to intertwine patterns in the data, so documents didn't get clustered into clean, distinct groups. This is because the distance in the documents is very little, so the topics will not be really distinct but rather overlap each other. LSA isn't necessarily a bad model, but rather it captures wider and more blended topics rather than clean, separated ones. 

### BERTopic

In [8]:
# Import the og embeddings 
#Og Embeddings  Deleted them will add them if needed. 
embeddings = np.load("../cleaned_data/embeddings/embeddings.npy")
#print("Loaded embeddings:", embeddings.shape) 

In [None]:

#extracting the topic labels
labels = np.array(topics)

# Removing any outlier  outliers (-1) -> none from the visuals. but just incase. 
mask = labels != -1
filtered_embeddings = embeddings[mask]
filtered_labels = labels[mask]

print("Original:", len(labels), "Filtered:", len(filtered_labels))

score = silhouette_score(filtered_embeddings, filtered_labels)
print("BERTopic Silhouette Score:", score)

Original: 5000 Filtered: 3692
BERTopic Silhouette Score: 0.05022962763905525


For clustering performance, the BERTopic model gave a silhouette score of 0.05 after removing outliers. This isn’t surprising for beauty review data since people describe products in all kinds of ways, and the themes naturally overlap. So even though the clusters aren’t super tight or clearly separated in the embedding space, the model still manages to pull out meaningful patterns. When you look at this together with the coherence and diversity results, it shows that BERTopic can still capture useful, real-world topics even if the underlying clusters are a bit loose, which honestly reflects how messy human-written reviews usually are.

## Performance Metric Comparison Table. 

## Topic Model Evaluation Comparison

| Model     | Topic Coherence (c_v) | Topic Diversity | Silhouette Score (Cluster Performance) |
|-----------|------------------------|-----------------|----------------------------------------|
| **LDA**        | 0.3000                 | **0.83**           | **0.4563**                               |
| **LSA**        | 0.3058                 | 0.47            | -0.0265                                 |
| **BERTopic**   | **0.4204**             | 0.6384         | 0.0502                                  |
