In [76]:

import os
import json
from pathlib import Path

import boto3
import numpy as np
from scipy import sparse
from sklearn.decomposition import LatentDirichletAllocation

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# --- S3 source locations ---
# The bucket can be overridden via the BUCKET_NAME environment variable;
# the object keys point at the CountVectorizer artefacts inside that bucket.
BUCKET_NAME = os.getenv("BUCKET_NAME", "amazonreviewsnlp")
VECTORS_KEY = "amazon_vectors/countvectorizer/vectors.npz"
VOCAB_KEY = "amazon_vectors/countvectorizer/vocab.json"

# --- Local destination for every LDA artefact produced by this notebook ---
# The path is relative to the kernel's working directory; missing parents are
# created and an already-existing directory is left alone.
RESULTS_DIR = Path("../data/lda_results")
os.makedirs(RESULTS_DIR, exist_ok=True)

# Shared S3 client, reused by the download cell.
s3 = boto3.client("s3")

print("Using bucket:", BUCKET_NAME)
print("Saving results to:", RESULTS_DIR.resolve())


Using bucket: amazonreviewsnlp
Saving results to: /home/ubuntu/Topic_Modeling_on_Amazon_Reviews-/data/lda_results


In [77]:
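# Dependencies: uncomment to install them into this kernel's environment (%pip targets the running kernel).
# %pip install boto3 numpy scipy scikit-learn sentence-transformers wordcloud matplotlib ipykernel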


In [78]:

# Local copies of the S3 artefacts live next to the LDA results.
local_vectors_path = RESULTS_DIR / "count_vectors.npz"
local_vocab_path = RESULTS_DIR / "vocab.json"

# One row per artefact: (progress message, S3 object key, local destination).
s3_downloads = (
    ("Downloading count vectors from S3...", VECTORS_KEY, local_vectors_path),
    ("Downloading vocabulary from S3...", VOCAB_KEY, local_vocab_path),
)

for progress_message, object_key, destination in s3_downloads:
    print(progress_message)
    # download_file is given the destination as a plain string path.
    s3.download_file(BUCKET_NAME, object_key, str(destination))

print("Done.")


Downloading count vectors from S3...
Downloading vocabulary from S3...
Done.


In [None]:

# Load the document-term count matrix that was downloaded from S3.
print("Loading sparse matrix...")
X = sparse.load_npz(local_vectors_path)
print("Matrix shape:", X.shape)

# The vocabulary file maps term -> column index. Read it as UTF-8 explicitly
# so decoding does not depend on the platform's default locale encoding.
with open(local_vocab_path, encoding="utf-8") as f:
    vocab = json.load(f)

# Invert to column index -> term so terms can be looked up by matrix column.
inv_vocab = {int(idx): term for term, idx in vocab.items()}

# Every matrix column must have exactly one term. Fail loudly here rather than
# with a bare KeyError below, or (worse) with silently mislabelled topic terms.
n_features = X.shape[1]
if len(inv_vocab) != n_features or any(i not in inv_vocab for i in range(n_features)):
    raise ValueError(
        f"Vocabulary does not match the count matrix: expected indices "
        f"0..{n_features - 1} ({n_features} columns), "
        f"got {len(inv_vocab)} distinct indices."
    )

# feature_names[i] is the term counted in column i of X.
feature_names = [inv_vocab[i] for i in range(n_features)]

print("Vocab size:", len(feature_names))
print("Sample terms:", feature_names[:20])


Loading sparse matrix...
Matrix shape: (10000, 1756)
Vocab size: 1756
Sample terms: ['ability', 'able', 'absolutely', 'absorb', 'absorbed', 'absorbed skin', 'absorbs', 'absorbs quickly', 'accessories', 'accidentally', 'accurate', 'acid', 'acne', 'acrylic', 'action', 'actual', 'actually', 'add', 'added', 'adding']


In [None]:


# Model size and number of passes over the data.
n_topics = 10
max_iter = 10

# All estimator settings in one place so they are easy to scan and tweak.
lda_settings = {
    "n_components": n_topics,
    "max_iter": max_iter,
    "learning_method": "batch",
    "random_state": 42,  # fixed seed so repeated runs give the same topics
    "n_jobs": -1,        # -1 asks scikit-learn to use all available processors
}
lda = LatentDirichletAllocation(**lda_settings)

print("Fitting LDA with", n_topics, "topics...")
lda.fit(X)
print("Done.")


Fitting LDA with 10 topics...
Done.


In [None]:

# Number of highest-weight terms extracted for each topic (passed to
# get_topic_terms below).
n_top_words = 20
# Text file inside RESULTS_DIR that receives the per-topic term listing.
topics_txt_path = RESULTS_DIR / "lda_topics.txt"

def get_topic_terms(lda_model, feature_names, n_top_words=20):
    """Return the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    lda_model
        Object exposing ``components_``: an iterable of 1-D weight arrays,
        one per topic, indexed by feature position.
    feature_names
        Sequence mapping a feature position to its term.
    n_top_words : int, default 20
        How many terms to keep per topic.

    Returns
    -------
    list[list[tuple[str, float]]]
        One list per topic, holding ``(term, weight)`` pairs ordered from the
        largest weight to the smallest.
    """

    def _strongest_terms(weights):
        # argsort is ascending: flip it, then keep the leading n_top_words.
        ranked = weights.argsort()[::-1][:n_top_words]
        return [(feature_names[pos], float(weights[pos])) for pos in ranked]

    return [_strongest_terms(weights) for weights in lda_model.components_]

# Extract the top (term, weight) pairs for every fitted topic.
topics = get_topic_terms(lda, feature_names, n_top_words=n_top_words)

# Write one block per topic: a header, a separator, then "term<TAB>weight"
# lines and a trailing blank line. The encoding is pinned to UTF-8 so terms
# with non-ASCII characters are written identically on every platform instead
# of depending on (or failing under) the locale's default encoding.
with open(topics_txt_path, "w", encoding="utf-8") as f:
    for k, topic in enumerate(topics):
        f.write(f"Topic {k}\n")
        f.write("--------------------\n")
        f.writelines(f"{term}\t{weight:.4f}\n" for term, weight in topic)
        f.write("\n")

print("Saved topics to:", topics_txt_path.resolve())


Saved topics to: /home/ubuntu/Topic_Modeling_on_Amazon_Reviews-/data/lda_results/lda_topics.txt


In [None]:

# One PNG per topic is written into a "wordclouds" folder under RESULTS_DIR.
wordcloud_dir = RESULTS_DIR / "wordclouds"
wordcloud_dir.mkdir(exist_ok=True)

for k, topic in enumerate(topics):
    # Each topic is a list of (term, weight) pairs; WordCloud takes a mapping.
    freq_dict = dict(topic)

    cloud = WordCloud(width=800, height=400, background_color="white")
    cloud_image = cloud.generate_from_frequencies(freq_dict)

    # Explicit figure/axes handles instead of the implicit pyplot state.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud_image, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(f"Topic {k}", fontsize=16)

    out_path = wordcloud_dir / f"topic_{k}.png"
    fig.savefig(out_path, bbox_inches="tight")
    # Close each figure once saved so figures do not pile up across the loop.
    plt.close(fig)

    print(f"Saved wordcloud for Topic {k} to {out_path}")


Saved wordcloud for Topic 0 to ../data/lda_results/wordclouds/topic_0.png
Saved wordcloud for Topic 1 to ../data/lda_results/wordclouds/topic_1.png
Saved wordcloud for Topic 2 to ../data/lda_results/wordclouds/topic_2.png
Saved wordcloud for Topic 3 to ../data/lda_results/wordclouds/topic_3.png
Saved wordcloud for Topic 4 to ../data/lda_results/wordclouds/topic_4.png
Saved wordcloud for Topic 5 to ../data/lda_results/wordclouds/topic_5.png
Saved wordcloud for Topic 6 to ../data/lda_results/wordclouds/topic_6.png
Saved wordcloud for Topic 7 to ../data/lda_results/wordclouds/topic_7.png
Saved wordcloud for Topic 8 to ../data/lda_results/wordclouds/topic_8.png
Saved wordcloud for Topic 9 to ../data/lda_results/wordclouds/topic_9.png
