In [2]:
from bertopic import BERTopic
from bertopic.representation import ZeroShotClassification
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import spacy
from huggingface_hub import login

# plotly.io is imported here because the import block above does not provide `pio`;
# without it the renderer assignment below fails with a NameError on a fresh kernel.
import plotly.io as pio

# Set up Plotly renderer: show figures through the 'iframe' renderer
pio.renderers.default = 'iframe'

# Set up environment variables for AWS.
# Indexing os.environ (instead of os.getenv) fails fast with a KeyError when one is unset.
AWS_DEFAULT_REGION = os.environ["AWS_DEFAULT_REGION"]
AWS_ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

# Load Spacy model for lemmatization (shared by the lemmatization() helper below)
nlp = spacy.load('en_core_web_sm')

# Lemmatization helper built on the module-level spaCy pipeline `nlp`
def lemmatization(text):
    """Return ``text`` with every token replaced by its lemma.

    Args:
        text: The string to run through the module-level ``nlp`` pipeline.

    Returns:
        str: The ``lemma_`` of each token, in document order, joined by single
        spaces. An input that yields no tokens gives an empty string.
    """
    return ' '.join(token.lemma_ for token in nlp(text))


In [None]:
# Read the reviews CSV from the mounted volume (point data_path at your own file)
data_path = '/review_classification_project/alarmy_reviews.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# Report (rows, columns) as a quick sanity check on the load
print(f"Data Loaded. Shape: {data.shape}")


In [None]:
# Drop rows with missing content. The explicit copy gives this cell its own frame, so the
# column assignment below writes to an independent DataFrame rather than to a filtered view.
data = data.dropna(subset=['text']).copy()

# Apply lemmatization.
# lemmatization() always returns a string, so a review that lemmatizes to nothing shows up as
# an empty / whitespace-only string, never as NaN. Convert those blanks to NaN here so that
# the dropna below can actually remove them.
data['preprocessed_content'] = (
    data['text']
    .apply(lemmatization)
    .replace(r'^\s*$', np.nan, regex=True)
)

# Drop rows with missing lemmatized content
data = data.dropna(subset=['preprocessed_content'])
print(f"Data Preprocessed. Shape after cleaning: {data.shape}")

In [None]:
# Candidate topic names offered to the zero-shot steps below
zeroshot_topic_list = [
    "android",
    "premium",
    "ads",
    "math",
    "subscription",
    "update",
    "camera",
    "shake",
    "weather",
    'snooze',
    'loud',
    'doesn',
    'off',
]

# Documents for BERTopic: the lemmatized review texts, in DataFrame row order
docs = list(data['preprocessed_content'].values)

# Representation model that scores topics against the candidate names
representation_model = ZeroShotClassification(
    zeroshot_topic_list,
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
)

# Collect the BERTopic settings in one place, then build the model from them
bertopic_settings = dict(
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=.85,
    representation_model=representation_model,
    nr_topics=50,
)
topic_model = BERTopic(**bertopic_settings)

# Fit on the documents; `topics` holds one topic id per entry of `docs`.
# NOTE(review): no random seed is fixed anywhere in this cell, while later cells refer to
# topic ids by hard-coded number - presumably those ids can shift between runs; confirm.
topics, probs = topic_model.fit_transform(docs)
print("Topic modeling completed.")

In [None]:
# Summary table of the fitted topics, rendered with the notebook's rich display
topic_info = topic_model.get_topic_info()
display(topic_info)

# Bar chart of the leading topics: up to 60 topics, 10 words each
barchart_options = {"top_n_topics": 60, "n_words": 10}
fig = topic_model.visualize_barchart(**barchart_options)
fig.show()

In [None]:
# Topic id to inspect in this cell
topic_number = 25

# Show the word representation stored for that topic
print(f"Top words for topic {topic_number}:")
print(topic_model.get_topic(topic_number))

# Positions in `docs` whose assigned topic matches; the first 10 become the examples
topic_indices = [
    position
    for position, assigned_topic in enumerate(topics)
    if assigned_topic == topic_number
]
example_reviews = [docs[position] for position in topic_indices[:10]]

print(f"\n10 Example Reviews for Topic {topic_number}:\n")
for idx, review in enumerate(example_reviews, start=1):
    print(f"Review {idx}: {review}\n")


In [None]:
# Pairwise cosine similarity between the topic embeddings.
# NOTE: the variable and column names say "distance", but cosine_similarity returns
# similarities, so a HIGHER value means the two topics are closer.
topic_label_names = list(topic_model.topic_labels_.values())
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))
dist_df = pd.DataFrame(distance_matrix, columns=topic_label_names, index=topic_label_names)

# Flatten the square matrix into one record per ordered (topic1, topic2) pair.
# reset_index() exposes the row label under the 'index' key, which is skipped as a column.
tmp = [
    {'topic1': rec['index'], 'topic2': t2, 'distance': rec[t2]}
    for rec in dist_df.reset_index().to_dict('records')
    for t2 in rec
    if t2 != 'index'
]

pair_dist_df = pd.DataFrame(tmp)

# Drop every pair involving a label that starts with '-1'
# (presumably BERTopic's outlier topic - confirm).
involves_minus_one = (
    pair_dist_df.topic1.str.startswith('-1') | pair_dist_df.topic2.str.startswith('-1')
)
pair_dist_df = pair_dist_df[~involves_minus_one]

# Keep each unordered pair once and drop self-pairs (string comparison of the labels)
pair_dist_df = pair_dist_df[pair_dist_df.topic1 < pair_dist_df.topic2]

# Show the 50 most similar pairs. This is not the last statement of the cell, so a bare
# expression would be silently discarded; display() renders it explicitly.
display(pair_dist_df.sort_values('distance', ascending=False).head(50))

# Merge closely related topics
# NOTE(review): topic 40 is listed in two groups ([1, 17, 31, 34, 40] and [18, 40]);
# which group it ends up in depends on how merge_topics resolves the overlap - confirm intent.
# NOTE(review): the ids are hard-coded for one particular fit; presumably re-running this
# cell after a merge (topics get renumbered) or after a refit merges the wrong topics - confirm.
topic_model.merge_topics(docs, [[10, 3, 37], [2, 13, 15, 19], [12, 30, 25], 
                                [4, 5, 7, 20, 21, 27, 28, 29, 33, 36, 43, 44, 46], 
                                [1, 17, 31, 34, 40], [9, 26, 41], [18, 40]])

In [None]:
# Human-readable label for each topic id, applied to the model as custom labels
topic_labels_dict = {
    0: "Math",
    1: "Sometimes not Ringing",
    2: "Good App",
    3: "Premium Subscription",
    4: "Loud",
    5: "Take Photo",
    6: "Snooze",
    7: "Easy to Use",
    8: "Barcode Scanner",
    9: "Update",
    10: "Shake Mission",
    11: "Horoscope/News",
    12: "Overheating",
    13: "Storage Size",
    14: "Challenges",
}
topic_model.set_topic_labels(topic_labels_dict)

# Authenticate with the token from the HUGGINGFACE_TOKEN environment variable
# (os.getenv yields None when it is unset), then upload the model including its c-TF-IDF.
login(os.getenv("HUGGINGFACE_TOKEN"))
topic_model.push_to_hf_hub(
    repo_id="DobreMihai/bertopic_ready_labeled",
    save_ctfidf=True,
)