# 📓 Notebook 2: Topic Modeling (Customer Voice)

**Goal:** Discover themes in Reviews (Size, Fit, Material) and Support data (Shipping, Billing, Tech Support).

**Input:** `all_chunks.parquet` and `embeddings.npy` (outputs of Notebook 1)

In [None]:
!pip install -q bertopic umap-learn hdbscan plotly

In [None]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
import os

# ─── 1. LOAD DATA ───
# Reads the chunk table and its precomputed embeddings, and exposes
# `df`, `embeddings` and `docs` to the cells below.

# Adjust path to where you uploaded the dataset/output from NB1
INPUT_DIR = "/kaggle/input/customer-voice-processed-nb1"
OUTPUT_DIR = "/kaggle/working"

# Fallback check: when the dataset directory is missing, reuse files already
# sitting in the working directory (e.g. when running linearly after NB1).
if not os.path.exists(INPUT_DIR):
    if os.path.exists("/kaggle/working/all_chunks.parquet"):
        INPUT_DIR = "/kaggle/working"
    else:
        print("Warning: Input dataset not found. Please check paths.")

# Keep the try body limited to the two reads that can actually fail on I/O.
try:
    df = pd.read_parquet(f"{INPUT_DIR}/all_chunks.parquet")
    embeddings = np.load(f"{INPUT_DIR}/embeddings.npy")
except Exception as e:
    print(f"Data Load Error: {e}")
    # Re-raise: every later cell needs df/docs/embeddings, so continuing
    # would only surface as a confusing NameError further down.
    raise

docs = df['text'].tolist()

# The embeddings are passed alongside `docs` during training, so there must
# be exactly one embedding row per document.
if len(docs) != len(embeddings):
    raise ValueError(
        f"Mismatch: {len(docs)} documents but {len(embeddings)} embedding rows."
    )

print(f"Loaded {len(docs)} items.")

In [None]:
# ─── 2. CONFIGURE MODEL ───
# Seed word lists for e-commerce & support themes; handed to BERTopic through
# `seed_topic_list` below.

# Product issues (reviews)
review_seeds = [
    ["wrong size", "too small", "too large", "fit", "sizing"],
    ["material", "fabric", "quality", "texture", "cheap"],
    ["color", "design", "style", "look", "pattern"],
]

# Support issues (tweets)
support_seeds = [
    ["shipping", "delivery", "late", "package", "tracking"],
    ["refund", "return", "exchange", "charge", "billing"],
    ["app code", "login", "password", "crash", "error"],
]

# Order is preserved: review themes first, then support themes.
seeds = review_seeds + support_seeds

# Dimensionality reduction step (fixed random_state).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    metric='cosine',
    random_state=42,
)

# Clustering step; prediction_data=True is requested at construction time.
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    prediction_data=True,
)

# NOTE(review): the training cell passes precomputed embeddings; presumably
# they were produced with this same sentence-transformers model — confirm
# against NB1.
topic_model = BERTopic(
    embedding_model="sentence-transformers/all-mpnet-base-v2",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    seed_topic_list=seeds,
    verbose=True,
)

In [None]:
# ─── 3. TRAIN ───
# Fit the topic model and assign a topic to every document. Large corpora
# are fitted on a subsample and then labelled in full via transform().

MAX_FULL_FIT_DOCS = 100000  # above this size, fit on a subsample only
TRAIN_SAMPLE_SIZE = 50000   # number of documents used for that fit
SAMPLE_SEED = 42            # fixed seed so the subsample is reproducible

if len(docs) > MAX_FULL_FIT_DOCS:
    print(f"Subsampling for training ({TRAIN_SAMPLE_SIZE} random docs)...")
    # Draw a random sample instead of taking the head of the file: if the
    # rows are grouped by source, the first N rows would under-represent
    # (or entirely miss) whatever comes later.
    rng = np.random.default_rng(SAMPLE_SEED)
    sample_idx = np.sort(
        rng.choice(len(docs), size=TRAIN_SAMPLE_SIZE, replace=False)
    )
    sample_docs = [docs[i] for i in sample_idx]
    topic_model.fit(sample_docs, embeddings=embeddings[sample_idx])
    # Label the full corpus with the model fitted on the sample.
    topics, probs = topic_model.transform(docs, embeddings=embeddings)
else:
    topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# NOTE(review): on the subsampled path the counts shown here presumably
# describe the fitted sample rather than the full corpus — confirm.
freq = topic_model.get_topic_info()
print(freq.head(10))

In [None]:
# ─── 4. EXPORT ───
# Persist the fitted model, the topic-labelled chunks and a topic map.
# All paths derive from OUTPUT_DIR (set in the load cell) so the output
# location is configured in one place.
topic_model.save(
    f"{OUTPUT_DIR}/bertopic_model",
    serialization="safetensors",
    save_ctfidf=True,
)

# One topic id per row of df, in the same order as `docs`.
df['topic'] = topics
df.to_parquet(f"{OUTPUT_DIR}/chunks_with_topics.parquet")

# Visualization is best-effort: a failure here is reported but must not
# invalidate the artifacts already written above.
try:
    fig = topic_model.visualize_topics()
    fig.write_html(f"{OUTPUT_DIR}/topic_map.html")
    fig.show()
except Exception as e:
    print(f"Viz error: {e}")