### Cleaning

In [None]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
import pandas as pd

# Load the raw data you collected
try:
    df = pd.read_csv('reddit_gpt5_comments.csv')
    print(f"✅ Loaded {len(df)} raw comments.")
except FileNotFoundError:
    print("❌ Error: 'reddit_gpt5_comments.csv' not found. Please upload the raw scraped file.")
    # Exit gracefully if the file isn't there
    df = pd.DataFrame()

if not df.empty:
    # --- Filter 1: Remove moderator posts and summaries ---
    filter_phrases = [
        "This AMA has been verified",
        "Questions & Answers by OpenAI team",
        "Source thread:",
        "I asked ChatGPT Agent to create this comment",
        "will start at"
    ]
    pattern = '|'.join(filter_phrases)
    df_cleaned = df[~df['comment_text'].str.contains(pattern, case=False, na=False)]
    print(f"🔍 Comments after removing mod/summary posts: {len(df_cleaned)}")

    # --- Filter 2: Remove very short comments ---
    min_word_count = 10
    df_cleaned = df_cleaned[df_cleaned['comment_text'].str.split().str.len() > min_word_count]
    print(f"🧹 Comments after removing short posts: {len(df_cleaned)}")

    # --- Final Step: Save the Cleaned Data ---
    output_file = 'reddit_gpt5_comments_cleaned.csv'
    df_cleaned.to_csv(output_file, index=False)

    print(f"\n✅ Success! Basic cleaning complete. Saved {len(df_cleaned)} comments to '{output_file}'")

✅ Loaded 5306 raw comments.
🔍 Comments after removing mod/summary posts: 5305
🧹 Comments after removing short posts: 4079

✅ Success! Basic cleaning complete. Saved 4079 comments to 'reddit_gpt5_comments_cleaned.csv'


In [None]:
import pandas as pd
from transformers import pipeline
import torch

# --- 1. Load your cleaned data ---
try:
    df = pd.read_csv('reddit_gpt5_comments_cleaned.csv')
    comments = df['comment_text'].tolist()
    print(f"Loaded {len(comments)} cleaned comments to filter.")
except FileNotFoundError:
    print("Please upload 'reddit_gpt5_comments_cleaned.csv' first.")
    comments = []

# --- 2. Set up the Zero-Shot Classification Pipeline ---
# Check if a GPU is available and set the device
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

# Initialize the pipeline with the CORRECTED model name.
# This will download the model the first time.
classifier = pipeline("zero-shot-classification",
                      model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli", # <-- CORRECTED NAME
                      device=device)

# --- 3. Define Labels and Classify ---
# The model will determine which of these labels is a better fit for each comment.
candidate_labels = ["user complaint or pain point", "neutral or positive comment"]

# Run the classification on all comments.
# This is the main processing step and will be much faster on a GPU.
if comments:
    print(f"Starting classification for {len(comments)} comments...")
    # We use a generator and batching for better memory efficiency
    results = classifier(comments, candidate_labels, multi_label=False, batch_size=8)
    print("Classification complete.")

    # --- 4. Filter and Save the Pain Points ---
    pain_points = []
    # The result for each comment contains labels and scores.
    # We check if the top-scoring label is the one we're looking for.
    for i, res in enumerate(results):
        if res['labels'][0] == "user complaint or pain point":
            pain_points.append(comments[i])

    # Create a new DataFrame and save it
    df_pain_points = pd.DataFrame(pain_points, columns=['comment_text'])
    output_file = 'reddit_gpt5_pain_points.csv'
    df_pain_points.to_csv(output_file, index=False)

    print(f"\n✅ Success! Identified {len(pain_points)} pain points.")
    print(f"Filtered data saved to '{output_file}'.")

Loaded 4079 cleaned comments to filter.
Using device: GPU


Device set to use cuda:0


Starting classification for 4079 comments...
Classification complete.

✅ Success! Identified 2585 pain points.
Filtered data saved to 'reddit_gpt5_pain_points.csv'.


### Clustering Pipeline

In [None]:
# Install all required libraries
!pip install sentence-transformers umap-learn hdbscan pandas

# Imports
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import umap
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
import hdbscan
import os
import csv
print("Libraries installed and imported successfully.")

Libraries installed and imported successfully.


  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


In [None]:
def get_embeddings(comments_list, model_name, cache_dir='embeddings_cache'):
    """
    Generates or loads cached embeddings for a list of comments.

    Args:
        comments_list (list): A list of strings (comments).
        model_name (str): The name of the SentenceTransformer model from Hugging Face.
        cache_dir (str): The directory to save/load cached embeddings.

    Returns:
        np.ndarray: The embedding vectors.
    """
    # Create a unique filename for the cached embeddings
    sanitized_model_name = model_name.replace('/', '_')
    cache_path = os.path.join(cache_dir, f'{sanitized_model_name}.npy')

    # Create the cache directory if it doesn't exist
    os.makedirs(cache_dir, exist_ok=True)

    if os.path.exists(cache_path):
        print(f"Loading cached embeddings from {cache_path}...")
        embeddings = np.load(cache_path)
    else:
        print(f"Cache not found. Generating embeddings with '{model_name}'...")
        # Load the model (this will download it on the first run)
        model = SentenceTransformer(model_name)
        # Generate embeddings
        embeddings = model.encode(comments_list, show_progress_bar=True)
        # Save the embeddings to the cache file
        np.save(cache_path, embeddings)
        print(f"Embeddings saved to {cache_path}.")

    return embeddings

In [None]:
def reduce_dimensions(embeddings, method='pca', n_components=50):
    """
    Reduces the dimensionality of embeddings using PCA or UMAP.

    Args:
        embeddings (np.ndarray): The high-dimensional embedding vectors.
        method (str): The reduction method ('pca' or 'umap').
        n_components (int): The number of dimensions to reduce to.

    Returns:
        np.ndarray: The reduced-dimension vectors.
    """
    print(f"Reducing dimensions using {method.upper()} to {n_components} components...")
    if method == 'pca':
        reducer = PCA(n_components=n_components)
    elif method == 'umap':
        reducer = umap.UMAP(n_components=n_components, n_neighbors=15, min_dist=0.1, random_state=42)
    else:
        raise ValueError("Method must be 'pca' or 'umap'")

    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings

In [None]:
def perform_clustering(embeddings, method='kmeans', **kwargs):
    """
    Performs clustering on the embeddings.

    Args:
        embeddings (np.ndarray): The vectors to cluster (can be full or reduced).
        method (str): The clustering algorithm ('kmeans', 'dbscan', or 'hdbscan').
        **kwargs: Algorithm-specific parameters (e.g., n_clusters for KMeans).

    Returns:
        np.ndarray: An array of cluster labels.
    """
    print(f"Performing clustering with {method.upper()}...")
    if method == 'kmeans':
        # Example: n_clusters is a required argument for KMeans
        clusterer = KMeans(n_clusters=kwargs.get('n_clusters', 8), random_state=42)
    elif method == 'dbscan':
        # Example: eps and min_samples are key parameters for DBSCAN
        clusterer = DBSCAN(eps=kwargs.get('eps', 0.5), min_samples=kwargs.get('min_samples', 5))
    elif method == 'hdbscan':
        clusterer = hdbscan.HDBSCAN(min_cluster_size=kwargs.get('min_cluster_size', 10),
                                      min_samples=kwargs.get('min_samples', 1))
    else:
        raise ValueError("Method must be 'kmeans', 'dbscan', or 'hdbscan'")

    labels = clusterer.fit_predict(embeddings)
    return labels

### Experimentation

In [None]:
# --- 1. Load your final "pain points" data ---
try:
    df = pd.read_csv('reddit_gpt5_pain_points.csv')
    comments = df['comment_text'].tolist()
    print(f"✅ Loaded {len(comments)} pain point comments for clustering.")
except FileNotFoundError:
    print("❌ Error: 'reddit_gpt5_pain_points.csv' not found.")
    comments = []

# --- 2. Define your experiments ---
# Using the two recommended models for your experiment
embedding_models_to_test = ['BAAI/bge-large-en-v1.5', 'all-MiniLM-L6-v2']
reduction_methods_to_test = ['pca', 'umap']
clustering_methods_to_test = ['kmeans', 'dbscan', 'hdbscan']

❌ Error: 'reddit_gpt5_pain_points.csv' not found.


In [None]:

# 3. Setup for logging results
results_dir = 'cluster_results'
summary_file = os.path.join(results_dir, 'grid_search_summary.csv')
os.makedirs(results_dir, exist_ok=True)

# Create header for the summary file if it doesn't exist
if not os.path.exists(summary_file):
    with open(summary_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['experiment_name', 'embedding_model', 'reducer', 'clusterer',
                         'num_clusters', 'noise_points', 'cluster_sizes'])

# --- 4. Run the full pipeline for all combinations ---
if comments:
    for model_name in embedding_models_to_test:
        current_embeddings = get_embeddings(comments, model_name)

        for red_method in reduction_methods_to_test:
            n_components = 50 if red_method == 'pca' else 10
            reduced_embeds = reduce_dimensions(current_embeddings, method=red_method, n_components=n_components)

            for clus_method in clustering_methods_to_test:
                experiment_name = f"{model_name.split('/')[-1]}__{red_method}__{clus_method}"
                result_path = os.path.join(results_dir, f'{experiment_name}_labels.npy')

                if os.path.exists(result_path):
                    print(f"✅ Skipping experiment '{experiment_name}', result already exists.")
                    continue

                print(f"\n--- RUNNING EXPERIMENT: {experiment_name} ---")

                params = {}
                if clus_method == 'kmeans':
                    params['n_clusters'] = 8 # Your fixed K for the benchmark

                labels = perform_clustering(reduced_embeds, method=clus_method, **params)
                np.save(result_path, labels) # Save raw labels

                               # --- Calculate and Log Summary ---
                unique_labels, counts = np.unique(labels, return_counts=True)
                cluster_sizes = dict(zip(unique_labels, counts))
                num_clusters = len(unique_labels)
                noise_points = cluster_sizes.get(-1, 0) # Get count of -1 label, default to 0
                if -1 in cluster_sizes:
                    num_clusters -= 1 # Don't count noise as a "cluster"

                # --- NEW LINE: Clean the dictionary for prettier logging ---
                clean_cluster_sizes_str = str({int(k): int(v) for k, v in cluster_sizes.items()})

                # Append summary to the CSV
                with open(summary_file, 'a', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow([experiment_name, model_name.split('/')[-1], red_method, clus_method,
                                    num_clusters, noise_points, clean_cluster_sizes_str]) # <-- Use the clean string here

                print(f"📊 Result: Found {num_clusters} clusters with {noise_points} noise points.")
                print(f"💾 Saved labels to {result_path} and summary to CSV.")
                print("-" * 50)

Loading cached embeddings from embeddings_cache/BAAI_bge-large-en-v1.5.npy...
Reducing dimensions using PCA to 50 components...

--- RUNNING EXPERIMENT: bge-large-en-v1.5__pca__kmeans ---
Performing clustering with KMEANS...
📊 Result: Found 8 clusters with 0 noise points.
💾 Saved labels to cluster_results/bge-large-en-v1.5__pca__kmeans_labels.npy and summary to CSV.
--------------------------------------------------

--- RUNNING EXPERIMENT: bge-large-en-v1.5__pca__dbscan ---
Performing clustering with DBSCAN...
📊 Result: Found 1 clusters with 166 noise points.
💾 Saved labels to cluster_results/bge-large-en-v1.5__pca__dbscan_labels.npy and summary to CSV.
--------------------------------------------------

--- RUNNING EXPERIMENT: bge-large-en-v1.5__pca__hdbscan ---
Performing clustering with HDBSCAN...




📊 Result: Found 5 clusters with 1297 noise points.
💾 Saved labels to cluster_results/bge-large-en-v1.5__pca__hdbscan_labels.npy and summary to CSV.
--------------------------------------------------
Reducing dimensions using UMAP to 10 components...


  warn(



--- RUNNING EXPERIMENT: bge-large-en-v1.5__umap__kmeans ---
Performing clustering with KMEANS...
📊 Result: Found 8 clusters with 0 noise points.
💾 Saved labels to cluster_results/bge-large-en-v1.5__umap__kmeans_labels.npy and summary to CSV.
--------------------------------------------------

--- RUNNING EXPERIMENT: bge-large-en-v1.5__umap__dbscan ---
Performing clustering with DBSCAN...
📊 Result: Found 7 clusters with 79 noise points.
💾 Saved labels to cluster_results/bge-large-en-v1.5__umap__dbscan_labels.npy and summary to CSV.
--------------------------------------------------

--- RUNNING EXPERIMENT: bge-large-en-v1.5__umap__hdbscan ---
Performing clustering with HDBSCAN...




📊 Result: Found 64 clusters with 964 noise points.
💾 Saved labels to cluster_results/bge-large-en-v1.5__umap__hdbscan_labels.npy and summary to CSV.
--------------------------------------------------
Loading cached embeddings from embeddings_cache/all-MiniLM-L6-v2.npy...
Reducing dimensions using PCA to 50 components...

--- RUNNING EXPERIMENT: all-MiniLM-L6-v2__pca__kmeans ---
Performing clustering with KMEANS...
📊 Result: Found 8 clusters with 0 noise points.
💾 Saved labels to cluster_results/all-MiniLM-L6-v2__pca__kmeans_labels.npy and summary to CSV.
--------------------------------------------------

--- RUNNING EXPERIMENT: all-MiniLM-L6-v2__pca__dbscan ---
Performing clustering with DBSCAN...
📊 Result: Found 15 clusters with 2428 noise points.
💾 Saved labels to cluster_results/all-MiniLM-L6-v2__pca__dbscan_labels.npy and summary to CSV.
--------------------------------------------------

--- RUNNING EXPERIMENT: all-MiniLM-L6-v2__pca__hdbscan ---
Performing clustering with HDBSCAN



📊 Result: Found 4 clusters with 610 noise points.
💾 Saved labels to cluster_results/all-MiniLM-L6-v2__pca__hdbscan_labels.npy and summary to CSV.
--------------------------------------------------
Reducing dimensions using UMAP to 10 components...


  warn(



--- RUNNING EXPERIMENT: all-MiniLM-L6-v2__umap__kmeans ---
Performing clustering with KMEANS...
📊 Result: Found 8 clusters with 0 noise points.
💾 Saved labels to cluster_results/all-MiniLM-L6-v2__umap__kmeans_labels.npy and summary to CSV.
--------------------------------------------------

--- RUNNING EXPERIMENT: all-MiniLM-L6-v2__umap__dbscan ---
Performing clustering with DBSCAN...
📊 Result: Found 8 clusters with 73 noise points.
💾 Saved labels to cluster_results/all-MiniLM-L6-v2__umap__dbscan_labels.npy and summary to CSV.
--------------------------------------------------

--- RUNNING EXPERIMENT: all-MiniLM-L6-v2__umap__hdbscan ---
Performing clustering with HDBSCAN...
📊 Result: Found 60 clusters with 867 noise points.
💾 Saved labels to cluster_results/all-MiniLM-L6-v2__umap__hdbscan_labels.npy and summary to CSV.
--------------------------------------------------




### Analysis - (Detailed analysis in seperate Notebook)

In [None]:
import pandas as pd
import numpy as np
import ast # To safely evaluate the string representation of the dictionary

# Load your summary file
summary_df = pd.read_csv('grid_search_summary.csv')

df_pain_points = pd.read_csv('reddit_gpt5_pain_points.csv')
total_points = len(df_pain_points)


# --- 2. Calculate Enhanced Metrics ---
cluster_size_stds = []
noise_ratios = []

for index, row in summary_df.iterrows():
    # --- Calculate Cluster Size Std Dev ---
    cluster_sizes_dict = ast.literal_eval(row['cluster_sizes'])
    real_cluster_sizes = [size for label, size in cluster_sizes_dict.items() if label != -1]
    if len(real_cluster_sizes) > 0:
        std_dev = np.std(real_cluster_sizes)
        cluster_size_stds.append(std_dev)
    else:
        cluster_size_stds.append(0)

    # --- Calculate Noise Ratio ---
    noise_points = row['noise_points']
    ratio = noise_points / total_points
    noise_ratios.append(ratio)

# Add the new metrics to your DataFrame
summary_df['cluster_size_std'] = cluster_size_stds
summary_df['noise_ratio'] = noise_ratios


# --- 3. Re-evaluate the Results with All Metrics ---
# Sort to find the best balance: high cluster count, low noise, and low std dev
final_sorted_summary = summary_df.sort_values(
    by=['num_clusters', 'noise_ratio', 'cluster_size_std'],
    ascending=[False, True, True]
)

print("--- Enhanced Summary with Cluster Balance and Noise Ratio ---")
# Display the most relevant columns for your decision
display(final_sorted_summary[[
    'experiment_name',
    'num_clusters',
    'noise_points',
    'noise_ratio',
    'cluster_size_std',
    'cluster_sizes'
]])

--- Enhanced Summary with Cluster Balance and Noise Ratio ---


Unnamed: 0,experiment_name,num_clusters,noise_points,noise_ratio,cluster_size_std,cluster_sizes
5,bge-large-en-v1.5__umap__hdbscan,64,964,0.372921,17.364704,"{-1: 964, 0: 10, 1: 25, 2: 11, 3: 14, 4: 110, ..."
11,all-MiniLM-L6-v2__umap__hdbscan,60,867,0.335397,25.911366,"{-1: 867, 0: 14, 1: 22, 2: 15, 3: 25, 4: 10, 5..."
7,all-MiniLM-L6-v2__pca__dbscan,15,2428,0.939265,9.756138,"{-1: 2428, 0: 21, 1: 44, 2: 9, 3: 7, 4: 8, 5: ..."
0,bge-large-en-v1.5__pca__kmeans,8,0,0.0,46.755849,"{0: 405, 1: 278, 2: 292, 3: 311, 4: 356, 5: 37..."
3,bge-large-en-v1.5__umap__kmeans,8,0,0.0,96.027909,"{0: 357, 1: 372, 2: 349, 3: 310, 4: 115, 5: 46..."
9,all-MiniLM-L6-v2__umap__kmeans,8,0,0.0,122.987233,"{0: 572, 1: 351, 2: 295, 3: 120, 4: 212, 5: 37..."
6,all-MiniLM-L6-v2__pca__kmeans,8,0,0.0,169.904707,"{0: 576, 1: 568, 2: 351, 3: 100, 4: 209, 5: 21..."
10,all-MiniLM-L6-v2__umap__dbscan,8,73,0.02824,806.217402,"{-1: 73, 0: 2447, 1: 14, 2: 7, 3: 6, 4: 6, 5: ..."
4,bge-large-en-v1.5__umap__dbscan,7,79,0.030561,850.402762,"{-1: 79, 0: 2441, 1: 25, 2: 10, 3: 9, 4: 9, 5:..."
2,bge-large-en-v1.5__pca__hdbscan,5,1297,0.501741,488.716523,"{-1: 1297, 0: 10, 1: 11, 2: 11, 3: 21, 4: 1235}"


- DBScan tends to give Mega Cluster Problem - either towards the noise cluster, or towards a single cluster
- Kmeans seems to be balanced, but again its configured to be so, without the noise..   
- all_miniLM_pca_hdbscan looks promising, but only has 4 clusters and a mega cluster at 3
- umap + hdbscan seems good on the noise ratio but has hih number of clusters.

### DeepDive into Best combination from experiments -- BAAI + Umap + HDBScan
experimenting with different min_cluster_size


In [None]:
# --- 1. Load the winning embeddings and reduce them ---
# Ensure your helper functions (get_embeddings, reduce_dimensions) are defined in the notebook

# The comments list should already be loaded from your previous work in the notebook
# comments = df_pain_points['comment_text'].tolist()

print("Loading data for the winning pipeline: 'bge-large-en-v1.5' + 'umap'")
embeddings = get_embeddings(comments, model_name='BAAI/bge-large-en-v1.5')
reduced_embeds = reduce_dimensions(embeddings, method='umap', n_components=10)

print(f"\n✅ Data is ready. Shape of reduced embeddings: {reduced_embeds.shape}")

Loading data for the winning pipeline: 'bge-large-en-v1.5' + 'umap'
Loading cached embeddings from embeddings_cache/BAAI_bge-large-en-v1.5.npy...
Reducing dimensions using UMAP to 10 components...


  warn(



✅ Data is ready. Shape of reduced embeddings: (2585, 10)


In [None]:
import csv
import os

# (Assuming all setup, helper functions, and data loading are done)

# --- The Final Tuning Loop with Logging ---

# 1. Define the tuning parameters and setup the log file
min_cluster_sizes_to_test = [20, 30, 40, 50]
tuning_summary_file = 'tuning_summary.csv'

# Create header for the summary file if it doesn't exist
if not os.path.exists(tuning_summary_file):
    with open(tuning_summary_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['min_cluster_size', 'num_clusters', 'noise_points'])

print("--- Starting Phase 2: Tuning HDBSCAN's min_cluster_size ---")
for size in min_cluster_sizes_to_test:
    print(f"\n--- Testing with min_cluster_size = {size} ---")

    labels = perform_clustering(
        reduced_embeds,
        method='hdbscan',
        min_cluster_size=size
    )

    # Analyze the result
    unique_labels, counts = np.unique(labels, return_counts=True)
    cluster_info = dict(zip(unique_labels, counts))
    num_clusters = len(unique_labels) - (1 if -1 in cluster_info else 0)
    noise_points = cluster_info.get(-1, 0)

    print(f"📊 Result: Found {num_clusters} clusters with {noise_points} noise points.")

    # --- NEW: Save the result to the CSV ---
    with open(tuning_summary_file, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([size, num_clusters, noise_points])

    print(f"💾 Logged result to {tuning_summary_file}")

--- Starting Phase 2: Tuning HDBSCAN's min_cluster_size ---

--- Testing with min_cluster_size = 20 ---
Performing clustering with HDBSCAN...




📊 Result: Found 29 clusters with 870 noise points.
💾 Logged result to tuning_summary.csv

--- Testing with min_cluster_size = 30 ---
Performing clustering with HDBSCAN...




📊 Result: Found 18 clusters with 887 noise points.
💾 Logged result to tuning_summary.csv

--- Testing with min_cluster_size = 40 ---
Performing clustering with HDBSCAN...




📊 Result: Found 14 clusters with 851 noise points.
💾 Logged result to tuning_summary.csv

--- Testing with min_cluster_size = 50 ---
Performing clustering with HDBSCAN...




📊 Result: Found 10 clusters with 682 noise points.
💾 Logged result to tuning_summary.csv


In [None]:
# --- Finalize the Clustering ---

# 1. Set your chosen parameter
final_min_cluster_size = 50

print(f"Finalizing clustering with min_cluster_size = {final_min_cluster_size}...")

# 2. Run the clustering one last time
final_labels = perform_clustering(
    reduced_embeds,
    method='hdbscan',
    min_cluster_size=final_min_cluster_size
)

# 3. Save the final labels and the labeled CSV
np.save('final_cluster_labels.npy', final_labels)

df_final_labeled = pd.read_csv('reddit_gpt5_pain_points.csv')
df_final_labeled['cluster'] = final_labels
df_final_labeled.to_csv('final_labeled_pain_points.csv', index=False)

print(f"\n✅ Success! Final labeled dataset is saved to 'final_labeled_pain_points.csv'.")


Finalizing clustering with min_cluster_size = 50...
Performing clustering with HDBSCAN...





✅ Success! Final labeled dataset is saved to 'final_labeled_pain_points.csv'.


## Topic Modelling

### BerTopic

In [None]:
# Make sure you have the libraries installed
!pip install bertopic sentence-transformers

import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# 1. Load your final labeled data
df = pd.read_csv('final_labeled_pain_points.csv')
docs = df['comment_text'].tolist()
labels = df['cluster'].tolist()


Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3


  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


In [None]:
import numpy as np

# Find unique labels
unique_labels = np.unique(labels)

# Print unique labels
print("Unique cluster labels:", unique_labels)

Unique cluster labels: [-1  0  1  2  3  4  5  6  7  8  9]


In [None]:

# 2. Initialize BERTopic
# We use CountVectorizer to get simple keywords as required by the assignment
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model, min_topic_size=5)

# 3. Fit the model using your pre-computed labels
# The 'y' parameter tells BERTopic to use our clusters instead of finding its own
print("Running BERTopic analysis on your 10 clusters...")
topics, probs = topic_model.fit_transform(docs, y=labels)

# 4. Display the results
print("\n--- BERTopic Results ---")
# This shows the keywords and details for each of your clusters
display(topic_model.get_topic_info())

# You can also get the top keywords for a specific cluster like this:
print("\nKeywords for Cluster 0:")
print(topic_model.get_topic(0))

Running BERTopic analysis on your 10 clusters...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- BERTopic Results ---


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,842,-1_like_gpt5_model_users,"[like, gpt5, model, users, models, just, ai, p...",[**IT KEEPS ON DOING WORSE THAN PAST MODELS - ...
1,0,200,0_4o_writing_bring_41,"[4o, writing, bring, 41, friend, just, like, 4...",[I really hope they go back on this decision. ...
2,1,113,1_voice_standard_mode_advanced,"[voice, standard, mode, advanced, avm, cove, s...",[Please bring back 4.1 and standard voice mode...
3,2,96,2_context_window_32k_128k,"[context, window, 32k, 128k, token, tokens, pl...",[will you ever give plus users a decent contex...
4,3,95,3_chatgpt_chat_want_years,"[chatgpt, chat, want, years, just, im, ive, li...",[I have been a ChatGPT plus user for around 2 ...
...,...,...,...,...,...
72,71,6,71_minecraft_games_ai_follow,"[minecraft, games, ai, follow, acess, delate, ...",[I have absolutely no idea what AI Minecraft i...
73,72,6,72_blot_intps_dufus_intp,"[blot, intps, dufus, intp, httpswwwnytimescom2...",[> Dufus\n\nRude\n\n> I will say it again\n\nT...
74,73,5,73_favoured_costumers_prenumeration_unfold,"[favoured, costumers, prenumeration, unfold, h...",[I'll give it a couple of weeks and if 40 is n...
75,74,5,74_risk_plugs_proprietary_risky,"[risk, plugs, proprietary, risky, anytime, cat...","[I mean, that’s the risk with any type of tech..."



Keywords for Cluster 0:
[('4o', np.float64(0.03898534300858078)), ('writing', np.float64(0.019013566480862366)), ('bring', np.float64(0.016622144660418003)), ('41', np.float64(0.016192087190156426)), ('friend', np.float64(0.01529729873039188)), ('just', np.float64(0.01367759111132369)), ('like', np.float64(0.013148414282750195)), ('45', np.float64(0.012238913120778206)), ('creative', np.float64(0.012015134443516429)), ('want', np.float64(0.010005754922573013))]


In [None]:
import pandas as pd
import numpy as np

# --- 1. Save the full, trained model object ---
# This saves the entire model, allowing you to load it later without retraining.
topic_model.save("bertopic_model", serialization="safetensors")
print("✅ Full BERTopic model saved to the 'bertopic_model' folder.")


# --- 2. Save the summary table of topics and keywords to a CSV ---
# This is the most important file for your analysis and report.
topic_summary_df = topic_model.get_topic_info()
topic_summary_df.to_csv("bertopic_summary.csv", index=False)
print("✅ Topic summary saved to 'bertopic_summary.csv'.")


# --- 3. Save the topic probabilities ---
# This saves the confidence score for each document's topic assignment.
np.save('bertopic_probabilities.npy', probs)
print("✅ Topic probabilities saved to 'bertopic_probabilities.npy'.")

✅ Full BERTopic model saved to the 'bertopic_model' folder.
✅ Topic summary saved to 'bertopic_summary.csv'.
✅ Topic probabilities saved to 'bertopic_probabilities.npy'.


In [None]:
print(topic_summary_df.shape)

(72, 5)


### Only TF-IDF on existing clusters

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Load your final labeled data
df = pd.read_csv('final_labeled_pain_points.csv')

print("--- Running Direct TF-IDF Analysis on Final Clusters ---")

# 2. Prepare the corpus
corpus = []
cluster_ids = sorted([c for c in df['cluster'].unique() if c != -1])
for cluster_id in cluster_ids:
    cluster_text = " ".join(df[df['cluster'] == cluster_id]['comment_text'].tolist())
    corpus.append(cluster_text)

print(f"Created a corpus of {len(corpus)} documents (one for each cluster).")

# 3. Run TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names_out()

# --- NEW: Setup to store results ---
topic_results = []

# 4. Extract and store the top keywords for each cluster
print("\n--- Top Keywords per Cluster ---")
for i, cluster_id in enumerate(cluster_ids):
    feature_scores = tfidf_matrix[i, :].toarray().flatten()
    top_indices = feature_scores.argsort()[-20:]
    top_keywords = [feature_names[j] for j in reversed(top_indices)]

    # Get the number of comments in the cluster
    num_comments = len(df[df['cluster'] == cluster_id])

    # Store the results
    topic_results.append({
        'cluster_id': cluster_id,
        'num_comments': num_comments,
        'top_keywords': ", ".join(top_keywords)
    })

    print(f"\nCluster {cluster_id} ({num_comments} comments): {', '.join(top_keywords)}")

# --- NEW: Save the results to a DataFrame and CSV ---
results_df = pd.DataFrame(topic_results)
results_df.to_csv('tfidf_topic_summary.csv', index=False)

print("\n\n✅ Success! Topic keywords have been saved to 'tfidf_topic_summary.csv'")
display(results_df)

--- Running Direct TF-IDF Analysis on Final Clusters ---
Created a corpus of 10 documents (one for each cluster).

--- Top Keywords per Cluster ---

Cluster 0 (110 comments): voice, standard voice, standard, voice mode, mode, avm, advanced, advanced voice, like, just, don, cove, use, 4o, gpt, sound, users, model, feels, user

Cluster 1 (86 comments): gpt, image, model, gpt5, instructions, instruction, 4o, worse, output, promising, just, review, error, following, code, gpt 4o, ignore, defined, correctly, files

Cluster 2 (928 comments): 4o, gpt, model, models, like, plus, just, users, o3, use, bring, thinking, don, mini, people, plus users, want, writing, pro, router

Cluster 3 (69 comments): gpt, model, openai, 4o, models, users, o3, just, like, gpt 4o, o4, mini, people, o4 mini, legacy, user, ai, trust, ve, access

Cluster 4 (66 comments): ama, comments, comment, just, like, people, don, questions, response, really, sama, guys, know, answer, way, did, going, question, time, think

Clu

Unnamed: 0,cluster_id,num_comments,top_keywords
0,0,110,"voice, standard voice, standard, voice mode, m..."
1,1,86,"gpt, image, model, gpt5, instructions, instruc..."
2,2,928,"4o, gpt, model, models, like, plus, just, user..."
3,3,69,"gpt, model, openai, 4o, models, users, o3, jus..."
4,4,66,"ama, comments, comment, just, like, people, do..."
5,5,225,"context, window, context window, chatgpt, 32k,..."
6,6,76,"chatgpt, like, 4o, just, chat, gpt, want, peop..."
7,7,94,"gpt, 4o, gpt 4o, emotional, model, users, like..."
8,8,69,"sexual, people, like, just, harmful, content, ..."
9,9,180,"ai, people, like, openai, just, safety, don, u..."


### ReRanking


In [None]:
!pip install --upgrade bertopic



In [None]:
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer

# 1. Load your final labeled data
df = pd.read_csv('final_labeled_pain_points.csv')
print(f"✅ Loaded {len(df)} comments across {df['cluster'].nunique()} clusters.")

# 2. Setup a list to store the results
final_topic_summary = []

# Get the IDs of your actual clusters (ignoring noise)
cluster_ids = sorted([c for c in df['cluster'].unique() if c != -1])

print("\n--- Analyzing Each Cluster Individually with Re-ranking---")
# 3. Loop through each cluster
for cluster_id in cluster_ids:
    # Get the comments for just this one cluster
    single_cluster_docs = df[df['cluster'] == cluster_id]['comment_text'].tolist()

    print(f"\nProcessing Cluster {cluster_id} ({len(single_cluster_docs)} comments)...")

    # --- Initialize BERTopic with the Re-ranker ---
    representation_model = MaximalMarginalRelevance(diversity=0.3)

    topic_model = BERTopic(
        representation_model=representation_model,
        vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(1, 2)),
        min_topic_size=5
    )

    # Fit the model on ONLY this cluster's comments
    topics, probs = topic_model.fit_transform(single_cluster_docs)

    # Get the most prominent topic found (usually Topic 0)
    top_topic_keywords = [word for word, score in topic_model.get_topic(0)]

    # Store the result
    final_topic_summary.append({
        'cluster_id': cluster_id,
        'num_comments': len(single_cluster_docs),
        'reranked_keywords': ", ".join(top_topic_keywords)
    })

# 4. Create a final DataFrame and save it
final_summary_df = pd.DataFrame(final_topic_summary)
final_summary_df.to_csv("final_bertopic_reranked_summary.csv", index=False)

print("\n\n✅ Success! Final re-ranked topic summary saved to 'final_bertopic_reranked_summary.csv'")
display(final_summary_df)

✅ Loaded 2585 comments across 11 clusters.

--- Analyzing Each Cluster Individually with Re-ranking---

Processing Cluster 0 (110 comments)...

Processing Cluster 1 (86 comments)...

Processing Cluster 2 (928 comments)...

Processing Cluster 3 (69 comments)...

Processing Cluster 4 (66 comments)...

Processing Cluster 5 (225 comments)...

Processing Cluster 6 (76 comments)...

Processing Cluster 7 (94 comments)...

Processing Cluster 8 (69 comments)...

Processing Cluster 9 (180 comments)...


✅ Success! Final re-ranked topic summary saved to 'final_bertopic_reranked_summary.csv'


Unnamed: 0,cluster_id,num_comments,reranked_keywords
0,0,110,"voice, mode, voice mode, standard voice, advan..."
1,1,86,"image gen, generation, sora, image generation,..."
2,2,928,"models, models settings, legacy models, old mo..."
3,3,69,"openai, models, benchmarks, does openai, opena..."
4,4,66,"ama, comments, sam, answer questions, jits, an..."
5,5,225,"window, context window, 32k, use, plus users, ..."
6,6,76,"chatgpt, models, use, gpt5, work, chat, feels ..."
7,7,94,"4o, gpt4o, model, gpt5, emotional, support, gp..."
8,8,69,"chatgpt, filter, content, adults, users, harmf..."
9,9,180,"openai, ai, safety, users, models, trust, open..."


### T-Flan based topic Modelling


In [None]:
# Delete the pipeline object
del generator

# Import torch and clear the GPU cache
import torch
torch.cuda.empty_cache()

print("GPU memory has been cleared.")

NameError: name 'generator' is not defined

In [None]:
# ---------------- 0. Imports ----------------
import pandas as pd
import torch, re
from transformers import pipeline

# ---------------- 1. Data ----------------
df = pd.read_csv("final_bertopic_reranked_summary.csv")

# ---------------- 2. Helper ----------------
def clean_keywords(raw: str, max_kw: int = 12) -> str:
    """
    • removes exact duplicates while keeping order
    • trims to first *max_kw* tokens (saves prompt space)
    """
    uniq = list(dict.fromkeys([w.strip() for w in raw.split(",") if w.strip()]))
    return ", ".join(uniq[:max_kw])

# ---------------- 3. Model ----------------
print("Loading Flan-T5-Large …")
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",    # fits in ≤4 GB VRAM
    device_map="auto",               # puts weights on GPU if available
    torch_dtype=torch.float16,       # halves memory footprint
    # decoding parameters that avoid loops
    num_beams=4,
    no_repeat_ngram_size=3,
    repetition_penalty=0.8,
    # temperature=0.7,
)
print("Model loaded.\n")

# ---------------- 4. Generation loop ----------------
flan_results = []
print("--- Generating topic labels ---")
for _, row in df.iterrows():
    cluster_id = row["cluster_id"]
    keywords    = clean_keywords(row["reranked_keywords"])

    prompt = (
        "Give ONE short topic label.\n\n"
        f"Keywords: {keywords}\n\n"
        "Label:"
    )
    print(f"Jeywords : {keywords}")
    out = generator(prompt, max_new_tokens=8)[0]["generated_text"].strip()

    flan_results.append(
        {"cluster_id": cluster_id,
         "flan_t5_topic_label": out,
         "source_keywords": keywords}
    )
    print(f"{cluster_id:>2}: {out}")

# ---------------- 5. Save ----------------
flan_df = pd.DataFrame(flan_results)
flan_df.to_csv("flan_t5_reranked_keyword_summary.csv", index=False)
print("\nSaved → flan_t5_reranked_keyword_summary.csv")


Loading Flan-T5-Large …


Device set to use cuda:0


Model loaded.

--- Generating topic labels ---
Jeywords : voice, mode, voice mode, standard voice, advanced voice, avm, different, chatgpt, audio, gpt
 0: avm gpt
Jeywords : image gen, generation, sora, image generation, video, gpt4o, censorship, create, 2d, create image
 1: gpt4o -
Jeywords : models, models settings, legacy models, old models, model picker, anymore, default, remove models, option, choose model
 2: choose model from the model picker to
Jeywords : openai, models, benchmarks, does openai, openais, transparency, openai chosen, time awareness, like o5, asking mars
 3: openai -lrb
Jeywords : ama, comments, sam, answer questions, jits, answered, links, limit thats, profiles, profiles comment
 4: sam jones ama
Jeywords : window, context window, 32k, use, plus users, chatgpt, conversation, memory, tokens, api
 5: chatgpt uses 32k of
Jeywords : chatgpt, models, use, gpt5, work, chat, feels like, pro, personality, friend
 6: chatgpt is a free
Jeywords : 4o, gpt4o, model, gpt5, e

### LLM Based Topic Modelling

In [None]:
# !pip install tiktoken

import tiktoken

# Load all your comments into a single string
all_comments_text = " ".join(df['comment_text'].tolist())

# Use a standard tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")
tokens = tokenizer.encode(all_comments_text)

token_count = len(tokens)
print(f"The total token count for your dataset is approximately: {token_count:,}")
# Total tokens in our entire dataset - Not per cluster..

The total token count for your dataset is approximately: 221,975


In [None]:
! pip install langchain langchain-groq

Collecting langchain-groq
  Downloading langchain_groq-0.3.7-py3-none-any.whl.metadata (2.6 kB)
Collecting groq<1,>=0.30.0 (from langchain-groq)
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Downloading langchain_groq-0.3.7-py3-none-any.whl (16 kB)
Downloading groq-0.31.0-py3-none-any.whl (131 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.4/131.4 kB[0m [31m859.8 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq, langchain-groq
Successfully installed groq-0.31.0 langchain-groq-0.3.7


### Topic Modelling Using Gemma 2 (9B Params)

In [None]:

import pandas as pd
import os
import time  # NEW: Import the time module
from langchain_groq import ChatGroq
from google.colab import userdata
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import random

# --- 1. Setup ---
df = pd.read_csv('final_labeled_pain_points.csv')
llm = ChatGroq(model_name="gemma2-9b-it", groq_api_key=userdata.get("GROQ_API_KEY"))

# --- 2. Create a Standardized Chain ---
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert at analyzing user feedback. Your goal is to find the core pain point from a set of comments and provide a concise, descriptive topic label of 5 words or less."),
    ("human", "Here are the user complaints:\n{comment_sample}\n\nConcise Topic Label:")
])
parser = StrOutputParser()
chain = prompt | llm | parser

# --- 3. Loop, Sample, and Generate Topics ---
llm_results = []
print("\n--- Generating Topic Labels with LangChain & Groq ---")

cluster_ids_to_process = sorted([c for c in df['cluster'].unique() if c != -1])
num_clusters = len(cluster_ids_to_process)

for i, cluster_id in enumerate(cluster_ids_to_process):
    cluster_comments = df[df['cluster'] == cluster_id]['comment_text'].tolist()

    # Take a random sample of 20 comments out of approximately 200+ in each cluster  ---
    sample_size = 20
    # Ensure sample size is not larger than the number of comments in the cluster
    k = min(sample_size, len(cluster_comments))
    sample = random.sample(cluster_comments, k=k)

    formatted_sample = "\n".join([f"- {comment}" for comment in sample])

    print(f"Analyzing Cluster {cluster_id} ({i+1}/{num_clusters}) with a random sample of {k} comments...")
    topic_label = chain.invoke({"comment_sample": formatted_sample})

    llm_results.append({
        'cluster_id': cluster_id,
        'llm_topic_label': topic_label.replace('"', ''),
        'num_comments': len(cluster_comments)
    })
    print(f"  -> Topic: {topic_label}")

    if i < num_clusters - 1:
        print("\nWaiting for 60 seconds...")
        time.sleep(60)

# --- 4. Save the Final LLM Topics ---
llm_df = pd.DataFrame(llm_results)
llm_df.to_csv('llm_topic_summary.csv', index=False)
display(llm_df)


--- Generating Topic Labels with LangChain & Groq ---
Analyzing Cluster 0 (1/10) with a random sample of 20 comments...
  -> Topic: User Preference for Standard Voice 


Waiting for 60 seconds...
Analyzing Cluster 1 (2/10) with a random sample of 20 comments...
  -> Topic: GPT-5 Performance Issues 


Waiting for 60 seconds...
Analyzing Cluster 2 (3/10) with a random sample of 20 comments...
  -> Topic: **Bring Back Legacy Models** 


Waiting for 60 seconds...
Analyzing Cluster 3 (4/10) with a random sample of 20 comments...
  -> Topic: **Loss of User Choice & Legacy Models** 


Waiting for 60 seconds...
Analyzing Cluster 4 (5/10) with a random sample of 20 comments...
  -> Topic: **User distrust and criticism** 


Waiting for 60 seconds...
Analyzing Cluster 5 (6/10) with a random sample of 20 comments...
  -> Topic: **Limited Context Window** 


Waiting for 60 seconds...
Analyzing Cluster 6 (7/10) with a random sample of 20 comments...
  -> Topic: **Bring Back ChatGPT 4.0** 


Waiting

Unnamed: 0,cluster_id,llm_topic_label,num_comments
0,0,User Preference for Standard Voice \n,110
1,1,GPT-5 Performance Issues \n,86
2,2,**Bring Back Legacy Models** \n,928
3,3,**Loss of User Choice & Legacy Models** \n,69
4,4,**User distrust and criticism** \n,66
5,5,**Limited Context Window** \n,225
6,6,**Bring Back ChatGPT 4.0** \n,76
7,7,**Loss of emotional connection and usability**...,94
8,8,**Overly Restrictive Content Filtering** \n,69
9,9,**User Frustration with AI Restrictions** \n,180


In [None]:
import pandas as pd
import os
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from google.colab import userdata
# --- 1. Setup ---
# Load your TF-IDF keyword summary
try:
    df = pd.read_csv('tfidf_topic_summary.csv')
    print("✅ Loaded TF-IDF keyword summary.")
except FileNotFoundError:
    print("❌ Error: 'tfidf_topic_summary.csv' not found.")
    df = None

if df is not None:
    # Initialize the Groq client via LangChain
    llm = ChatGroq(model_name="gemma2-9b-it", groq_api_key=userdata.get("GROQ_API_KEY"))

    # --- 2. Create a Standardized Chain ---
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert at analyzing user feedback. Your goal is to synthesize a list of keywords into a concise, descriptive topic label of 5 words or less."),
        ("human", "Here are the keywords for a topic:\n{keywords}\n\nConcise Topic Label:")
    ])
    parser = StrOutputParser()
    chain = prompt | llm | parser

    # --- 3. Loop through keywords and generate topic labels ---
    groq_results = []
    print("\n--- Generating Topic Labels from Keywords with Groq ---")

    for index, row in df.iterrows():
        cluster_id = row['cluster_id']
        keywords = row['top_keywords']

        print(f"Analyzing Cluster {cluster_id}...")
        # Invoke the chain with the keywords
        topic_label = chain.invoke({"keywords": keywords})

        groq_results.append({
            'cluster_id': cluster_id,
            'groq_topic_label': topic_label.replace('"', ''),
            'source_keywords': keywords
        })
        print(f"  -> Topic: {topic_label}")

    # --- 4. Save the Final Groq Topics ---
    groq_df = pd.DataFrame(groq_results)
    groq_df.to_csv('gemma2_keyword_summary.csv', index=False)

    print("\n\n✅ Success! Groq topic labels from keywords saved to 'groq_keyword_summary.csv'")
    display(groq_df)

✅ Loaded TF-IDF keyword summary.

--- Generating Topic Labels from Keywords with Groq ---
Analyzing Cluster 0...
  -> Topic: Advanced Voice Mode Features 

Analyzing Cluster 1...
  -> Topic: GPT-5 Image Model Performance 

Analyzing Cluster 2...
  -> Topic: GPT Model User Experiences 

Analyzing Cluster 3...
  -> Topic: OpenAI GPT-4 Users & Feedback 

Analyzing Cluster 4...
  -> Topic: User Feedback & Discussion 

Analyzing Cluster 5...
  -> Topic: ChatGPT Context Window Size 

Analyzing Cluster 6...
  -> Topic: People Want ChatGPT-Like Models 

Analyzing Cluster 7...
  -> Topic: GPT-4O User Emotional Response 

Analyzing Cluster 8...
  -> Topic: Harmful Sexual Content Filtering 

Analyzing Cluster 9...
  -> Topic: AI Safety and User Trust 



✅ Success! Groq topic labels from keywords saved to 'groq_keyword_summary.csv'


Unnamed: 0,cluster_id,groq_topic_label,source_keywords
0,0,Advanced Voice Mode Features \n,"voice, standard voice, standard, voice mode, m..."
1,1,GPT-5 Image Model Performance \n,"gpt, image, model, gpt5, instructions, instruc..."
2,2,GPT Model User Experiences \n,"4o, gpt, model, models, like, plus, just, user..."
3,3,OpenAI GPT-4 Users & Feedback \n,"gpt, model, openai, 4o, models, users, o3, jus..."
4,4,User Feedback & Discussion \n,"ama, comments, comment, just, like, people, do..."
5,5,ChatGPT Context Window Size \n,"context, window, context window, chatgpt, 32k,..."
6,6,People Want ChatGPT-Like Models \n,"chatgpt, like, 4o, just, chat, gpt, want, peop..."
7,7,GPT-4O User Emotional Response \n,"gpt, 4o, gpt 4o, emotional, model, users, like..."
8,8,Harmful Sexual Content Filtering \n,"sexual, people, like, just, harmful, content, ..."
9,9,AI Safety and User Trust \n,"ai, people, like, openai, just, safety, don, u..."


### Topic Modelling using GPT-OSS (120B params)

In [None]:

# --- 1. Setup ---
df = pd.read_csv('final_labeled_pain_points.csv')
llm = ChatGroq(model_name="openai/gpt-oss-120b", groq_api_key=userdata.get("GROQ_API_KEY"))

# --- 2. Create a Standardized Chain ---
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert at analyzing user feedback. Your goal is to find the core pain point from a set of comments and provide a concise, descriptive topic label of 5 words or less."),
    ("human", "Here are the user complaints:\n{comment_sample}\n\nConcise Topic Label:")
])
parser = StrOutputParser()
chain = prompt | llm | parser

# --- 3. Loop, Sample, and Generate Topics ---
llm_results = []
print("\n--- Generating Topic Labels with LangChain & Groq ---")

cluster_ids_to_process = sorted([c for c in df['cluster'].unique() if c != -1])
num_clusters = len(cluster_ids_to_process)

for i, cluster_id in enumerate(cluster_ids_to_process):
    cluster_comments = df[df['cluster'] == cluster_id]['comment_text'].tolist()

    # --- Sample of size 50 for gpt-oss with a longer context window than gemma ---
    sample_size = 20
    # Ensure sample size is not larger than the number of comments in the cluster
    k = min(sample_size, len(cluster_comments))
    sample = random.sample(cluster_comments, k=k)

    formatted_sample = "\n".join([f"- {comment}" for comment in sample])

    print(f"Analyzing Cluster {cluster_id} ({i+1}/{num_clusters}) with a random sample of {k} comments...")
    topic_label = chain.invoke({"comment_sample": formatted_sample})

    llm_results.append({
        'cluster_id': cluster_id,
        'llm_topic_label': topic_label.replace('"', ''),
        'num_comments': len(cluster_comments)
    })
    print(f"  -> Topic: {topic_label}")

    if i < num_clusters - 1:
        print("\nWaiting for 60 seconds...")
        time.sleep(60)

# --- 4. Save the Final LLM Topics ---
llm_df = pd.DataFrame(llm_results)
llm_df.to_csv('gpt-oss_topic_summary.csv', index=False)
display(llm_df)


--- Generating Topic Labels with LangChain & Groq ---
Analyzing Cluster 0 (1/10) with a random sample of 20 comments...
  -> Topic: Loss of Preferred Standard Voice Mode

Waiting for 60 seconds...
Analyzing Cluster 1 (2/10) with a random sample of 20 comments...
  -> Topic: Unmet expectations and degraded performance

Waiting for 60 seconds...
Analyzing Cluster 2 (3/10) with a random sample of 20 comments...
  -> Topic: Restore GPT‑4o model option

Waiting for 60 seconds...
Analyzing Cluster 3 (4/10) with a random sample of 20 comments...
  -> Topic: Removed models break workflow continuity

Waiting for 60 seconds...
Analyzing Cluster 4 (5/10) with a random sample of 20 comments...
  -> Topic: Ignored feedback and perceived censorship

Waiting for 60 seconds...
Analyzing Cluster 5 (6/10) with a random sample of 20 comments...
  -> Topic: Limited context window size

Waiting for 60 seconds...
Analyzing Cluster 6 (7/10) with a random sample of 20 comments...
  -> Topic: Missing legacy m

Unnamed: 0,cluster_id,llm_topic_label,num_comments
0,0,Loss of Preferred Standard Voice Mode,110
1,1,Unmet expectations and degraded performance,86
2,2,Restore GPT‑4o model option,928
3,3,Removed models break workflow continuity,69
4,4,Ignored feedback and perceived censorship,66
5,5,Limited context window size,225
6,6,Missing legacy model options,76
7,7,Forced Model Switch Removes Empathy,94
8,8,Excessive censorship hindering legitimate use,69
9,9,**Trust and transparency issues**,180


In [None]:
import pandas as pd
import os
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from google.colab import userdata
# --- 1. Setup ---
# Load your TF-IDF keyword summary
try:
    df = pd.read_csv('final_bertopic_reranked_summary.csv')
    print("✅ Loaded TF-IDF keyword summary.")
except FileNotFoundError:
    print("❌ Error: 'tfidf_topic_summary.csv' not found.")
    df = None

if df is not None:
    # Initialize the Groq client via LangChain
    llm = ChatGroq(model_name="openai/gpt-oss-120b", groq_api_key=userdata.get("GROQ_API_KEY"))

    # --- 2. Create a Standardized Chain ---
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert at analyzing user feedback. Your goal is to synthesize a list of keywords into a concise, descriptive topic label of 5 words or less."),
        ("human", "Here are the keywords for a topic:\n{keywords}\n\nConcise Topic Label:")
    ])
    parser = StrOutputParser()
    chain = prompt | llm | parser

    # --- 3. Loop through keywords and generate topic labels ---
    groq_results = []
    print("\n--- Generating Topic Labels from Keywords with Groq ---")

    for index, row in df.iterrows():
        cluster_id = row['cluster_id']
        keywords = row['reranked_keywords']

        print(f"Analyzing Cluster {cluster_id}...")
        # Invoke the chain with the keywords
        topic_label = chain.invoke({"keywords": keywords})

        groq_results.append({
            'cluster_id': cluster_id,
            'groq_topic_label': topic_label.replace('"', ''),
            'source_keywords': keywords
        })
        print(f"  -> Topic: {topic_label}")

    # --- 4. Save the Final Groq Topics ---
    groq_df = pd.DataFrame(groq_results)
    groq_df.to_csv('gpt-oss_rr_keyword_summary.csv', index=False)

    print("\n\n✅ Success! Groq topic labels from keywords saved to 'groq_keyword_summary.csv'")
    display(groq_df)

✅ Loaded TF-IDF keyword summary.

--- Generating Topic Labels from Keywords with Groq ---
Analyzing Cluster 0...
  -> Topic: ChatGPT Voice Mode Options
Analyzing Cluster 1...
  -> Topic: Sora Image and Video Generation
Analyzing Cluster 2...
  -> Topic: **Model Management and Selection**
Analyzing Cluster 3...
  -> Topic: OpenAI Model Transparency Benchmarks
Analyzing Cluster 4...
  -> Topic: AMA Answers and Profile Comments
Analyzing Cluster 5...
  -> Topic: ChatGPT Plus 32k Context
Analyzing Cluster 6...
  -> Topic: ChatGPT as Professional Companion
Analyzing Cluster 7...
  -> Topic: Choosing GPT‑4o for Emotional Support
Analyzing Cluster 8...
  -> Topic: ChatGPT Harmful Content Filtering
Analyzing Cluster 9...
  -> Topic: OpenAI AI Safety & Trust


✅ Success! Groq topic labels from keywords saved to 'groq_keyword_summary.csv'


Unnamed: 0,cluster_id,groq_topic_label,source_keywords
0,0,ChatGPT Voice Mode Options,"voice, mode, voice mode, standard voice, advan..."
1,1,Sora Image and Video Generation,"image gen, generation, sora, image generation,..."
2,2,**Model Management and Selection**,"models, models settings, legacy models, old mo..."
3,3,OpenAI Model Transparency Benchmarks,"openai, models, benchmarks, does openai, opena..."
4,4,AMA Answers and Profile Comments,"ama, comments, sam, answer questions, jits, an..."
5,5,ChatGPT Plus 32k Context,"window, context window, 32k, use, plus users, ..."
6,6,ChatGPT as Professional Companion,"chatgpt, models, use, gpt5, work, chat, feels ..."
7,7,Choosing GPT‑4o for Emotional Support,"4o, gpt4o, model, gpt5, emotional, support, gp..."
8,8,ChatGPT Harmful Content Filtering,"chatgpt, filter, content, adults, users, harmf..."
9,9,OpenAI AI Safety & Trust,"openai, ai, safety, users, models, trust, open..."


### Topic Modelling Using Kimi K2


In [None]:
import pandas as pd
import os
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from google.colab import userdata
# --- 1. Setup ---
# Load your TF-IDF keyword summary
try:
    df = pd.read_csv('final_bertopic_reranked_summary.csv')
    print("✅ Loaded TF-IDF keyword summary.")
except FileNotFoundError:
    print("❌ Error: 'tfidf_topic_summary.csv' not found.")
    df = None

if df is not None:
    # Initialize the Groq client via LangChain
    llm = ChatGroq(model_name="moonshotai/kimi-k2-instruct", groq_api_key=userdata.get("GROQ_API_KEY"))

    # --- 2. Create a Standardized Chain ---
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert at analyzing user feedback. Your goal is to synthesize a list of keywords into a concise, descriptive topic label of 5 words or less."),
        ("human", "Here are the keywords for a topic:\n{keywords}\n\nConcise Topic Label:")
    ])
    parser = StrOutputParser()
    chain = prompt | llm | parser

    # --- 3. Loop through keywords and generate topic labels ---
    groq_results = []
    print("\n--- Generating Topic Labels from Keywords with Groq ---")

    for index, row in df.iterrows():
        cluster_id = row['cluster_id']
        keywords = row['reranked_keywords']

        print(f"Analyzing Cluster {cluster_id}...")
        # Invoke the chain with the keywords
        topic_label = chain.invoke({"keywords": keywords})

        groq_results.append({
            'cluster_id': cluster_id,
            'groq_topic_label': topic_label.replace('"', ''),
            'source_keywords': keywords
        })
        print(f"  -> Topic: {topic_label}")

    # --- 4. Save the Final Groq Topics ---
    groq_df = pd.DataFrame(groq_results)
    groq_df.to_csv('kimik2_keyword_summary_rr.csv', index=False)

    print("\n\n✅ Success! Groq topic labels from keywords saved to 'groq_keyword_summary.csv'")
    display(groq_df)

✅ Loaded TF-IDF keyword summary.

--- Generating Topic Labels from Keywords with Groq ---
Analyzing Cluster 0...
  -> Topic: Voice Mode Options
Analyzing Cluster 1...
  -> Topic: AI Image & Video Creation
Analyzing Cluster 2...
  -> Topic: Model picker removal
Analyzing Cluster 3...
  -> Topic: OpenAI Model Transparency
Analyzing Cluster 4...
  -> Topic: AMA Profiles & Comments
Analyzing Cluster 5...
  -> Topic: ChatGPT 32K Context Window
Analyzing Cluster 6...
  -> Topic: Pro ChatGPT Experience
Analyzing Cluster 7...
  -> Topic: Emotional GPT-4o Experience
Analyzing Cluster 8...
  -> Topic: Adult Content Filtering
Analyzing Cluster 9...
  -> Topic: OpenAI User Trust


✅ Success! Groq topic labels from keywords saved to 'groq_keyword_summary.csv'


Unnamed: 0,cluster_id,groq_topic_label,source_keywords
0,0,Voice Mode Options,"voice, mode, voice mode, standard voice, advan..."
1,1,AI Image & Video Creation,"image gen, generation, sora, image generation,..."
2,2,Model picker removal,"models, models settings, legacy models, old mo..."
3,3,OpenAI Model Transparency,"openai, models, benchmarks, does openai, opena..."
4,4,AMA Profiles & Comments,"ama, comments, sam, answer questions, jits, an..."
5,5,ChatGPT 32K Context Window,"window, context window, 32k, use, plus users, ..."
6,6,Pro ChatGPT Experience,"chatgpt, models, use, gpt5, work, chat, feels ..."
7,7,Emotional GPT-4o Experience,"4o, gpt4o, model, gpt5, emotional, support, gp..."
8,8,Adult Content Filtering,"chatgpt, filter, content, adults, users, harmf..."
9,9,OpenAI User Trust,"openai, ai, safety, users, models, trust, open..."


### Topic Modelling Using LLama 4 Maverick

In [None]:
import pandas as pd
import os
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from google.colab import userdata
# --- 1. Setup ---
# Load your TF-IDF keyword summary
try:
    df = pd.read_csv('final_bertopic_reranked_summary.csv')
    print("✅ Loaded TF-IDF keyword summary.")
except FileNotFoundError:
    print("❌ Error: 'tfidf_topic_summary.csv' not found.")
    df = None

if df is not None:
    # Initialize the Groq client via LangChain
    llm = ChatGroq(model_name="meta-llama/llama-4-maverick-17b-128e-instruct", groq_api_key=userdata.get("GROQ_API_KEY"))

    # --- 2. Create a Standardized Chain ---
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert at analyzing user feedback. Your goal is to synthesize a list of keywords into a concise, descriptive topic label of 5 words or less."),
        ("human", "Here are the keywords for a topic:\n{keywords}\n\nConcise Topic Label:")
    ])
    parser = StrOutputParser()
    chain = prompt | llm | parser

    # --- 3. Loop through keywords and generate topic labels ---
    groq_results = []
    print("\n--- Generating Topic Labels from Keywords with Groq ---")

    for index, row in df.iterrows():
        cluster_id = row['cluster_id']
        keywords = row['reranked_keywords']

        print(f"Analyzing Cluster {cluster_id}...")
        # Invoke the chain with the keywords
        topic_label = chain.invoke({"keywords": keywords})

        groq_results.append({
            'cluster_id': cluster_id,
            'groq_topic_label': topic_label.replace('"', ''),
            'source_keywords': keywords
        })
        print(f"  -> Topic: {topic_label}")

    # --- 4. Save the Final Groq Topics ---
    groq_df = pd.DataFrame(groq_results)
    groq_df.to_csv('llama4Maverick_keyword_summary_rr.csv', index=False)

    print("\n\n✅ Success! Groq topic labels from keywords saved to 'groq_keyword_summary.csv'")
    display(groq_df)

✅ Loaded TF-IDF keyword summary.

--- Generating Topic Labels from Keywords with Groq ---
Analyzing Cluster 0...
  -> Topic: "Voice Modes in ChatGPT" 

Alternatively, other options could be: 
- "ChatGPT Voice Options"
- "Advanced Voice Mode"
- "GPT Voice Modes"

But "Voice Modes in ChatGPT" seems the most concise and descriptive.
Analyzing Cluster 1...
  -> Topic: AI Image and Video Generation
Analyzing Cluster 2...
  -> Topic: Model Selection and Management Options
Analyzing Cluster 3...
  -> Topic: "OpenAI Models and Transparency Issues" 

Let me simplify it further to meet the 5-word limit: "OpenAI Transparency Concerns"
Analyzing Cluster 4...
  -> Topic: Commenting on Profiles and Links
Analyzing Cluster 5...
  -> Topic: ChatGPT Context Window Expansion
Analyzing Cluster 6...
  -> Topic: "Conversing with Advanced AI Models" 

Alternatively, a 5-word or less label could be: "AI Chat Models in Action" or simply "Advanced AI Chat Models". 

However, the most concise 5-word or less lab

Unnamed: 0,cluster_id,groq_topic_label,source_keywords
0,0,"Voice Modes in ChatGPT \n\nAlternatively, othe...","voice, mode, voice mode, standard voice, advan..."
1,1,AI Image and Video Generation,"image gen, generation, sora, image generation,..."
2,2,Model Selection and Management Options,"models, models settings, legacy models, old mo..."
3,3,OpenAI Models and Transparency Issues \n\nLet ...,"openai, models, benchmarks, does openai, opena..."
4,4,Commenting on Profiles and Links,"ama, comments, sam, answer questions, jits, an..."
5,5,ChatGPT Context Window Expansion,"window, context window, 32k, use, plus users, ..."
6,6,Conversing with Advanced AI Models \n\nAlterna...,"chatgpt, models, use, gpt5, work, chat, feels ..."
7,7,Emotional Support from AI Models,"4o, gpt4o, model, gpt5, emotional, support, gp..."
8,8,Content Filtering and Censorship Issues,"chatgpt, filter, content, adults, users, harmf..."
9,9,AI Safety and Trust Issues,"openai, ai, safety, users, models, trust, open..."


In [None]:
import pandas as pd
import os
import time  # NEW: Import the time module
from langchain_groq import ChatGroq
from google.colab import userdata
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import random
# --- 1. Setup ---
df = pd.read_csv('final_labeled_pain_points.csv')
llm = ChatGroq(model_name="meta-llama/llama-4-maverick-17b-128e-instruct", groq_api_key=userdata.get("GROQ_API_KEY"))

# --- 2. Create a Standardized Chain ---
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert at analyzing user feedback. Your goal is to find the core pain point from a set of comments and provide a concise, descriptive topic label of 5 words or less."),
    ("human", "Here are the user complaints:\n{comment_sample}\n\nConcise Topic Label:")
])
parser = StrOutputParser()
chain = prompt | llm | parser

# --- 3. Loop, Sample, and Generate Topics ---
llm_results = []
print("\n--- Generating Topic Labels with LangChain & Groq ---")

cluster_ids_to_process = sorted([c for c in df['cluster'].unique() if c != -1])
num_clusters = len(cluster_ids_to_process)

for i, cluster_id in enumerate(cluster_ids_to_process):
    cluster_comments = df[df['cluster'] == cluster_id]['comment_text'].tolist()

    # --- Sample of size 50 for gpt-oss with a longer context window than gemma ---
    sample_size = 20
    # Ensure sample size is not larger than the number of comments in the cluster
    k = min(sample_size, len(cluster_comments))
    sample = random.sample(cluster_comments, k=k)

    formatted_sample = "\n".join([f"- {comment}" for comment in sample])

    print(f"Analyzing Cluster {cluster_id} ({i+1}/{num_clusters}) with a random sample of {k} comments...")
    topic_label = chain.invoke({"comment_sample": formatted_sample})

    llm_results.append({
        'cluster_id': cluster_id,
        'llm_topic_label': topic_label.replace('"', ''),
        'num_comments': len(cluster_comments)
    })
    print(f"  -> Topic: {topic_label}")

    if i < num_clusters - 1:
        print("\nWaiting for 60 seconds...")
        time.sleep(60)

# --- 4. Save the Final LLM Topics ---
llm_df = pd.DataFrame(llm_results)
llm_df.to_csv('llama4-maverick_topic_summary.csv', index=False)
display(llm_df)


--- Generating Topic Labels with LangChain & Groq ---
Analyzing Cluster 0 (1/10) with a random sample of 20 comments...
  -> Topic: "Retain Standard Voice Mode"

Waiting for 60 seconds...
Analyzing Cluster 1 (2/10) with a random sample of 20 comments...
  -> Topic: "Poor Performance in GPT-5"

Alternatively, other options could be:
- "GPT-5 Underperforms"
- "Regression in GPT-5"
- "GPT-5 Issues"
- "Poor GPT-5 Quality"

These labels all have 5 words or less and capture the core pain point from the user comments, which is that GPT-5 is not performing as well as its predecessors or competitors.

Waiting for 60 seconds...
Analyzing Cluster 2 (3/10) with a random sample of 20 comments...
  -> Topic: "Removal of GPT-4o model"

Waiting for 60 seconds...
Analyzing Cluster 3 (4/10) with a random sample of 20 comments...
  -> Topic: "Remove Model Choice"

Waiting for 60 seconds...
Analyzing Cluster 4 (5/10) with a random sample of 20 comments...
  -> Topic: "Perceived Censorship and Moderation 

Unnamed: 0,cluster_id,llm_topic_label,num_comments
0,0,Retain Standard Voice Mode,110
1,1,"Poor Performance in GPT-5\n\nAlternatively, ot...",86
2,2,Removal of GPT-4o model,928
3,3,Remove Model Choice,69
4,4,Perceived Censorship and Moderation Issues \n\...,66
5,5,Context Window Limitation Frustration\n\nThis ...,225
6,6,Loss of Preferred Model,76
7,7,Loss of GPT-4o Option,94
8,8,Overly Strict Content Censorship,69
9,9,AI Safety and Ethics,180
