In [None]:
!pip install bertopic
!pip install sentence-transformers

Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [None]:
from bertopic import BERTopic

print("importing - past bert")

import pandas as pd
import numpy as np
import re
from hdbscan import HDBSCAN
from pickle import dump, load
import gc
import os
from bertopic.representation import KeyBERTInspired, OpenAI, MaximalMarginalRelevance
# import api_key
import openai
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer

def visualize_topic_scores(topic_scores, topic_names, product_name):
    """Save a horizontal bar chart of the per-topic scores of one product.

    Args:
        topic_scores: Dict mapping topic id -> score; one bar is drawn per
            entry, in the dict's iteration order.
        topic_names: Dict mapping topic id -> label shown on the y axis. A
            topic id that has no label is shown as ``str(topic_id)``.
        product_name: Product title. It appears in the chart title and, with
            every non-alphanumeric character replaced by ``_`` and truncated
            to 30 characters, in the output file name.

    Returns:
        None. The chart is written to ``plots/<safe_name>_topic_scores.png``;
        the ``plots`` directory is created if it does not exist.
    """
    topic_ids = list(topic_scores)
    labels = [topic_names.get(topic_id, str(topic_id)) for topic_id in topic_ids]
    scores = [topic_scores[topic_id] for topic_id in topic_ids]

    safe_name = re.sub(r'[^A-Za-z0-9]', '_', product_name)[:30]
    os.makedirs("plots", exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        sns.barplot(x=scores, y=labels, palette="coolwarm", orient='h', ax=ax)
        # Dashed line at zero separates negative from positive scores.
        ax.axvline(x=0, color='black', linestyle='--', linewidth=1)
        ax.set_title(f"Sentiment-Weighted Topic Scores\nProduct: {product_name}")
        ax.set_xlabel("Normalized Sentiment Score")
        ax.set_ylabel("Topics")
        fig.tight_layout()
        fig.savefig(f"plots/{safe_name}_topic_scores.png")
    finally:
        # Close this figure even when drawing or saving fails, so figures do
        # not pile up when the function is called once per product.
        plt.close(fig)

def pretty_print(dict_in, names):
    """Print one ``<key>, <name>: <value>`` line per entry, in ascending key order.

    Args:
        dict_in: Mapping of key -> value to print.
        names: Mapping of key -> display name; must contain every key of
            ``dict_in`` (a missing key raises ``KeyError``).
    """
    ordered_keys = list(dict_in.keys())
    ordered_keys.sort()
    for entry_key in ordered_keys:
        label = names[entry_key]
        value = dict_in[entry_key]
        print(f"{entry_key}, {label}: {value}")

# Tokenizer / OpenMP thread settings, applied through the process environment.
os.environ.update({"TOKENIZERS_PARALLELISM": "false", "OMP_NUM_THREADS": "1"})

print("all libraries imported")

# Raw review table; the columns product_name, Rate, Summary and Sentiment are used below.
data = pd.read_csv("input_data.csv")
df = pd.DataFrame(data)

print("data in")

# One sub-frame per product, keyed by the product name stripped of every
# character other than letters, digits and spaces, then cut to 30 characters.
# Two products whose cleaned names share the same first 30 characters end up
# under one key, and the later group replaces the earlier one.
product_dict = {}
for product, group in df.groupby('product_name'):
    short_name = re.sub(r'[^A-Za-z0-9 ]', '', product)[:30]
    product_dict[short_name] = group[['product_name','Rate', 'Summary', 'Sentiment']].reset_index(drop=True)

print(f"{len(product_dict)}")

# Keep only products with enough long reviews and enough negative ones.
filtered_products = {}

for name, curr_product in product_dict.items():
    with_summary = curr_product.dropna(subset=['Summary'])
    # A review counts only if its summary has more than 10 whitespace-separated words.
    is_long = with_summary['Summary'].apply(lambda text: len(str(text).split()) > 10)
    curr_product = with_summary[is_long]

    reviews = curr_product['Summary'].tolist()
    sentiments = curr_product['Sentiment'].tolist()
    negative_total = sentiments.count("negative")

    if len(reviews) < 150 or negative_total <= 100:
        continue
    filtered_products[name] = curr_product.copy()

print("filtered data len: " + str(len(filtered_products)))

# --- Topic modelling and sentiment-weighted topic scoring, one product at a time ---
# For every entry of `filtered_products`: fit a BERTopic model on the review
# summaries, reduce the topics, accumulate a score per topic from the star
# ratings, save a bar chart, and append one result record to product_topics.pkl.

product_topics = {}
i = 0  # not used anywhere in this cell

key_model = KeyBERTInspired()

embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda')

mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Use a `key_` variable when an earlier cell defined one, otherwise fall back to
# the OPENAI_API_KEY environment variable, so the cell does not fail with a
# NameError on a fresh kernel.
openai_api_key = globals().get("key_") or os.environ.get("OPENAI_API_KEY")
if openai_api_key:
    client = openai.OpenAI(api_key=openai_api_key)
    ai_model = OpenAI(client, model="gpt-4o-mini", chat=True)
else:
    # ai_model is not part of rep_models below, so the run works without a key.
    client = None
    ai_model = None

rep_models = [key_model, mmr_model]

# Output directory for the charts; created once instead of once per product.
os.makedirs("plots", exist_ok=True)

# The context manager closes (and flushes) the pickle file even when a product
# fails half-way, so the records written so far are not lost.
with open("product_topics.pkl", "wb") as f:
    # First record: the number of products about to be processed.
    dump(len(filtered_products), f)

    for name, product in filtered_products.items():
        print("\n  --------------------- \n")
        print(name + " being processed")
        summaries = product['Summary'].tolist()
        sentiments = product['Sentiment'].tolist()

        # A fresh clusterer per product: the fitted models are kept in
        # `product_topics`, and a single shared HDBSCAN object would be refit
        # by every later product.
        custom_hdbscan = HDBSCAN(min_cluster_size=5, min_samples=2, prediction_data=True)
        topic_model = BERTopic(language="english", hdbscan_model=custom_hdbscan, representation_model=rep_models, embedding_model=embedding_model)

        topics, _ = topic_model.fit_transform(summaries)
        topic_model.reduce_topics(summaries, nr_topics="auto")
        # NOTE(review): this re-predicts the training documents after the
        # reduction; `topic_model.topics_` may be the cheaper source for the
        # same information - confirm before switching.
        topics, _ = topic_model.transform(summaries)

        num_of_pos = sentiments.count("positive")
        num_of_neg = sentiments.count("negative")
        # Shift the star rating so that 3 becomes 0. int() raises on a missing
        # or non-numeric Rate value.
        ratings_normalized = [int(x) - 3 for x in product['Rate'].tolist()]

        product['Topic'] = topics
        product_topics[name] = {
            "data": product,
            "model": topic_model
        }

        # Every topic that occurs starts at 0.
        topic_scores = dict.fromkeys(topics, 0)

        for topic, sentiment, rating in zip(topics, sentiments, ratings_normalized):
            # Negative reviews are normalised by the number of negative
            # reviews, everything else by the number of positive ones.
            num = num_of_neg if sentiment == "negative" else num_of_pos
            # A product without any "positive" review can still contain other
            # non-negative labels; divide by 1 then instead of by zero.
            topic_scores[topic] += rating / max(num, 1)

        topic_info = topic_model.get_topic_info()
        topic_names = topic_info.set_index("Topic")["Name"].to_dict()

        pretty_print(topic_scores, topic_names)

        visualize_topic_scores(topic_scores, topic_names, name)

        # Save intermediate results (own name, so the raw `data` frame is not overwritten)
        product_result = {"product data": product, "topic info": topic_info, "topic names": topic_names, "topic scores": topic_scores}
        dump(product_result, f)

        # Drop the loop-local reference; the model itself stays reachable
        # through product_topics[name]["model"].
        del topic_model
        gc.collect()

importing - past bert
all libraries imported
data in
861
filtered data len: 5


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


  --------------------- 

Butterfly Rapid 750 W Juicer M being processed
-1, -1_motor_grinder_quality_grinding: 0.02624548169102627
0, 0_jars_jar_juicer_mixer: 0.005225522552253833
1, 1_delivery_flipkart_delivered_product: -0.097752632406099
2, 2_blade_blades_sharpen_sharp: -0.12871287128712872

  --------------------- 

Crompton 75 L Desert Air Coole being processed
-1, -1_cooling_cooler_ventilation_air: 0.17556935817805397
0, 0_cooling_cooler_coolers_ventilation: 0.7229813664596259
1, 1_flipkart_flipcart_products_product: -0.10641821946169772
2, 2_pump_motor_roomwater_descriptionhoweverwater: -0.20372670807453414
3, 3_water_pump_drain_dispenser: -0.0712215320910973
4, 4_plastic_quality_cooling_design: 0.01946169772256729
5, 5_defective_flaps_problem_cleanliness: -0.03229813664596273
6, 6_scratches_scratch_cooling_product: -0.0016563146997929622
7, 7_deliveryproduct_delivery_delivered_packing: -0.02070393374741201

  --------------------- 

Mi 5A 80 cm 32 inch HD Ready L being proces

In [None]:
from bertopic import BERTopic

print("importing - past bert")

import pandas as pd
import numpy as np
import re
from hdbscan import HDBSCAN
from pickle import dump, load
import gc
import os
from bertopic.representation import KeyBERTInspired, OpenAI, MaximalMarginalRelevance
# import api_key
import openai
from sentence_transformers import SentenceTransformer

def pretty_print(dict_in, names):
    """Print the entries of ``dict_in`` sorted by key, one per line.

    Each line has the form ``<key>, <names[key]>: <dict_in[key]>``; a key that
    is absent from ``names`` raises ``KeyError``.
    """
    for topic_id in sorted(dict_in):
        print(f"{topic_id}, {names[topic_id]}: {dict_in[topic_id]}")

# Tokenizer / OpenMP thread settings, applied through the process environment.
os.environ.update({"TOKENIZERS_PARALLELISM": "false", "OMP_NUM_THREADS": "1"})

print("all libraries imported")

# Raw review table; the columns product_name, Rate, Summary and Sentiment are used below.
data = pd.read_csv("input_data.csv")
df = pd.DataFrame(data)

print("data in")

# One sub-frame per product, keyed by the product name stripped of every
# character other than letters, digits and spaces, then cut to 30 characters.
# Two products whose cleaned names share the same first 30 characters end up
# under one key, and the later group replaces the earlier one.
product_dict = {}
for product, group in df.groupby('product_name'):
    short_name = re.sub(r'[^A-Za-z0-9 ]', '', product)[:30]
    product_dict[short_name] = group[['product_name','Rate', 'Summary', 'Sentiment']].reset_index(drop=True)

# Sentence embedding model handed to BERTopic further down.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda')

print(f"{len(product_dict)}")

# Keep only products with enough long reviews and enough negative ones.
filtered_products = {}

for name, curr_product in product_dict.items():
    with_summary = curr_product.dropna(subset=['Summary'])
    # A review counts only if its summary has more than 10 whitespace-separated words.
    is_long = with_summary['Summary'].apply(lambda text: len(str(text).split()) > 10)
    curr_product = with_summary[is_long]

    reviews = curr_product['Summary'].tolist()
    sentiments = curr_product['Sentiment'].tolist()
    negative_total = sentiments.count("negative")

    if len(reviews) < 20 or negative_total <= 10:
        continue
    filtered_products[name] = curr_product.copy()

print("filtered data len: " + str(len(filtered_products)))

# --- Topic modelling and sentiment-weighted topic scoring, one product at a time ---
# For every entry of `filtered_products`: fit a BERTopic model on the review
# summaries, reduce the topics, accumulate a score per topic from the star
# ratings, and append one result record to product_topics.pkl.

product_topics = {}
i = 0  # not used anywhere in this cell

key_model = KeyBERTInspired()

mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Use a `key_` variable when an earlier cell defined one, otherwise fall back to
# the OPENAI_API_KEY environment variable, so the cell does not fail with a
# NameError on a fresh kernel.
openai_api_key = globals().get("key_") or os.environ.get("OPENAI_API_KEY")
if openai_api_key:
    client = openai.OpenAI(api_key=openai_api_key)
    ai_model = OpenAI(client, model="gpt-4o-mini", chat=True)
else:
    # ai_model is not part of rep_models below, so the run works without a key.
    client = None
    ai_model = None

rep_models = [key_model, mmr_model]

# The context manager closes (and flushes) the pickle file even when a product
# fails half-way, so the records written so far are not lost.
with open("product_topics.pkl", "wb") as f:
    # First record: the number of products about to be processed.
    dump(len(filtered_products), f)

    for name, product in filtered_products.items():
        print("\n  --------------------- \n")
        print(name + " being processed")
        summaries = product['Summary'].tolist()
        sentiments = product['Sentiment'].tolist()

        # A fresh clusterer per product: the fitted models are kept in
        # `product_topics`, and a single shared HDBSCAN object would be refit
        # by every later product.
        custom_hdbscan = HDBSCAN(min_cluster_size=5, min_samples=2, prediction_data=True)
        topic_model = BERTopic(language="english", hdbscan_model=custom_hdbscan, representation_model=rep_models, embedding_model=embedding_model)

        topics, _ = topic_model.fit_transform(summaries)
        topic_model.reduce_topics(summaries, nr_topics="auto")
        # NOTE(review): this re-predicts the training documents after the
        # reduction; `topic_model.topics_` may be the cheaper source for the
        # same information - confirm before switching.
        topics, _ = topic_model.transform(summaries)

        num_of_pos = sentiments.count("positive")
        num_of_neg = sentiments.count("negative")
        # Shift the star rating so that 3 becomes 0. int() raises on a missing
        # or non-numeric Rate value.
        ratings_normalized = [int(x) - 3 for x in product['Rate'].tolist()]

        product['Topic'] = topics
        product_topics[name] = {
            "data": product,
            "model": topic_model
        }

        # Every topic that occurs starts at 0.
        topic_scores = dict.fromkeys(topics, 0)

        for topic, sentiment, rating in zip(topics, sentiments, ratings_normalized):
            # Negative reviews are normalised by the number of negative
            # reviews, everything else by the number of positive ones.
            num = num_of_neg if sentiment == "negative" else num_of_pos
            # A product without any "positive" review can still contain other
            # non-negative labels; divide by 1 then instead of by zero.
            topic_scores[topic] += rating / max(num, 1)

        topic_info = topic_model.get_topic_info()
        topic_names = topic_info.set_index("Topic")["Name"].to_dict()

        pretty_print(topic_scores, topic_names)

        # Save intermediate results (own name, so the raw `data` frame is not overwritten)
        product_result = {"product data": product, "topic info": topic_info, "topic names": topic_names, "topic scores": topic_scores}
        dump(product_result, f)

        # Drop the loop-local reference; the model itself stays reachable
        # through product_topics[name]["model"].
        del topic_model
        gc.collect()



importing - past bert
all libraries imported
data in
861
filtered data len: 176

  --------------------- 

AGARO Regal Handheld Vacuum Cl being processed
-1, -1_vacuum_cleaner_cleaners_cleaning: 0.17343304843304838
0, 0_vacuum_cleaner_cleaning_suction: 0.014601139601139373
1, 1_quality_service_review_product: -0.13141025641025655
2, 2_cord_length_useful_vacuum: 0.17307692307692304
3, 3_car_useful_portable_handy: 0.02314814814814815
4, 4_wattsheating_heating_800w_2000w: -0.08048433048433049
5, 5_delivery_packing_packed_delivered: 0.03703703703703704
6, 6_filters_filter_filterhighly_cleaner: -0.00498575498575498
7, 7_quality_satisfied_productnever_product: 0.02849002849002849

  --------------------- 

APPLE 2020 Macbook Air M1  8 G being processed
-1, -1_laptop_performance_macbook_quality: 0.21932173136992408
0, 0_macbook_laptop_m1_mac: 0.31381079875055773
1, 1_laptop_laptops_performance_quality: 0.17257920571173585
2, 2_processor_performance_machine_xampp: 0.13554216867469884
3, 3_flip