In [None]:
import pandas as pd
import numpy as np
import random
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoTokenizer as ClsTokenizer, AutoModel as ClsModel
import joblib


In [7]:
# Load the review data and every model artifact the later cells rely on.
ROBERTA_DIR = "my_roberta"
BART_CHECKPOINT = "facebook/bart-large-cnn"

# Review dataset
df = pd.read_csv("done.csv")

# RoBERTa encoder (loaded through AutoModel, aliased as ClsModel), put in eval mode
cls_tokenizer = ClsTokenizer.from_pretrained(ROBERTA_DIR)
cls_model = ClsModel.from_pretrained(ROBERTA_DIR).eval()

# Fitted k-means clustering model.
# NOTE: joblib.load unpickles the file, so only load a .pkl from a trusted source.
kmeans_model = joblib.load("kmeans_model.pkl")

# BART tokenizer and seq2seq model used for text generation
gen_tokenizer = AutoTokenizer.from_pretrained(BART_CHECKPOINT)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(BART_CHECKPOINT)

# Human-readable name for each k-means cluster id (list position == cluster id)
cluster_names = dict(enumerate([
    "Entry-Level and Kids Fire Tablets",
    "Batteries, Laptop Gear, and Basic Accessories",
    "Streaming Devices and E-Readers",
    "Advanced E-Readers and Smart Assistants",
    "Echo Speakers and Smart Home Hubs",
]))

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [8]:
def get_cluster(text):
    """Return the display name of the k-means cluster predicted for ``text``.

    The text is tokenized (truncated to at most 64 tokens) and run through
    ``cls_model``; the last hidden states are mean-pooled under the attention
    mask, the pooled vector is passed to ``kmeans_model.predict``, and the
    first predicted id is looked up in ``cluster_names``.

    Args:
        text: Input passed straight to ``cls_tokenizer``.

    Returns:
        The ``cluster_names`` entry for the predicted cluster id.
    """
    encoded = cls_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        hidden = cls_model(**encoded).last_hidden_state

    # Expand the mask to the hidden-state shape so positions with mask 0 add nothing.
    weights = encoded['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
    weighted_sum = (hidden * weights).sum(1)
    # Clamp keeps the divisor strictly positive even if a mask row sums to zero.
    token_count = weights.sum(1).clamp(min=1e-9)
    pooled = (weighted_sum / token_count).cpu().numpy()

    return cluster_names[kmeans_model.predict(pooled)[0]]


In [9]:
def _sample_reviews(reviews, product, max_reviews=3):
    """Return up to ``max_reviews`` randomly sampled review texts for ``product``.

    Rows with a missing review text are dropped and the rest are cast to ``str``
    so the result can always be joined into a single string.
    """
    texts = reviews.loc[reviews["name"] == product, "reviews.text"].dropna().astype(str)
    return texts.sample(min(max_reviews, len(texts))).tolist()


def generate_summary(user_text):
    """Generate a text summary of the best and worst products for the input's cluster.

    Steps: predict the cluster for ``user_text`` with ``get_cluster``, select
    that cluster's rows from ``df``, pick up to three products with the most
    5-star reviews, sample complaints for each, pick the product with the most
    "not recommended" reviews, build a prompt from these and run it through
    ``gen_model``.

    Args:
        user_text: Free text describing a product.

    Returns:
        The decoded model output, or a short explanatory message when the input
        is blank or the predicted category has no 5-star reviews.
    """
    if user_text is None or (isinstance(user_text, str) and not user_text.strip()):
        return "Please enter some text describing the product you are interested in."

    category = get_cluster(user_text)
    # NOTE(review): assumes df["cluster"] stores the cluster display names that
    # get_cluster returns (not the numeric ids) -- confirm against done.csv.
    df1 = df[df["cluster"] == category]

    # Get top 3 products by number of 5-star ratings (may be fewer than 3)
    top_rated = df1[df1["reviews.rating"] == 5]
    top_3 = top_rated["name"].value_counts().head(3).index.tolist()
    if not top_3:
        return f"No 5-star reviews were found for the category: {category}"

    differences = "\n".join([f"- {i+1}. {name}" for i, name in enumerate(top_3)])

    # Extract complaints. `== False` is kept on purpose: it is an element-wise
    # pandas comparison and also works when the column has missing values.
    negative = df1[(df1["reviews.rating"] <= 2) & (df1["reviews.doRecommend"] == False)]
    complaint_lines = []
    for prod in top_3:
        # An empty join yields "", so fall back explicitly; a dict default
        # would never trigger because every product gets an entry.
        complaint_text = " | ".join(_sample_reviews(negative, prod)) or 'No significant complaints'
        complaint_lines.append(f"- {prod}: {complaint_text}")
    complaints_block = "\n".join(complaint_lines)

    # Worst product: the one with the most "not recommended" reviews, if any.
    worst_df = df1[df1["reviews.doRecommend"] == False]
    not_recommended_counts = worst_df["name"].value_counts()
    if not_recommended_counts.empty:
        # idxmax() raises on an empty Series, so handle this case up front.
        worst_product = "None identified"
        worst_reasons = []
    else:
        worst_product = not_recommended_counts.idxmax()
        worst_reasons = _sample_reviews(worst_df, worst_product)
    reasons_text = " | ".join(worst_reasons) if worst_reasons else 'No significant negative reviews'

    # Prepare generation prompt
    prompt = f"""
📦 Product Category: {category}

✅ Top 3 Products:
{differences}

🔍 Key Differences:
Explain how these products differ in features, design, or value.

⚠️ Top Complaints:
{complaints_block}

🚫 Product to Avoid:
{worst_product}
Reasons to avoid:
{reasons_text}
"""

    # Generate summary
    inputs = gen_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024)
    summary_ids = gen_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=300,
        num_beams=4,
        early_stopping=True,
    )
    output = gen_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output

In [11]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.25.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [16]:
import gradio as gr

# Expose generate_summary through a one-textbox web UI and start serving it.
demo = gr.Interface(
    title="📝 Smart Review Summary Generator",
    description="The model identifies the product cluster and generates an intelligent summary about the best and worst products in the same category",
    fn=generate_summary,
    inputs=gr.Textbox(label='Text'),
    outputs="text",
)
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1a694ffc34c2f736d8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [14]:
# Write the generation model and its tokenizer into one local directory.
# NOTE(review): no fine-tuning step is visible in this notebook, so this
# presumably stores the checkpoint exactly as it was loaded -- confirm.
BART_EXPORT_DIR = "my_bart_summary"
gen_model.save_pretrained(BART_EXPORT_DIR)
gen_tokenizer.save_pretrained(BART_EXPORT_DIR)


('my_bart_summary/tokenizer_config.json',
 'my_bart_summary/special_tokens_map.json',
 'my_bart_summary/vocab.json',
 'my_bart_summary/merges.txt',
 'my_bart_summary/added_tokens.json',
 'my_bart_summary/tokenizer.json')

In [15]:
import shutil

# Zip the contents of the my_bart_summary directory into my_bart_summary.zip;
# the archive path is the cell's displayed value.
shutil.make_archive(base_name="my_bart_summary", format="zip", root_dir="my_bart_summary")

'/content/my_bart_summary.zip'