# Implementing a bot to discuss and aggregate reviews

In [None]:
pip install transformers



## Loading source DB

In [None]:
### To run on Colab

from google.colab import drive
from google.colab import userdata
import pandas as pd

# Mount Google Drive at /content/drive so the dataset CSV can be read from it
drive.mount('/content/drive')

# Read the Hugging Face token stored under the 'HFtoken' key in Colab user data,
# which keeps the credential out of the notebook source

HFtoken = userdata.get('HFtoken')

# Load the clustered review comments into `df`
df = pd.read_csv("/content/drive/Othercomputers/Mon ordinateur portable/Project Review Aggregator/data/kag_comb_clustered.csv")

# Show the available columns
df.columns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Index(['brand', 'categories', 'id', 'manufacturer', 'name',
       'reviews.doRecommend', 'reviews.id', 'reviews.rating', 'reviews.text',
       'reviews.title', 'reviews.username', 'sentiment', 'clean title',
       'clean review', 'title sentiment', 'review sentiment',
       'title sentiment details', 'review sentiment details',
       'merged PySent sentiment', 'PySent correct', 'classifier src 0',
       'classifier src 1', 'classifier src 2', 'classifier src 3',
       'classifier lemma src 0', 'classifier lemma src 1',
       'classifier lemma src 2', 'classifier lemma src 3', 'cluster',
       'cluster_name'],
      dtype='object')

In [None]:
### To run locally

# import pandas as pd
# import os

# df = pd.read_csv("kag_comb_clustered.csv")

# df.columns

## Creating product catalog

In [None]:
# Human-readable label of every cluster; the position in the list is the
# numeric cluster id found in df['cluster'].
cluster_mapping = dict(enumerate([
    "Amazon tablets",               # 0
    "Speakers",                     # 1
    "Household accessories",        # 2
    "Other tablets & accessories",  # 3
    "Kindles & e-reader",           # 4
    "Kid and toy accessories",      # 5
]))

# (Re)write 'cluster_name' by looking up each row's cluster id in the mapping
df['cluster_name'] = df['cluster'].map(cluster_mapping)

In [None]:
# Catalog generator for listing product and classifying by best or worst
import pandas as pd
import ast

# Function to calculate average sentiment based on sentiment details
def calculate_avg_sentiment(sentiment_details):
    """
    Average a signed sentiment score over a collection of sentiment details.

    Each detail is expected to look like
    "{'label': 'POS', 'probas': {'NEG': ..., 'NEU': ..., 'POS': ...}}",
    given either as its string representation or as an already-parsed dict.

    Parameters
    ----------
    sentiment_details : iterable
        Sentiment details of one product. Null entries (None / NaN) are skipped.

    Returns
    -------
    float or int
        Mean of P(POS) - P(NEG) over the non-null entries, so the score lies
        between -1 (all negative) and 1 (all positive). Returns 0 when there
        is nothing to average.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ast.literal_eval when a string entry is not a valid
        Python literal.
    """
    sentiment_scores = []

    for detail in sentiment_details:
        if isinstance(detail, dict):
            # Already parsed: ast.literal_eval would reject a dict
            detail_dict = detail
        elif pd.notnull(detail):
            # Convert the string representation to a dictionary
            detail_dict = ast.literal_eval(detail)
        else:
            continue

        probas = detail_dict.get('probas', {})
        # Signed score: a missing probability counts as 0
        sentiment_scores.append(probas.get('POS', 0) - probas.get('NEG', 0))

    # Return the average sentiment score
    return sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else 0

# Build the catalog DataFrame
def build_catalog(df_reviews):
    """
    Collapse the review-level DataFrame into one catalog row per product.

    Reviews are grouped by 'id'. For every product the first name, cluster,
    manufacturer and brand are kept, the ratings are averaged, the reviews are
    counted, and the title/review sentiment details are averaged into a single
    'avg.sentiment.detail' score.

    Returns the catalog DataFrame with columns 'id', 'name', 'cluster_name',
    'cluster', 'manufacturer', 'brand', 'avg.review.rating', '# reviews' and
    'avg.sentiment.detail'.
    """
    per_product_aggregations = {
        'name': 'first',
        'cluster_name': 'first',
        'cluster': 'first',
        'manufacturer': 'first',
        'brand': 'first',
        'reviews.rating': 'mean',           # average rating of the product
        'title sentiment details': list,    # kept as lists for the score below
        'review sentiment details': list,
        'id': 'count',                      # number of reviews of the product
    }

    catalog = (
        df_reviews.groupby('id')
        .agg(per_product_aggregations)
        # The count column has to be renamed before reset_index() turns the
        # grouping key back into a regular 'id' column.
        .rename(columns={'id': '# reviews', 'reviews.rating': 'avg.review.rating'})
        .reset_index()
    )

    def _title_review_mean(row):
        # Mean of the title-level and review-level sentiment scores
        title_score = calculate_avg_sentiment(row['title sentiment details'])
        review_score = calculate_avg_sentiment(row['review sentiment details'])
        return (title_score + review_score) / 2

    catalog['avg.sentiment.detail'] = catalog.apply(_title_review_mean, axis=1)

    # The raw sentiment lists were only needed to compute the score
    return catalog.drop(columns=['title sentiment details', 'review sentiment details'])

# The reviews are loaded as `df`, but this cell and the ones below refer to the
# same frame as `df_reviews`. Bind that name here; without it the notebook
# raises NameError on a fresh kernel (Restart & Run All).
df_reviews = df

# Build the per-product catalog from the review-level data
catalog = build_catalog(df_reviews)

# Display the first rows of the catalog
catalog.head(100)


Unnamed: 0,id,name,cluster_name,cluster,manufacturer,brand,avg.review.rating,# reviews,avg.sentiment.detail
0,AV-ETMhgYSSHbkXwpNb9,All-New Kindle Oasis E-reader - 7 High-Resolut...,Kindles & e-reader,4,Amazon,Amazon,4.750000,4,0.972748
1,AV-EVZITKZqtpbFMSoqc,All-New Kindle Oasis E-reader - 7 High-Resolut...,Kindles & e-reader,4,Amazon,Amazon,4.590909,22,0.751137
2,AV-XeQLWuC1rwyj_gbP5,Fire TV with 4K Ultra HD and Alexa Voice Remo...,Speakers,1,Amazon,Amazon,5.000000,3,0.964128
3,AV1YnR7wglJLPUi8IJmi,"Echo (White),,,\r\nEcho (White),,,",Amazon tablets,0,Amazon,Amazon,4.425876,371,0.761116
4,AV1YnRtnglJLPUi8IJmV,Kindle Paperwhite - eBook reader - 4 GB - 6 m...,Kindles & e-reader,4,Amazon,Amazon,4.772283,3175,0.838382
...,...,...,...,...,...,...,...,...,...
76,AWMjXUGdHh53nbDRJ9LA,Echo (2nd Generation) Smart Assistant Oak Fin...,Speakers,1,Amazon,Amazon,4.333333,3,0.973402
77,AWP6zKOtIwln0LfXnf2p,All-New Kindle Oasis E-reader - 7 High-Resolut...,Kindles & e-reader,4,Amazon,Amazon,4.428571,7,0.934356
78,AWYAV-i9Iwln0LfXqrUq,Echo Spot Pair Kit (Black),Speakers,1,Amazon,Amazon,4.500000,2,0.745316
79,AWdDioCIHh53nbDRScLV,Cat Litter Box Covered Tray Kitten Extra Large...,Kindles & e-reader,4,AmazonBasics,Amazonbasics,5.000000,2,0.928289


## Running LED Base (initial model to test architecture)

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token held in `HFtoken`
# (read from Colab user data in the data-loading cell)
login(token=HFtoken)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,LEDForConditionalGeneration
import torch

# Other checkpoints, currently disabled:
# model_name = "sshleifer/distilbart-cnn-12-6" # DistilBart lite
# model_name = "facebook/bart-large-cnn"

# Load the LED (Longformer Encoder-Decoder) checkpoint and its tokenizer
model_name = "allenai/led-base-16384"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Use AutoTokenizer for LED
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LEDForConditionalGeneration.from_pretrained(model_name).to(device)

In [None]:
import random


# Function to generate summaries using LED
def generate_summary_led(model, tokenizer, reviews_text, device):
    """
    Summarize a block of review text with an LED sequence-to-sequence model.

    Parameters
    ----------
    model : object exposing `generate(**inputs, ...)`
        The LED conditional-generation model.
    tokenizer : callable with a `decode` method
        Tokenizer matching `model`.
    reviews_text : str
        Concatenated review text; truncated to 16384 tokens.
    device : torch.device or str
        Device the encoded inputs are moved to before generation.

    Returns
    -------
    str
        The decoded summary, without special tokens or surrounding whitespace.
    """
    max_input_length = 16384  # LED supports very long inputs

    # Tokenize and truncate once. The former decode -> re-tokenize round trip
    # did the same work twice and could alter the text in between.
    inputs = tokenizer(
        reviews_text,
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt",
    ).to(device)

    # Debugging: Print token count
    print(f"Tokenized input length (tokens): {len(inputs['input_ids'][0])}")

    # LED attends locally by default; for summarization its documentation
    # advises putting global attention on the first token only.
    global_attention_mask = inputs["input_ids"].new_zeros(inputs["input_ids"].shape)
    global_attention_mask[:, 0] = 1

    outputs = model.generate(
        **inputs,
        global_attention_mask=global_attention_mask,
        max_length=150,  # Maximum length of the summary, in tokens
        num_beams=4,     # Beam search
        early_stopping=True,
    )

    # Decode and return the summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary.strip()

# Function to summarize reviews for a product
def summarize_reviews_for_product(product_reviews, tokenizer, model, device,
                                  max_reviews=100, max_words=2000):
    """
    Summarize a random sample of one product's reviews with LED.

    Parameters
    ----------
    product_reviews : pandas.DataFrame
        Reviews of a single product; the text is read from 'reviews.text'.
    tokenizer, model, device
        Passed through to generate_summary_led.
    max_reviews : int, default 100
        Maximum number of reviews sampled (seeded, so the sample is repeatable).
    max_words : int, default 2000
        Each sampled review is cut to this many whitespace-separated words.

    Returns
    -------
    str
        The summary produced by generate_summary_led.
    """
    # Drop missing texts BEFORE sizing the sample. Sizing it from the
    # unfiltered frame requests more rows than remain after dropna() whenever
    # some texts are NaN, and Series.sample then raises ValueError.
    review_texts = product_reviews["reviews.text"].dropna()
    sampled_reviews = review_texts.sample(
        n=min(max_reviews, len(review_texts)), random_state=42
    ).tolist()

    # Truncate each review, then join everything into a single input text
    truncated_reviews = [" ".join(review.split()[:max_words]) for review in sampled_reviews]
    reviews_text = " ".join(truncated_reviews)

    # Debugging: print the text length. The token count is reported by
    # generate_summary_led, so the text is not tokenized a second time here.
    print(f"Total review text length (characters): {len(reviews_text)}")

    # Generate summary using LED
    return generate_summary_led(model, tokenizer, reviews_text, device)

# Function to summarize top products
def summarize_top_products(catalog_df, df_reviews, top_n=3):
    """
    Summarize the reviews of the `top_n` best-ranked products with LED.

    The ranking comes from sort_top_products. Summaries are generated with the
    notebook-level `tokenizer`, `model` and `device`.

    Returns a DataFrame with one row per product and the columns
    'id', 'name' and 'summary'.
    """
    ranked = sort_top_products(catalog_df, top_n=top_n)

    def _summary_row(product):
        # All reviews whose 'id' matches the ranked product
        reviews_of_product = df_reviews[df_reviews["id"] == product["id"]]
        return {
            "id": product["id"],
            "name": product["name"],
            "summary": summarize_reviews_for_product(reviews_of_product, tokenizer, model, device),
        }

    return pd.DataFrame([_summary_row(product) for _, product in ranked.iterrows()])

# Function to compute weighted average and sort products
def sort_top_products(catalog_df, top_n=3, weight_rating=0.1, weight_reviews=0.5,
                      weight_sentiment=0.8):
    """
    Rank products by a weighted score and return the best `top_n`.

    weighted_score = weight_rating * avg.review.rating
                   + weight_reviews * (# reviews / max # reviews)
                   + weight_sentiment * normalized sentiment

    Parameters
    ----------
    catalog_df : pandas.DataFrame
        Catalog with the columns '# reviews', 'avg.review.rating' and
        'avg.sentiment.detail'. It is NOT modified.
    top_n : int, default 3
        Number of products returned.
    weight_rating, weight_reviews, weight_sentiment : float
        Weights of the three score components.

    Returns
    -------
    pandas.DataFrame
        The `top_n` highest-scoring rows, best first, with the extra columns
        'normalized_reviews', 'normalized_sentiment' and 'weighted_score'.
    """
    max_reviews = catalog_df['# reviews'].max()
    max_sentiment = catalog_df['avg.sentiment.detail'].max()

    normalized_reviews = catalog_df['# reviews'] / max_reviews

    # Sentiment can be zero or negative. Dividing by a non-positive maximum
    # would invert the ranking (or produce inf/NaN), so the raw value is used
    # in that case, which keeps "higher is better".
    if max_sentiment > 0:
        normalized_sentiment = catalog_df['avg.sentiment.detail'] / max_sentiment
    else:
        normalized_sentiment = catalog_df['avg.sentiment.detail']

    # assign() works on a copy, so the caller's catalog is left untouched
    scored = catalog_df.assign(
        normalized_reviews=normalized_reviews,
        normalized_sentiment=normalized_sentiment,
    )
    scored['weighted_score'] = (
        weight_rating * scored['avg.review.rating'] +
        weight_reviews * scored['normalized_reviews'] +
        weight_sentiment * scored['normalized_sentiment']
    )

    return scored.sort_values(by='weighted_score', ascending=False).head(top_n)

# Example usage: summarize the three best-ranked products with LED
summaries = summarize_top_products(catalog, df_reviews, top_n=3)
# Leave the frame as the last expression so the notebook renders it as a
# table instead of a wrapped plain-text print
summaries

Total review text length (characters): 13115
Tokenized input length (tokens): 2958
Tokenized input length (tokens): 2958
Total review text length (characters): 15628
Tokenized input length (tokens): 3418
Tokenized input length (tokens): 3418
Total review text length (characters): 16555
Tokenized input length (tokens): 3655
Tokenized input length (tokens): 3655
                     id                                               name  \
0  AVphgVaX1cnluZ0-DR74  Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...   
1  AVpfl8cLLJeJML43AE3S                                     Echo ‚Äì White   
2  AV1YnRtnglJLPUi8IJmV   Kindle Paperwhite - eBook reader - 4 GB - 6 m...   

                                             summary  
0  . I love that I completely control the content...  
1  Alexa is extreme;y polite and helpful! Just lo...  
2  good, very much user friendly. I had the origi...  


In [None]:
# Render the LED summaries as a table (head() keeps at most the first 5 rows)
summaries.head()

Unnamed: 0,id,name,summary
0,AVphgVaX1cnluZ0-DR74,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",. I love that I completely control the content...
1,AVpfl8cLLJeJML43AE3S,Echo ‚Äì White,Alexa is extreme;y polite and helpful! Just lo...
2,AV1YnRtnglJLPUi8IJmV,Kindle Paperwhite - eBook reader - 4 GB - 6 m...,"good, very much user friendly. I had the origi..."


## Running Llama (final model selected)

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token held in `HFtoken`.
# NOTE(review): the LED section above already performs this exact login, so this
# repeat looks redundant when the notebook runs top to bottom; confirm before removing.
login(token=HFtoken)

In [None]:
from transformers import pipeline
import torch

# Checkpoint used for text generation
model_id = "meta-llama/Llama-3.2-3B-Instruct"  # Replace with your model

# Initialize the text-generation pipeline
# NOTE(review): device=0 pins the pipeline to the first CUDA device, so this cell
# presumably fails on a CPU-only runtime; confirm if CPU support is needed.
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.float16,  # Load the weights in half precision (float16)
    # device_map="auto"          # Automatically maps the model to GPU if available
    device=0
)

# Smoke test: run the pipeline on a short prompt
prompt = "Write a haiku about AI."
output = pipe(prompt, max_new_tokens=50, num_return_sequences=1, temperature=0.7)

# Print the generated text (it starts with the prompt itself, followed by the continuation)
print("Generated Text:")
print(output[0]["generated_text"])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Text:
Write a haiku about AI. 
Metal minds awake
Learning, adapting, blind
Future's uncertain
Note: A traditional haiku consists of three lines with a syllable count of 5-7-5. I've followed this structure in the writing the haiku above


### Implementation for the whole DB (no cluster)

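1.   Rank every product in the catalog with a weighted score (average rating, number of reviews, average sentiment).
2.   Summarize a random sample of reviews for each top-ranked product with a few-shot prompt.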



In [None]:
import random

# Function to generate summaries with prompt engineering and n-shots
def generate_summary(pipe, reviews_text, max_output_length=150):
    """
    Turn raw review text into a kid-style pitch using a few-shot prompt.

    Parameters
    ----------
    pipe : callable
        Text-generation pipeline; called with the prompt and generation
        options, it returns a list of dicts holding 'generated_text'.
    reviews_text : str
        Concatenated reviews inserted into the prompt.
    max_output_length : int, default 150
        Forwarded as `max_new_tokens`.

    Returns
    -------
    str
        The text after the last "### Summary:" marker of the generated text,
        stripped; the whole stripped text if the marker is absent.
    """
    # Few-shot demonstrations of the expected tone
    few_shots = """
    Example 1:
    Reviews: "The product is excellent, works perfectly, and the build quality is great."
    Summary: "Mom, I NEED this product. It will help me in school: it is excellent for learning, it will last at least until college and my friend Anne told me it works well."

    Example 2:
    Reviews: "I found the product unreliable, it broke after a week. Customer support was unhelpful."
    Summary: "Timmy's dad has one! I know broke like the next day. But when it worked IT WAS AWESOME. Can we get one? I want one!"

    Example 3:
    Reviews: "Fast delivery and great value for the price! Highly recommended for budget-conscious buyers."
    Summary: "But dad, I want it and it can arrive tomorrow! Plus it's not that expensive... Please, for my birthday, pleeaase!"
    """

    # Instructions, then the demonstrations, then the reviews to summarize
    prompt = f"""
    You are a kid and you want your parents to buy stuff. So you surf the Internet looking for products with lots of positive review and create arguments to explain why they should buy you this item.
    Avoid repeating text.
    Use the following examples to summarize the reviews provided:

    {few_shots}


    ### Actual Reviews:
    {reviews_text}

    ### Summary:
    """

    generations = pipe(
        prompt,
        max_new_tokens=max_output_length,
        num_return_sequences=1,
        temperature=0.7,
    )
    generated_text = generations[0]["generated_text"]

    # rpartition yields the text after the last marker, or the whole text in
    # the same slot when the marker does not occur at all.
    return generated_text.rpartition("### Summary:")[2].strip()


# Function to summarize reviews for a product
def summarize_reviews_for_product(product_reviews, pipe, max_output_length=300,
                                  max_reviews=50, max_words=200):
    """
    Summarize a random sample of one product's reviews with the LLM pipeline.

    Parameters
    ----------
    product_reviews : pandas.DataFrame
        Reviews of a single product; the text is read from 'reviews.text'.
    pipe : callable
        Text-generation pipeline, passed through to generate_summary.
    max_output_length : int, default 300
        Passed through to generate_summary.
    max_reviews : int, default 50
        Maximum number of reviews sampled (seeded, so the sample is repeatable).
    max_words : int, default 200
        Each sampled review is cut to this many whitespace-separated words.

    Returns
    -------
    str
        The summary produced by generate_summary.
    """
    # Drop missing texts BEFORE sizing the sample. Sizing it from the
    # unfiltered frame requests more rows than remain after dropna() whenever
    # some texts are NaN, and Series.sample then raises ValueError.
    review_texts = product_reviews["reviews.text"].dropna()
    sampled_reviews = review_texts.sample(
        n=min(max_reviews, len(review_texts)), random_state=16
    ).tolist()

    # Truncate each review, then join everything into a single input text
    truncated_reviews = [" ".join(review.split()[:max_words]) for review in sampled_reviews]
    reviews_text = " ".join(truncated_reviews)

    # Debugging: Print text length
    print(f"Total review text length (characters): {len(reviews_text)}")

    # Generate summary
    return generate_summary(pipe, reviews_text, max_output_length)


# Function to summarize top products
def summarize_top_products(catalog_df, df_reviews, pipe, top_n=3, max_output_length=150):
    """
    Summarize the reviews of the `top_n` best-ranked products with `pipe`.

    Products are ranked by sort_top_products. A product without any matching
    review is reported on stdout and left out of the result.

    Returns a DataFrame with one row per summarized product and the columns
    'id', 'name' and 'summary'.
    """
    rows = []

    for _, product in sort_top_products(catalog_df, top_n=top_n).iterrows():
        reviews_of_product = df_reviews.loc[df_reviews["id"] == product["id"]]

        if not reviews_of_product.empty:
            rows.append(dict(
                id=product["id"],
                name=product["name"],
                summary=summarize_reviews_for_product(reviews_of_product, pipe, max_output_length),
            ))
        else:
            print(f"No reviews found for product: {product['name']}")

    return pd.DataFrame(rows)


# Function to compute weighted average and sort products
def sort_top_products(catalog_df, top_n=3, weight_rating=0.1, weight_reviews=0.5,
                      weight_sentiment=0.8):
    """
    Rank products by a weighted score and return the best `top_n`.

    weighted_score = weight_rating * avg.review.rating
                   + weight_reviews * (# reviews / max # reviews)
                   + weight_sentiment * normalized sentiment

    Parameters
    ----------
    catalog_df : pandas.DataFrame
        Catalog with the columns '# reviews', 'avg.review.rating' and
        'avg.sentiment.detail'. It is NOT modified.
    top_n : int, default 3
        Number of products returned.
    weight_rating, weight_reviews, weight_sentiment : float
        Weights of the three score components.

    Returns
    -------
    pandas.DataFrame
        The `top_n` highest-scoring rows, best first, with the extra columns
        'normalized_reviews', 'normalized_sentiment' and 'weighted_score'.
    """
    max_reviews = catalog_df['# reviews'].max()
    max_sentiment = catalog_df['avg.sentiment.detail'].max()

    normalized_reviews = catalog_df['# reviews'] / max_reviews

    # Sentiment can be zero or negative. Dividing by a non-positive maximum
    # would invert the ranking (or produce inf/NaN), so the raw value is used
    # in that case, which keeps "higher is better".
    if max_sentiment > 0:
        normalized_sentiment = catalog_df['avg.sentiment.detail'] / max_sentiment
    else:
        normalized_sentiment = catalog_df['avg.sentiment.detail']

    # assign() works on a copy, so the caller's catalog is left untouched
    scored = catalog_df.assign(
        normalized_reviews=normalized_reviews,
        normalized_sentiment=normalized_sentiment,
    )
    scored['weighted_score'] = (
        weight_rating * scored['avg.review.rating'] +
        weight_reviews * scored['normalized_reviews'] +
        weight_sentiment * scored['normalized_sentiment']
    )

    return scored.sort_values(by='weighted_score', ascending=False).head(top_n)


# Example usage: rank the whole catalog (no cluster filter) and summarize the
# top 3 products with the Llama pipeline.
# Expects `catalog` and `df_reviews` to hold the product and review data.
summaries2 = summarize_top_products(catalog, df_reviews, pipe, top_n=3, max_output_length=150)
summaries2.head()


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 6544


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 7847


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 8792


Unnamed: 0,id,name,summary
0,AVphgVaX1cnluZ0-DR74,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",- 60% of reviewers are happy with the product....
1,AVpfl8cLLJeJML43AE3S,Echo ‚Äì White,"I really want this thing, it's super helpful f..."
2,AV1YnRtnglJLPUi8IJmV,Kindle Paperwhite - eBook reader - 4 GB - 6 m...,"""Mom, I really need this e-reader. It's great ..."


## Final model with Rouge & Bleu

### Generating Kid cluster DF with concatenated reviews to export to ChatGPT to summarize

In [None]:
import random

# Function to generate summaries with prompt engineering and n-shots
def generate_summary(pipe, reviews_text, max_output_length=150):
    """
    Turn raw review text into a kid-style pitch using a few-shot prompt.

    Parameters
    ----------
    pipe : callable
        Text-generation pipeline; called with the prompt and generation
        options, it returns a list of dicts holding 'generated_text'.
    reviews_text : str
        Concatenated reviews inserted into the prompt.
    max_output_length : int, default 150
        Forwarded as `max_new_tokens`.

    Returns
    -------
    str
        The text after the last "### Summary:" marker of the generated text,
        stripped; the whole stripped text if the marker is absent.
    """
    # Few-shot demonstrations of the expected tone
    few_shots = """
    Example 1:
    Reviews: "The product is excellent, works perfectly, and the build quality is great."
    Summary: "Mom, I NEED this product. It will help me in school: it is excellent for learning, it will last at least until college and my friend Anne told me it works well."

    Example 2:
    Reviews: "I found the product unreliable, it broke after a week. Customer support was unhelpful."
    Summary: "Timmy's dad has one! I know broke like the next day. But when it worked IT WAS AWESOME. Can we get one? I really want one!"

    Example 3:
    Reviews: "Fast delivery and great value for the price! Highly recommended for budget-conscious buyers."
    Summary: "But dad, I want it and it can be delivered tomorrow! Plus it's not that expensive... Please, for my birthday, pleeaase!"
    """

    # Instructions, then the demonstrations, then the reviews to summarize
    prompt = f"""
    You are a kid and you want your parents to buy stuff. So you surf the Internet looking for products with lots of positive review and create arguments to explain why they should buy you this item.
    Avoid repeating text.
    Use the following examples to summarize the reviews provided:

    {few_shots}


    ### Actual Reviews:
    {reviews_text}

    ### Summary:
    """

    generations = pipe(
        prompt,
        max_new_tokens=max_output_length,
        num_return_sequences=1,
        temperature=0.5,
    )
    generated_text = generations[0]["generated_text"]

    # rpartition yields the text after the last marker, or the whole text in
    # the same slot when the marker does not occur at all.
    return generated_text.rpartition("### Summary:")[2].strip()

# Function to summarize reviews for a product
def summarize_reviews_for_product(product_reviews, pipe, max_output_length=300,
                                  max_reviews=20, max_words=200):
    """
    Summarize a random sample of one product's reviews with the LLM pipeline.

    Parameters
    ----------
    product_reviews : pandas.DataFrame
        Reviews of a single product; the text is read from 'reviews.text'.
    pipe : callable
        Text-generation pipeline, passed through to generate_summary.
    max_output_length : int, default 300
        Passed through to generate_summary.
    max_reviews : int, default 20
        Maximum number of reviews sampled (seeded, so the sample is repeatable).
    max_words : int, default 200
        Each sampled review is cut to this many whitespace-separated words.

    Returns
    -------
    tuple of (str, str)
        The generated summary and the concatenated review text it was based on.
    """
    # Drop missing texts BEFORE sizing the sample. Sizing it from the
    # unfiltered frame requests more rows than remain after dropna() whenever
    # some texts are NaN, and Series.sample then raises ValueError.
    review_texts = product_reviews["reviews.text"].dropna()
    sampled_reviews = review_texts.sample(
        n=min(max_reviews, len(review_texts)), random_state=22
    ).tolist()

    # Truncate each review, then join everything into a single input text
    truncated_reviews = [" ".join(review.split()[:max_words]) for review in sampled_reviews]
    reviews_text = " ".join(truncated_reviews)

    # Debugging: Print text length
    print(f"Total review text length (characters): {len(reviews_text)}")

    # Generate summary
    summary = generate_summary(pipe, reviews_text, max_output_length)
    return summary, reviews_text  # Return the concatenated reviews as well

# Function to summarize top products
def summarize_top_products(catalog_df, df_reviews, pipe, top_n=3, max_output_length=150, cluster=None):
    """
    Summarize the reviews of the `top_n` best-ranked products with `pipe`.

    When `cluster` is given, only catalog rows whose 'cluster' equals it are
    ranked. A product without any matching review is reported on stdout and
    left out of the result.

    Returns a DataFrame with one row per summarized product and the columns
    'id', 'name', 'summary' and 'reviews_used' (the concatenated review text
    the summary was generated from).
    """
    # Restrict the ranking to one cluster when requested
    candidates = catalog_df
    if cluster is not None:
        candidates = catalog_df[catalog_df['cluster'] == cluster].copy()

    rows = []
    for _, product in sort_top_products(candidates, top_n=top_n).iterrows():
        reviews_of_product = df_reviews.loc[df_reviews["id"] == product["id"]]
        if reviews_of_product.empty:
            print(f"No reviews found for product: {product['name']} in cluster {cluster}")
            continue

        summary, concatenated_reviews = summarize_reviews_for_product(
            reviews_of_product, pipe, max_output_length
        )
        rows.append(dict(
            id=product["id"],
            name=product["name"],
            summary=summary,
            reviews_used=concatenated_reviews,
        ))

    return pd.DataFrame(rows)

# Function to compute weighted average and sort products
def sort_top_products(catalog_df, top_n=3, weight_rating=0.2, weight_reviews=0.5,
                      weight_sentiment=0.8):
    """
    Rank products by a weighted score and return the best `top_n`.

    weighted_score = weight_rating * avg.review.rating
                   + weight_reviews * (# reviews / max # reviews)
                   + weight_sentiment * normalized sentiment

    Parameters
    ----------
    catalog_df : pandas.DataFrame
        Catalog with the columns '# reviews', 'avg.review.rating' and
        'avg.sentiment.detail'. It is not modified.
    top_n : int, default 3
        Number of products returned.
    weight_rating, weight_reviews, weight_sentiment : float
        Weights of the three score components.

    Returns
    -------
    pandas.DataFrame
        The `top_n` highest-scoring rows, best first, with the extra columns
        'normalized_reviews', 'normalized_sentiment' and 'weighted_score'.
    """
    max_reviews = catalog_df['# reviews'].max()
    max_sentiment = catalog_df['avg.sentiment.detail'].max()

    normalized_reviews = catalog_df['# reviews'] / max_reviews

    # Sentiment can be zero or negative. Dividing by a non-positive maximum
    # would invert the ranking (or produce inf/NaN), so the raw value is used
    # in that case, which keeps "higher is better".
    if max_sentiment > 0:
        normalized_sentiment = catalog_df['avg.sentiment.detail'] / max_sentiment
    else:
        normalized_sentiment = catalog_df['avg.sentiment.detail']

    # assign() works on a copy, so the caller's catalog is left untouched
    scored = catalog_df.assign(
        normalized_reviews=normalized_reviews,
        normalized_sentiment=normalized_sentiment,
    )
    scored['weighted_score'] = (
        weight_rating * scored['avg.review.rating'] +
        weight_reviews * scored['normalized_reviews'] +
        weight_sentiment * scored['normalized_sentiment']
    )

    # The former trailing drop(inplace=True) acted on a local copy that was
    # discarded right after, so it had no effect and is omitted.
    return scored.sort_values(by='weighted_score', ascending=False).head(top_n)

# Example usage: summarize up to 6 top-ranked products of cluster 5
# ("Kid and toy accessories" in cluster_mapping)
summaries4 = summarize_top_products(catalog, df_reviews, pipe, top_n=6, max_output_length=150, cluster=5)
# NOTE: head() shows at most 5 rows, so a 6th summarized product is not displayed here
summaries4.head()


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 2949


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 2819


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 2859


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 2941


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 2249


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 5733


Unnamed: 0,id,name,summary,reviews_used
0,AVpfw2hvilAPnD_xh0rH,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...","- The product is great for kids, durable, and ...",Great for the kids. Durable. Nice that you don...
1,AVph0EeEilAPnD_x9myq,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...","* ""Mom, can we get this tablet? It's perfect f...",You have the ability to set up on and off time...
2,AVwjfXqqQMlgsOJE8qmm,"All-New Fire HD 8 Kids Edition Tablet, 8 HD Di...","Mom, I really want a tablet for my birthday. T...",This is a great product for kids. The rubber b...
3,AVwjfXp4QMlgsOJE8qmk,"All-New Fire HD 8 Kids Edition Tablet, 8 HD Di...","* ""I really love this product, it's great for ...",Great buy to keep the child occupied during ca...
4,AVqVGWLKnnc1JgDc3jF1,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...","""Mom, I want this tablet because it's great fo...",I bought this item for my 9 year old son. He l...


### Loading annotated test set and launching Bleu & Rouge

In [None]:
!pip install rouge-score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd

# Load the annotated test set; the scoring code below reads its 'summary'
# (model output) and 'GPT summary' columns
df_assess = pd.read_csv('/content/drive/Othercomputers/Mon ordinateur portable/Project Review Aggregator/data/Chat_GPT _kid_product_summaries.csv')

# ROUGE scorer for ROUGE-1 and ROUGE-L, with stemming enabled
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Function to calculate BLEU score
def calculate_bleu(reference_summary, generated_summary):
    """
    Sentence-level BLEU of `generated_summary` against a single reference.

    Both texts are tokenized by splitting on whitespace, and NLTK's smoothing
    method 4 is applied.
    """
    reference_tokens = reference_summary.split()
    candidate_tokens = generated_summary.split()
    return sentence_bleu(
        [reference_tokens],  # sentence_bleu takes a list of references
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

# Function to calculate ROUGE and BLEU for a row
def calculate_rouge_bleu(row):
    """
    Score one row's model summary against its reference summary.

    The reference is row['GPT summary'] and the candidate is row['summary'];
    ROUGE and BLEU both use that same direction.

    Returns
    -------
    pandas.Series
        'rouge1' and 'rougeL' (F-measures) and 'bleu'. All three are None when
        scoring the row fails; the error is printed and not raised.
    """
    try:
        reference = row['GPT summary']
        candidate = row['summary']

        # Calculate ROUGE scores: score(target, prediction)
        scores = scorer.score(reference, candidate)
        rouge1_fmeasure = scores['rouge1'].fmeasure
        rougeL_fmeasure = scores['rougeL'].fmeasure

        # Calculate BLEU score: calculate_bleu(reference, generated).
        # The arguments used to be passed the other way round, which scored
        # the reference against the model output; BLEU is not symmetric.
        bleu_score = calculate_bleu(reference, candidate)

        return pd.Series({'rouge1': rouge1_fmeasure, 'rougeL': rougeL_fmeasure, 'bleu': bleu_score})
    except Exception as e:
        # Best effort: keep scoring the remaining rows
        print(f"Error processing row: {e}")
        return pd.Series({'rouge1': None, 'rougeL': None, 'bleu': None})

# Score every row; the result has one 'rouge1' / 'rougeL' / 'bleu' column each
rouge_bleu_results = df_assess.apply(calculate_rouge_bleu, axis=1)

# Append the score columns to the test set (aligned on the row index)
df_assess = pd.concat([df_assess, rouge_bleu_results], axis=1)

# Display the DataFrame with ROUGE and BLEU scores
df_assess




Unnamed: 0,id,name,summary,reviews_used,GPT summary,rouge1,rougeL,bleu
0,AVpfw2hvilAPnD_xh0rH,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",I really need this tablet to do my homework an...,Great for the kids. Durable. Nice that you don...,"Highly praised by users, this item excels in G...",0.165746,0.099448,0.005119
1,AVph0EeEilAPnD_x9myq,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...","- Great for learning, easy to use, and lots of...",You have the ability to set up on and off time...,"Perfect for families or individuals, it offers...",0.27027,0.151351,0.008515
2,AVwjfXqqQMlgsOJE8qmm,"All-New Fire HD 8 Kids Edition Tablet, 8 HD Di...","Mom, I really need this tablet. It's great for...",This is a great product for kids. The rubber b...,"Highly praised by users, this item excels in T...",0.287179,0.184615,0.02098
3,AVwjfXp4QMlgsOJE8qmk,"All-New Fire HD 8 Kids Edition Tablet, 8 HD Di...","Mom, can I PLEEEEEEase get this tablet? It's g...",Great buy to keep the child occupied during ca...,"Innovative and reliable, this product features...",0.241379,0.172414,0.005842
4,AVqVGWLKnnc1JgDc3jF1,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",I need this product because it is easy to use ...,I bought this item for my 9 year old son. He l...,"Perfect for families or individuals, it offers...",0.259067,0.15544,0.011456
5,AVpg3q4RLJeJML43TxA_,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...","Mom, I really want this cover. It looks nice a...",Overall it's a nice looking cover and does it'...,"A remarkable find, this product stands out for...",0.294118,0.176471,0.012493


### Generating for Kindle & Books (cluster with the most issues)

In [None]:
# Summarize up to 6 top-ranked products of cluster 4 ("Kindles & e-reader" in cluster_mapping)
summaries_kindles = summarize_top_products(catalog, df_reviews, pipe, top_n=6, max_output_length=150, cluster=4)
# NOTE: head() shows at most 5 rows, so a 6th summarized product is not displayed here
summaries_kindles.head()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 3583


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 1055


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 170


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 1927


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 4328


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Total review text length (characters): 43


Unnamed: 0,id,name,summary,reviews_used
0,AV1YnRtnglJLPUi8IJmV,Kindle Paperwhite - eBook reader - 4 GB - 6 m...,"""I really want a Kindle Paperwhite, Mom. It's ...",I like the Kindle Paperwhite. It's very conven...
1,AVqVGZQBQMlgsOJE6eUb,Kindle Oasis E-reader with Leather Charging Co...,"Mom, I really want this Kindle Oasis. It's so ...","Even when Im outside in bright sunlight, the w..."
2,AWdDioCIHh53nbDRScLV,Cat Litter Box Covered Tray Kitten Extra Large...,Can we get a portable power bank? I need one f...,I replaced my cat's x-large litter box to this...
3,AV-ETMhgYSSHbkXwpNb9,All-New Kindle Oasis E-reader - 7 High-Resolut...,"""Mom, I really want a Kindle Oasis! It has a b...",I absolutely love this reader. The bigger scre...
4,AVphPmHuilAPnD_x3E5h,"- Kindle Voyage - 6"" - 4GB - Black",I really love my Kindle! It's so easy to use a...,"I am really pleased with my new Kindle, I had ..."
