<a href="https://www.kaggle.com/code/nadaarfaoui/summary-of-amazon-comments?scriptVersionId=289238806" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install rouge-score

In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for entry in files:
        print(os.path.join(folder, entry))
        

In [None]:
# ============================================================================
# FAST Review Summarization with BART/T5 + Qwen2.5 Narrative Enhancement
# Optimized for Kaggle - Auto-handles GPTQ and fallback models
# Modified for electronics dataset with review_text column
# INCREMENTAL SAVING: Appends every 100 products to build final file step by step
# ============================================================================

import pandas as pd
import torch
from transformers import (
    BartForConditionalGeneration, BartTokenizer,
    T5ForConditionalGeneration, T5Tokenizer,
    AutoModelForCausalLM, AutoTokenizer
)
from rouge_score import rouge_scorer
from tqdm import tqdm
import warnings
import time
import os
warnings.filterwarnings('ignore')

# Prefer the GPU when PyTorch reports one; otherwise run on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# ============================================================================
# CONFIGURATION
# ============================================================================
SAMPLE_SIZE = None  # None processes every product; an integer limits the run
USE_NARRATIVE_LLM = True  # Switch for the Qwen narrative step
BATCH_SIZE = 100  # Number of products buffered between appends to the files
COMPLETE_RESULTS_FILE = 'complete_results.csv'
FINAL_NARRATIVES_FILE = 'final_narratives.csv'

# Delete outputs left over from an earlier run so this run starts fresh
for stale_output in (COMPLETE_RESULTS_FILE, FINAL_NARRATIVES_FILE):
    if os.path.exists(stale_output):
        os.remove(stale_output)
        print(f"Removed existing {stale_output}")

# ============================================================================
# LOAD DATASET
# ============================================================================
print("\nLoading dataset...")
df = pd.read_csv('/kaggle/input/merged-amazon-electronics-dataset/merged_electronics_dataset.csv')

# A row counts as reviewed when review_text is present, is not blank after
# stripping, and is not the literal string 'nan'. Rows are kept either way.
review_as_text = df['review_text'].astype(str).str.strip()
df['has_review'] = df['review_text'].notna() & (review_as_text != '') & (review_as_text != 'nan')
df['review_text'] = df['review_text'].fillna('').astype(str)

# Independent copies: one set goes through the models, the other is only
# written out with placeholder values.
df_with_reviews = df[df['has_review']].copy()
df_without_reviews = df[~df['has_review']].copy()

print(f"Total products: {len(df)}")
print(f"Products with reviews: {len(df_with_reviews)}")
print(f"Products without reviews: {len(df_without_reviews)}")

# ============================================================================
# LOAD PRE-TRAINED SUMMARIZATION MODELS
# ============================================================================
print("\n" + "=" * 80)
print("LOADING PRE-TRAINED SUMMARIZATION MODELS")
print("=" * 80)

# BART: load tokenizer and weights, move the weights to `device`, then switch
# the model to evaluation mode.
print("\nLoading pre-trained BART...")
bart_checkpoint = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
bart_model = BartForConditionalGeneration.from_pretrained(bart_checkpoint).to(device)
bart_model.eval()
print("✓ Pre-trained BART loaded")

# T5: same steps with the t5-base checkpoint.
print("\nLoading pre-trained T5...")
t5_checkpoint = "t5-base"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint).to(device)
t5_model.eval()
print("✓ Pre-trained T5 loaded")

# ============================================================================
# LOAD QWEN2.5-1.5B-INSTRUCT (AUTO-DETECT BEST VERSION)
# ============================================================================
# Both names stay None when the narrative LLM is disabled or when every
# loading attempt below fails; they are always set (or cleared) as a pair.
narrative_model = None
narrative_tokenizer = None

if USE_NARRATIVE_LLM:
    print("\n" + "="*80)
    print("LOADING QWEN2.5-1.5B-INSTRUCT FOR NARRATIVE GENERATION")
    print("="*80)

    # Strategy: Try local GPTQ → Try local non-GPTQ → Download from HuggingFace

    # Option 1: Try local GPTQ model
    try:
        local_gptq_path = "/kaggle/input/qwen2.5/transformers/1.5b-instruct-gptq-int8/1"
        print("Attempting to load local GPTQ model...")

        # Whether `optimum` can be imported decides which loading call is used
        try:
            import optimum
            print("  ✓ optimum available")
            use_gptq = True
        except ImportError:
            print("  ⚠️ optimum not available, will try standard loading")
            use_gptq = False

        narrative_tokenizer = AutoTokenizer.from_pretrained(local_gptq_path, trust_remote_code=True)

        if use_gptq:
            narrative_model = AutoModelForCausalLM.from_pretrained(
                local_gptq_path,
                device_map="auto",
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            print("✓ Qwen2.5-1.5B-Instruct loaded (LOCAL GPTQ-Int8 - FASTEST)")
        else:
            # Try loading GPTQ model without optimum (might work for some formats)
            narrative_model = AutoModelForCausalLM.from_pretrained(
                local_gptq_path,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            print("✓ Qwen2.5-1.5B-Instruct loaded (LOCAL float16 - FAST)")

    except Exception as e1:
        print(f"  Local GPTQ loading failed: {e1}")
        # The tokenizer may have loaded before the model failed; drop it so a
        # tokenizer is never kept without its matching model.
        narrative_model = None
        narrative_tokenizer = None

        # Option 2: Try other local Qwen paths, stopping at the first that loads
        local_paths = [
            "/kaggle/input/qwen2.5/transformers/1.5b-instruct/1",
            "/kaggle/input/qwen-2-5-1-5b-instruct",
            "/kaggle/input/qwen2.5"
        ]

        for local_path in local_paths:
            try:
                print(f"  Trying alternative local path: {local_path}")
                narrative_tokenizer = AutoTokenizer.from_pretrained(local_path, trust_remote_code=True)
                narrative_model = AutoModelForCausalLM.from_pretrained(
                    local_path,
                    torch_dtype=torch.float16,
                    device_map="auto",
                    trust_remote_code=True,
                    low_cpu_mem_usage=True
                )
                print(f"✓ Qwen2.5-1.5B-Instruct loaded (LOCAL from {local_path})")
                break
            except Exception as path_error:
                # Catch Exception rather than everything, so Ctrl-C and
                # SystemExit still stop the run, and report why the path failed.
                print(f"    Could not load from {local_path}: {type(path_error).__name__}: {path_error}")
                narrative_model = None
                narrative_tokenizer = None

        # Option 3: Download from HuggingFace (if internet available)
        if narrative_model is None:
            try:
                print("Downloading Qwen2.5-1.5B-Instruct from HuggingFace...")
                hf_model = "Qwen/Qwen2.5-1.5B-Instruct"

                narrative_tokenizer = AutoTokenizer.from_pretrained(hf_model)
                narrative_model = AutoModelForCausalLM.from_pretrained(
                    hf_model,
                    torch_dtype=torch.float16,
                    device_map="auto",
                    low_cpu_mem_usage=True
                )
                print("✓ Qwen2.5-1.5B-Instruct loaded (HUGGINGFACE float16)")

            except Exception as e3:
                print(f"  HuggingFace download failed: {e3}")
                print("  ⚠️ Could not load any Qwen model")
                narrative_model = None
                narrative_tokenizer = None

    if narrative_model is not None:
        narrative_model.eval()
        print("✓ Qwen model ready for narrative generation")
    else:
        print("⚠️ No Qwen model available - will use simple formatting")

# ============================================================================
# PREPARE DATA FOR PROCESSING
# ============================================================================
section_rule = "=" * 80
print("\n" + section_rule)
print("PREPARING DATA FOR PROCESSING")
print(section_rule)

# Test mode: keep only the first SAMPLE_SIZE reviewed products
if SAMPLE_SIZE is not None:
    df_with_reviews = df_with_reviews.head(SAMPLE_SIZE)
    print(
        f"\n{section_rule}\n"
        f"⚠️  RUNNING IN TEST MODE: Processing only {SAMPLE_SIZE} products\n"
        f"{section_rule}\n"
    )

# Summary of what the run will do and which files it will write
print("\n".join([
    f"Processing {len(df_with_reviews)} products with reviews",
    f"Batch size: {BATCH_SIZE} products per save",
    "Files will be built incrementally:",
    f"  - {COMPLETE_RESULTS_FILE}",
    f"  - {FINAL_NARRATIVES_FILE}",
]))

# ============================================================================
# GENERATION FUNCTIONS
# ============================================================================
def generate_summary(model, tokenizer, text, model_type='bart', max_length=150):
    """Summarize ``text`` with the given seq2seq model and return the decoded string.

    Args:
        model: Model object exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; used to encode and decode.
        text: Source text to summarize.
        model_type: ``'t5'`` prepends the ``"summarize: "`` task prefix; any
            other value passes ``text`` through unchanged.
        max_length: Upper bound on the generated summary length, forwarded
            to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    source_text = "summarize: " + text if model_type == 't5' else text

    # Input is truncated to 1024 tokens and moved to the module-level device
    encoded = tokenizer(
        source_text,
        max_length=1024,
        truncation=True,
        return_tensors="pt",
        padding=True
    ).to(device)

    # Beam search settings shared by both summarizers
    generation_options = dict(
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )

    with torch.no_grad():
        generated = model.generate(encoded["input_ids"], **generation_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def _strip_llm_preamble(narrative):
    """Drop a leading "Here…", "Sure…" or "Of course…" preamble from ``narrative``."""
    if not narrative.lower().startswith(("here", "sure", "of course")):
        return narrative

    # Preamble on its own line ending with ':' (e.g. "Here is the narrative:").
    # Dropping only that line keeps the first real sentence, which a plain
    # split on '.' would discard along with the preamble.
    first_line, newline, remainder = narrative.partition('\n')
    if newline and first_line.rstrip().endswith(':') and remainder.strip():
        return remainder.strip()

    # Otherwise treat everything up to the first '.' as the preamble
    sentences = narrative.split('.')
    if len(sentences) > 1:
        return '.'.join(sentences[1:]).strip()
    return narrative


def clean_and_enhance_summary(raw_summary, product_name, review_rating):
    """Turn a raw model summary into a short customer-feedback narrative.

    Uses the module-level ``narrative_model`` / ``narrative_tokenizer``. When
    either is None, when generation raises, or when the generated text is
    shorter than 20 characters, a plain formatted string is returned instead.

    Args:
        raw_summary: Summary text to rewrite.
        product_name: Product name placed in the prompt.
        review_rating: Rating placed in the prompt and in the fallback string.

    Returns:
        The generated narrative, or
        ``"Customer feedback (<review_rating>): <raw_summary>"`` as fallback.
    """
    fallback = f"Customer feedback ({review_rating}): {raw_summary}"

    if narrative_model is None or narrative_tokenizer is None:
        # Fallback: Enhanced formatting without LLM
        return fallback

    # Qwen2.5 chat format
    messages = [
        {
            "role": "system",
            "content": "You transform product summaries into natural 2-3 sentence customer feedback narratives."
        },
        {
            "role": "user",
            "content": f"Product: {product_name}\nRating: {review_rating}\nReview: {raw_summary}\n\nWrite a professional customer feedback narrative using phrases like 'The customer mentioned...', 'The reviewer noted...'. Be concise and factual."
        }
    ]

    try:
        # Render the chat messages into a single prompt string
        text = narrative_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Send the inputs to whichever device holds the model's parameters
        model_device = next(narrative_model.parameters()).device
        inputs = narrative_tokenizer([text], return_tensors="pt", truncation=True, max_length=512).to(model_device)

        # Compare against None explicitly: a token id of 0 is valid but falsy
        eos_id = narrative_tokenizer.eos_token_id
        pad_id = eos_id if eos_id is not None else narrative_tokenizer.pad_token_id

        with torch.no_grad():
            outputs = narrative_model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=pad_id
            )

        # Decode only the tokens generated after the prompt
        prompt_length = inputs['input_ids'].shape[1]
        narrative = narrative_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        narrative = _strip_llm_preamble(narrative)

        # Fallback if too short
        if len(narrative) < 20:
            return fallback

        return narrative

    except Exception as e:
        # Best effort: one failed generation must not stop the run, but report
        # it so systematic failures (e.g. out-of-memory) are visible.
        print(f"  ⚠️ Narrative generation failed ({type(e).__name__}: {e}); using simple formatting")
        return fallback

def calculate_rouge(reference, hypothesis):
    """Score ``hypothesis`` against ``reference`` and return ROUGE F-measures.

    Returns:
        Dict with keys ``'ROUGE-1'``, ``'ROUGE-2'`` and ``'ROUGE-L'`` (in that
        order), each holding the F-measure of the corresponding metric.
    """
    # Scorer metric name -> key used in the returned dict
    metric_labels = {'rouge1': 'ROUGE-1', 'rouge2': 'ROUGE-2', 'rougeL': 'ROUGE-L'}

    scorer = rouge_scorer.RougeScorer(list(metric_labels), use_stemmer=True)
    scored = scorer.score(reference, hypothesis)

    return {label: scored[metric].fmeasure for metric, label in metric_labels.items()}

def select_best_summary(bart_summary, t5_summary, bart_rouge, t5_rouge):
    """Pick the summary whose ROUGE scores have the higher mean.

    The mean is the sum of the score dict's values divided by 3. BART is
    chosen when its mean is greater than or equal to T5's.

    Returns:
        Dict with ``model_name``, ``summary``, ``avg_score`` and
        ``rouge_scores`` for the chosen model.
    """
    bart_mean = sum(bart_rouge.values()) / 3
    t5_mean = sum(t5_rouge.values()) / 3

    if bart_mean >= t5_mean:
        winner = ('BART', bart_summary, bart_mean, bart_rouge)
    else:
        winner = ('T5', t5_summary, t5_mean, t5_rouge)

    return dict(zip(('model_name', 'summary', 'avg_score', 'rouge_scores'), winner))

def append_to_files(batch_results, is_first_batch=False):
    """Write one batch of result dicts to both output CSV files.

    Args:
        batch_results: List of per-product result dicts.
        is_first_batch: When True the files are opened in write mode and a
            header row is emitted; otherwise rows are appended without one.
    """
    frame = pd.DataFrame(batch_results)
    write_mode = 'w' if is_first_batch else 'a'

    # Full result set
    frame.to_csv(COMPLETE_RESULTS_FILE, mode=write_mode, header=is_first_batch, index=False)

    # Narrative file carries only this subset of columns
    narrative_columns = [
        'name', 'main_category', 'sub_category', 'review_rating', 'no_of_ratings',
        'discount_price', 'actual_price', 'best_model', 'raw_best_summary',
        'cleaned_narrative', 'best_avg_score', 'link',
    ]
    frame[narrative_columns].to_csv(FINAL_NARRATIVES_FILE, mode=write_mode, header=is_first_batch, index=False)

    # Report how large each file has grown, in MB
    output_files = (COMPLETE_RESULTS_FILE, FINAL_NARRATIVES_FILE)
    sizes_mb = [os.path.getsize(path) / (1024 * 1024) for path in output_files]

    print(f"  ✓ Appended {len(batch_results)} products")
    for path, size_mb in zip(output_files, sizes_mb):
        print(f"    {path}: {size_mb:.2f} MB")

# ============================================================================
# GENERATE SUMMARIES AND NARRATIVES (WITH INCREMENTAL SAVING)
# ============================================================================
# Main pass over every product that has a review: summarize with BART and T5,
# score both, keep the better one, turn it into a narrative, and flush the
# buffered results to disk every BATCH_SIZE products.
print("\n" + "="*80)
print("GENERATING SUMMARIES AND NARRATIVES (INCREMENTAL MODE)")
print("="*80)

current_batch = []  # results waiting to be written
total_processed = 0  # products handled so far
batch_count = 0  # batches written so far; 0 means the files have no header yet
model_selection_count = {'BART': 0, 'T5': 0}  # how often each model was chosen
start_total = time.time()
all_rouge_scores = {'bart': [], 't5': []}  # per-product score dicts, used for the final averages

for idx, row in tqdm(df_with_reviews.iterrows(), total=len(df_with_reviews), desc="Processing"):
    product_start = time.time()
    review_text = str(row['review_text'])
    
    # Generate summaries
    bart_summary = generate_summary(bart_model, bart_tokenizer, review_text, 'bart')
    t5_summary = generate_summary(t5_model, t5_tokenizer, review_text, 't5')
    
    # Calculate ROUGE
    # The reference is the first five '.'-separated pieces of the review itself
    sentences = review_text.split('.')
    reference = '.'.join(sentences[:5]).strip() + '.'
    
    bart_rouge = calculate_rouge(reference, bart_summary)
    t5_rouge = calculate_rouge(reference, t5_summary)
    
    # Track ROUGE scores
    all_rouge_scores['bart'].append(bart_rouge)
    all_rouge_scores['t5'].append(t5_rouge)
    
    # Select best
    best = select_best_summary(bart_summary, t5_summary, bart_rouge, t5_rouge)
    model_selection_count[best['model_name']] += 1
    
    # Generate narrative
    narrative = clean_and_enhance_summary(best['summary'], row['name'], row['review_rating'])
    
    # One output row; the key order here becomes the CSV column order
    result = {
        'name': row['name'],
        'main_category': row['main_category'],
        'sub_category': row['sub_category'],
        'image': row['image'],
        'link': row['link'],
        'no_of_ratings': row['no_of_ratings'],
        'discount_price': row['discount_price'],
        'actual_price': row['actual_price'],
        'review_rating': row['review_rating'],
        'original_review_text': review_text,
        'bart_summary': bart_summary,
        'bart_rouge1': bart_rouge['ROUGE-1'],
        'bart_rouge2': bart_rouge['ROUGE-2'],
        'bart_rougeL': bart_rouge['ROUGE-L'],
        'bart_avg_rouge': sum(bart_rouge.values()) / 3,
        't5_summary': t5_summary,
        't5_rouge1': t5_rouge['ROUGE-1'],
        't5_rouge2': t5_rouge['ROUGE-2'],
        't5_rougeL': t5_rouge['ROUGE-L'],
        't5_avg_rouge': sum(t5_rouge.values()) / 3,
        'best_model': best['model_name'],
        'raw_best_summary': best['summary'],
        'best_avg_score': best['avg_score'],
        'cleaned_narrative': narrative,
        'qwen_used': narrative_model is not None,
        'processing_time_seconds': time.time() - product_start
    }
    
    current_batch.append(result)
    total_processed += 1
    
    # Append to files when batch is full
    if len(current_batch) >= BATCH_SIZE:
        print(f"\n{'='*80}")
        print(f"SAVING BATCH {batch_count + 1} ({total_processed - len(current_batch) + 1}-{total_processed})")
        print(f"{'='*80}")
        # Only the very first write creates the files and emits the header
        is_first = (batch_count == 0)
        append_to_files(current_batch, is_first_batch=is_first)
        current_batch = []
        batch_count += 1
    
    # Progress update
    # Every 10 products: the ETA assumes the average time per product so far holds
    if total_processed % 10 == 0:
        elapsed = time.time() - start_total
        avg_time = elapsed / total_processed
        remaining = (len(df_with_reviews) - total_processed) * avg_time
        print(f"\n  Progress: {total_processed}/{len(df_with_reviews)} | Avg: {avg_time:.1f}s/product | ETA: {remaining/60:.1f} min")

# Save final incomplete batch if any
if len(current_batch) > 0:
    print(f"\n{'='*80}")
    print(f"SAVING FINAL BATCH ({total_processed - len(current_batch) + 1}-{total_processed})")
    print(f"{'='*80}")
    is_first = (batch_count == 0)
    append_to_files(current_batch, is_first_batch=is_first)
    batch_count += 1

# ============================================================================
# HANDLE PRODUCTS WITHOUT REVIEWS
# ============================================================================
print("\n" + "=" * 80)
print("HANDLING PRODUCTS WITHOUT REVIEWS")
print("=" * 80)

if len(df_without_reviews) > 0:
    # Columns copied straight from the source row. They come first so the
    # resulting key order matches the rows written for reviewed products.
    passthrough_columns = [
        'name', 'main_category', 'sub_category', 'image', 'link',
        'no_of_ratings', 'discount_price', 'actual_price', 'review_rating',
    ]
    # Placeholder values for every field the models would otherwise fill in
    empty_summary_fields = {
        'original_review_text': '',
        'bart_summary': '',
        'bart_rouge1': 0.0,
        'bart_rouge2': 0.0,
        'bart_rougeL': 0.0,
        'bart_avg_rouge': 0.0,
        't5_summary': '',
        't5_rouge1': 0.0,
        't5_rouge2': 0.0,
        't5_rougeL': 0.0,
        't5_avg_rouge': 0.0,
        'best_model': 'N/A',
        'raw_best_summary': '',
        'best_avg_score': 0.0,
        'cleaned_narrative': 'No customer review available for this product.',
        'qwen_used': False,
        'processing_time_seconds': 0.0,
    }

    no_review_results = [
        {**{column: row[column] for column in passthrough_columns}, **empty_summary_fields}
        for _, row in df_without_reviews.iterrows()
    ]

    print(f"Appending {len(no_review_results)} products without reviews...")
    # If no reviewed product was written, this write creates the files
    is_first = (batch_count == 0)
    append_to_files(no_review_results, is_first_batch=is_first)
    total_processed += len(no_review_results)

# ============================================================================
# DISPLAY FINAL METRICS AND SAMPLES
# ============================================================================
print("\n" + "=" * 80)
print("OVERALL METRICS")
print("=" * 80)

# Mean of each ROUGE metric over all reviewed products, per model
bart_scores = all_rouge_scores['bart']
t5_scores = all_rouge_scores['t5']
rouge_metrics = ('ROUGE-1', 'ROUGE-2', 'ROUGE-L')

if bart_scores:
    bart_avg_rouge1, bart_avg_rouge2, bart_avg_rougeL = (
        sum(entry[metric] for entry in bart_scores) / len(bart_scores)
        for metric in rouge_metrics
    )
    t5_avg_rouge1, t5_avg_rouge2, t5_avg_rougeL = (
        sum(entry[metric] for entry in t5_scores) / len(t5_scores)
        for metric in rouge_metrics
    )

    print(f"\nBART: ROUGE-1={bart_avg_rouge1:.4f}, ROUGE-2={bart_avg_rouge2:.4f}, ROUGE-L={bart_avg_rougeL:.4f}")
    print(f"T5:   ROUGE-1={t5_avg_rouge1:.4f}, ROUGE-2={t5_avg_rouge2:.4f}, ROUGE-L={t5_avg_rougeL:.4f}")

    # Share of products for which each model was selected
    total_with_reviews = len(bart_scores)
    bart_wins = model_selection_count['BART']
    t5_wins = model_selection_count['T5']
    print(f"\nBest Model: BART={bart_wins} ({bart_wins/total_with_reviews*100:.1f}%), "
          f"T5={t5_wins} ({t5_wins/total_with_reviews*100:.1f}%)")
else:
    print("No products with reviews to calculate metrics")

qwen_status = 'ENABLED' if narrative_model is not None else 'DISABLED (using simple formatting)'
print(f"\nQwen Enhancement: {qwen_status}")
print(f"Total products processed: {total_processed}")
print(f"Products with reviews: {len(bart_scores)}")
print(f"Products without reviews: {len(df_without_reviews)}")
print(f"Total batches saved: {batch_count}")
print(f"Total processing time: {(time.time()-start_total)/60:.1f} min")

# Final size of each output file, in MB
bytes_per_mb = 1024 * 1024
complete_size = os.path.getsize(COMPLETE_RESULTS_FILE) / bytes_per_mb
narrative_size = os.path.getsize(FINAL_NARRATIVES_FILE) / bytes_per_mb

print("\n✓ Final files:")
print(f"  {COMPLETE_RESULTS_FILE}: {complete_size:.2f} MB ({total_processed} rows)")
print(f"  {FINAL_NARRATIVES_FILE}: {narrative_size:.2f} MB ({total_processed} rows)")

# Display samples from final file
print("\n" + "="*80)
print("SAMPLE RESULTS FROM FINAL FILE (First 3 with reviews)")
print("="*80)

try:
    # Read first few rows to display
    sample_df = pd.read_csv(COMPLETE_RESULTS_FILE, nrows=min(100, total_processed))

    # pd.read_csv parses empty CSV fields as NaN, so comparing against ''
    # alone would keep the no-review rows. Normalise NaN back to '' first.
    has_summary = sample_df['bart_summary'].fillna('').astype(str).str.strip() != ''
    sample_with_reviews = sample_df[has_summary]

    for idx, (_, row) in enumerate(sample_with_reviews.head(3).iterrows()):
        # str() keeps the slicing safe when a field was read back as NaN or
        # as a number instead of text.
        print(f"\n[{idx+1}] {str(row['name'])[:60]}")
        print(f"  Rating: {row['review_rating']} | Price: {row['discount_price']}")
        print(f"  BEST ({row['best_model']}): {str(row['raw_best_summary'])[:100]}...")
        print(f"  NARRATIVE: {str(row['cleaned_narrative'])[:150]}...")
except Exception as e:
    # Showing samples is best effort; the output files are already written
    print(f"Could not read sample: {e}")

print(f"\n{'='*80}")
print(f"✓ COMPLETE! {total_processed} products processed in {(time.time()-start_total)/60:.1f} min")
print("✓ Files built incrementally and ready to use")
# Same test-mode check as the data preparation step (None means full dataset)
if SAMPLE_SIZE is not None:
    print("⚠️ Set SAMPLE_SIZE=None for full dataset")
print(f"{'='*80}")