<a href="https://colab.research.google.com/github/Akshayk05/shopping_webscrapped/blob/main/RAG_E_commerce_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the Kaggle API token (kaggle.json) through the Colab file picker.
# NOTE(review): the recorded output shows a re-upload being stored as
# "kaggle (1).json"; in that case the `cp kaggle.json` below presumably copies
# the previously uploaded file rather than the new one — confirm before relying on it.
from google.colab import files
files.upload()  # Upload kaggle.json

# Copy the token into ~/.kaggle and restrict its permissions to the owner (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle (1).json


In [None]:
# Download the fashion product images dataset with the Kaggle CLI.
# NOTE(review): the recorded output ends in ^C (the download was interrupted) and
# the following cells read /amazon_review.csv instead of this dataset — confirm
# whether this download is still needed.
!kaggle datasets download paramaggarwal/fashion-product-images-dataset

Dataset URL: https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-dataset
License(s): MIT
^C


In [None]:
# pandas is imported here because this cell runs before the notebook's later
# import cells; without it a fresh kernel fails with NameError on `pd`.
import pandas as pd

# Load the Amazon review dataset used by the embedding cells below.
df = pd.read_csv('/amazon_review.csv')

In [None]:
# Preview the first rows to check the column names and values.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

sentences = df['reviewText']

# Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode sentences into embeddings (384-dimensional)
# Missing reviews are replaced by '' and the column is passed as a plain list of
# str, matching how the later embedding helpers in this notebook prepare their
# input; a NaN entry is a float and cannot be tokenized as text.
embeddings = model.encode(sentences.fillna('').astype(str).tolist())

In [None]:
# Display the embedding matrix computed above (the recorded output is a float32 array).
embeddings

array([[-0.0515621 , -0.0049552 , -0.01138962, ..., -0.03003336,
         0.0037141 ,  0.06514838],
       [-0.05087488, -0.01398398,  0.02898018, ...,  0.11257029,
        -0.02828423,  0.04293005],
       [ 0.01495923,  0.00693341,  0.02100714, ..., -0.00273265,
        -0.03117694,  0.04472402],
       ...,
       [-0.00816563,  0.00361438, -0.03908691, ...,  0.07105766,
        -0.04820919,  0.06437703],
       [ 0.01102626, -0.03096953, -0.00782765, ...,  0.01698087,
        -0.08964052,  0.03142858],
       [ 0.04044577,  0.0227167 ,  0.01711593, ..., -0.04567083,
        -0.09873553,  0.10774012]], dtype=float32)

In [None]:
import pandas as pd
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import openai
from typing import List, Dict, Tuple, Optional
import os
from PIL import Image
import base64
import io
import json


class MultimodalRAGPipeline:
    """
    Retrieval-augmented question answering over product reviews.

    Review texts are embedded with a SentenceTransformer model, the ``top_k``
    reviews most similar to a query are selected by cosine similarity, and an
    OpenAI chat model writes the answer from that context. Product images found
    on disk can optionally be attached to the prompt.
    """

    def __init__(self,
                 openai_api_key: str,
                 embedding_model_name: str = 'all-MiniLM-L6-v2',
                 top_k: int = 5):
        """
        Initialize the Multimodal RAG Pipeline

        Args:
            openai_api_key: OpenAI API key for LLM generation
            embedding_model_name: SentenceTransformer model for embeddings
            top_k: Number of top documents to retrieve
        """
        self.openai_client = openai.OpenAI(api_key=openai_api_key)
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.top_k = top_k

        # Storage for embeddings and data (filled in by load_data)
        self.review_embeddings = None
        self.review_data = None
        self.product_images = {}  # Dict mapping asin to image paths

    @staticmethod
    def _text_or_empty(value) -> str:
        """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
        try:
            is_missing = bool(pd.isna(value))
        except (TypeError, ValueError):
            # pd.isna returns an array for list-like values; treat those as present.
            is_missing = False
        return '' if is_missing else str(value)

    def load_data(self,
                  csv_path: str,
                  embeddings_path: Optional[str] = None,
                  images_folder: Optional[str] = None):
        """
        Load review data and embeddings

        Args:
            csv_path: Path to CSV file with review data
            embeddings_path: Path to saved embeddings (optional). When the file
                exists and has one row per review it is loaded; otherwise the
                embeddings are computed and written to this path.
            images_folder: Path to folder containing product images
        """
        # Load review data
        self.review_data = pd.read_csv(csv_path)
        print(f"Loaded {len(self.review_data)} reviews")

        # Load cached embeddings when they exist and still match the data
        cached_embeddings = None
        if embeddings_path and os.path.exists(embeddings_path):
            # NOTE: pickle.load can execute arbitrary code; only point this at
            # cache files created by this pipeline.
            with open(embeddings_path, 'rb') as f:
                cached_embeddings = pickle.load(f)
            if len(cached_embeddings) != len(self.review_data):
                # A cache built from a different CSV would pair reviews with the
                # wrong vectors, so it is discarded and rebuilt.
                print(f"Cached embeddings have {len(cached_embeddings)} rows but the data "
                      f"has {len(self.review_data)} reviews; recomputing")
                cached_embeddings = None

        if cached_embeddings is not None:
            self.review_embeddings = cached_embeddings
            print(f"Loaded pre-computed embeddings: {self.review_embeddings.shape}")
        else:
            print("Computing embeddings for review texts...")
            review_texts = self.review_data['reviewText'].fillna('').astype(str)
            self.review_embeddings = self.embedding_model.encode(
                review_texts.tolist(),
                show_progress_bar=True
            )

            # Save embeddings for future use
            if embeddings_path:
                with open(embeddings_path, 'wb') as f:
                    pickle.dump(self.review_embeddings, f)
                print(f"Saved embeddings to {embeddings_path}")

        # Load product images mapping
        if images_folder and os.path.exists(images_folder):
            self._load_product_images(images_folder)

    def _load_product_images(self, images_folder: str):
        """Load mapping of product ASINs to image paths"""
        # Sorted so the "first image" of a product is the same on every run;
        # os.listdir returns entries in arbitrary order.
        for filename in sorted(os.listdir(images_folder)):
            if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                # Assume filename format: asin_index.jpg or asin.jpg
                asin = filename.split('_')[0].split('.')[0]
                self.product_images.setdefault(asin, []).append(
                    os.path.join(images_folder, filename)
                )
        print(f"Loaded images for {len(self.product_images)} products")

    def encode_query(self, query: str) -> np.ndarray:
        """
        Encode user query into embedding vector

        Args:
            query: User query string

        Returns:
            Query embedding as returned by the embedding model for the
            one-element list ``[query]``
        """
        return self.embedding_model.encode([query])

    def retrieve_relevant_documents(self,
                                  query: str,
                                  filter_asin: Optional[str] = None) -> List[Dict]:
        """
        Retrieve top-k relevant documents using similarity search

        Args:
            query: User query string
            filter_asin: Optional ASIN to filter results by specific product

        Returns:
            List of relevant documents with metadata, most similar first.
            Missing text fields (NaN in the CSV) are returned as ''.

        Raises:
            RuntimeError: If load_data() has not been called yet.
        """
        if self.review_data is None or self.review_embeddings is None:
            raise RuntimeError("No review data loaded; call load_data() first.")

        # Filter data if asin specified
        if filter_asin:
            # A plain boolean array indexes both the frame and the embedding matrix.
            mask = (self.review_data['asin'] == filter_asin).to_numpy()
            filtered_data = self.review_data[mask]
            filtered_embeddings = self.review_embeddings[mask]
        else:
            filtered_data = self.review_data
            filtered_embeddings = self.review_embeddings

        if len(filtered_data) == 0:
            return []

        # Encode query and compute cosine similarity against the candidates
        query_embedding = self.encode_query(query)
        similarities = cosine_similarity(query_embedding, filtered_embeddings)[0]

        # Get top-k indices (descending similarity)
        top_indices = np.argsort(similarities)[::-1][:self.top_k]

        # Prepare retrieved documents
        retrieved_docs = []
        for idx in top_indices:
            doc_data = filtered_data.iloc[idx]

            retrieved_docs.append({
                'asin': doc_data['asin'],
                'reviewText': self._text_or_empty(doc_data['reviewText']),
                'summary': self._text_or_empty(doc_data.get('summary', '')),
                'overall': doc_data['overall'],
                'reviewerName': self._text_or_empty(doc_data.get('reviewerName', '')),
                'helpful_yes': doc_data.get('helpful_yes', 0),
                'total_vote': doc_data.get('total_vote', 0),
                'similarity_score': float(similarities[idx]),
                'images': self.product_images.get(doc_data['asin'], [])
            })

        return retrieved_docs

    def encode_image_for_gpt4v(self, image_path: str) -> Optional[str]:
        """
        Encode image to base64 for GPT-4V

        Args:
            image_path: Path to image file

        Returns:
            Base64 encoded JPEG string, or None when the image cannot be read
            or encoded (the error is printed)
        """
        try:
            with Image.open(image_path) as img:
                # JPEG cannot store alpha or palette modes, so anything that is
                # not already RGB is converted before saving.
                frame = img if img.mode == "RGB" else img.convert("RGB")

                # Resize image if too large (optional)
                frame.thumbnail((512, 512))

                # Convert to base64
                buffered = io.BytesIO()
                frame.save(buffered, format="JPEG")
                return base64.b64encode(buffered.getvalue()).decode()
        except Exception as e:
            # Best effort: a broken image must not abort answer generation.
            print(f"Error encoding image {image_path}: {e}")
            return None

    def generate_answer(self,
                       query: str,
                       retrieved_docs: List[Dict],
                       include_images: bool = True) -> str:
        """
        Generate answer using OpenAI GPT with retrieved context

        Args:
            query: Original user query
            retrieved_docs: Retrieved relevant documents
            include_images: Whether to include images in the prompt

        Returns:
            Generated answer string; if the API call fails, a string that
            starts with "Error generating response:"
        """
        if not retrieved_docs:
            return "I couldn't find relevant information to answer your query."

        # Prepare text context
        context_parts = []
        for doc in retrieved_docs:
            context_part = f"""
Product: {doc['asin']}
Rating: {doc['overall']}/5
Review Summary: {doc['summary']}
Review: {doc['reviewText']}
Helpfulness: {doc['helpful_yes']}/{doc['total_vote']} found helpful
Similarity Score: {doc['similarity_score']:.3f}
---"""
            context_parts.append(context_part)

        context_text = "\n".join(context_parts)

        # Prepare messages for OpenAI
        messages = [
            {
                "role": "system",
                "content": """You are a knowledgeable shopping assistant. Answer user queries about products based on the provided review context and images.

Guidelines:
- Provide specific, helpful answers based on the review data
- Mention relevant details from reviews (ratings, user experiences)
- If reviews mention specific pros/cons, include them
- Be honest about limitations in the data
- Reference the overall sentiment from reviews
- If images are provided, incorporate visual information when relevant"""
            },
            {
                "role": "user",
                "content": f"""Query: {query}

Review Context:
{context_text}

Based on this information, please provide a comprehensive answer to the user's question."""
            }
        ]

        # Text-only model unless at least one image is actually attached
        model = "gpt-4o-mini"

        # Add images if available and requested
        if include_images:
            image_contents = []
            attached_paths = set()
            for doc in retrieved_docs[:2]:  # Limit to first 2 docs to avoid token limits
                for img_path in doc.get('images', [])[:1]:  # One image per product
                    if img_path in attached_paths:
                        # Two reviews of the same product share an image; send it once.
                        continue
                    img_base64 = self.encode_image_for_gpt4v(img_path)
                    if img_base64:
                        attached_paths.add(img_path)
                        image_contents.append({
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_base64}"
                            }
                        })

            if image_contents:
                # Use GPT-4V for multimodal response
                messages[-1]["content"] = [
                    {
                        "type": "text",
                        "text": messages[-1]["content"]
                    }
                ] + image_contents

                model = "gpt-4o"  # or "gpt-4-vision-preview"

        try:
            response = self.openai_client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=500,
                temperature=0.7
            )

            return response.choices[0].message.content

        except Exception as e:
            # Deliberately broad: the error is reported in the returned text.
            print(f"Error generating answer: {e}")
            return f"Error generating response: {str(e)}"

    def query(self,
              user_query: str,
              filter_asin: Optional[str] = None,
              include_images: bool = True) -> Dict:
        """
        Complete RAG pipeline: retrieve and generate answer

        Args:
            user_query: User's question
            filter_asin: Optional product ASIN to filter by
            include_images: Whether to include images in generation

        Returns:
            Dictionary with answer and metadata
        """
        print(f"Processing query: {user_query}")

        # Retrieve relevant documents
        retrieved_docs = self.retrieve_relevant_documents(user_query, filter_asin)

        if not retrieved_docs:
            return {
                'query': user_query,
                'answer': "I couldn't find relevant information to answer your query.",
                'retrieved_docs': [],
                'num_products': 0
            }

        # Generate answer
        answer = self.generate_answer(user_query, retrieved_docs, include_images)

        return {
            'query': user_query,
            'answer': answer,
            'retrieved_docs': retrieved_docs,
            'num_products': len(set(doc['asin'] for doc in retrieved_docs))
        }

# Example usage and testing
def main(openai_api_key: Optional[str] = None,
         csv_path: str = "reviews_dataset.csv",
         embeddings_path: str = "review_embeddings.pkl",
         images_folder: str = "product_images/",
         test_queries: Optional[List[str]] = None,
         top_k: int = 5):
    """
    Run a small end-to-end demo of MultimodalRAGPipeline.

    Called with no arguments it behaves as before, except that the API key is
    taken from the OPENAI_API_KEY environment variable when that is set.

    Args:
        openai_api_key: OpenAI API key. When None, the OPENAI_API_KEY
            environment variable is used, falling back to the original
            placeholder string.
        csv_path: Path to the CSV file with review data.
        embeddings_path: Path of the embeddings cache file.
        images_folder: Folder containing product images.
        test_queries: Questions to run; defaults to the built-in demo list.
        top_k: Number of reviews retrieved per query.
    """
    if openai_api_key is None:
        # Keeps real keys out of the notebook source.
        openai_api_key = os.environ.get("OPENAI_API_KEY", "your-openai-api-key-here")

    # Initialize pipeline
    rag_pipeline = MultimodalRAGPipeline(
        openai_api_key=openai_api_key,
        top_k=top_k
    )

    # Load data
    rag_pipeline.load_data(
        csv_path=csv_path,
        embeddings_path=embeddings_path,
        images_folder=images_folder
    )

    # Example queries for testing
    if test_queries is None:
        test_queries = [
            "Are these shoes good for hiking?",
            "Do these headphones work well on flights?",
            "What do people say about battery life?",
            "How is the comfort and fit?",
            "What are the main complaints about this product?",
            "Is this product worth the price?"
        ]

    print("=" * 60)
    print("MULTIMODAL RAG PIPELINE - DEMO")
    print("=" * 60)

    for query in test_queries:
        print(f"\n🔍 Query: {query}")
        print("-" * 40)

        result = rag_pipeline.query(query)

        print(f"📝 Answer: {result['answer']}")
        print(f"📊 Retrieved from {result['num_products']} products")
        print(f"🔗 Based on {len(result['retrieved_docs'])} reviews")

        # Show top retrieved document info
        if result['retrieved_docs']:
            top_doc = result['retrieved_docs'][0]
            print(f"🎯 Top match: ASIN {top_doc['asin']} "
                  f"(similarity: {top_doc['similarity_score']:.3f})")

        print("=" * 60)

# Run the demo when this code is executed as the main module.
# NOTE(review): inside a notebook kernel this guard is presumably true, so running
# this cell calls main() immediately — confirm that is intended.
if __name__ == "__main__":
    main()

In [None]:
# Multimodal RAG Pipeline - Interactive Demo
# Run this in Jupyter Notebook or Google Colab

# Install required packages
# !pip install sentence-transformers openai scikit-learn pillow pandas numpy

import pandas as pd
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import openai
import os
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')

# ================================
# CONFIGURATION
# ================================

# OpenAI API key: read from the OPENAI_API_KEY environment variable so the real
# key never has to be written into the notebook. The placeholder is kept as a
# fallback so the cell still runs when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-openai-api-key-here")

# File paths (adjust according to your setup)
CSV_PATH = "reviews_dataset.csv"
EMBEDDINGS_PATH = "review_embeddings.pkl"
IMAGES_FOLDER = "product_images/"

# ================================
# QUICK SETUP FUNCTIONS
# ================================

def setup_rag_pipeline():
    """
    Create the two shared objects the demo needs.

    Returns:
        A pair ``(client, embedding_model)``: the OpenAI client built from
        OPENAI_API_KEY and the 'all-MiniLM-L6-v2' SentenceTransformer.
    """
    print("🚀 Setting up Multimodal RAG Pipeline...")

    # The key is set on the module as well as handed to the client object.
    api_key = OPENAI_API_KEY
    openai.api_key = api_key
    openai_client = openai.OpenAI(api_key=api_key)

    print("📥 Loading embedding model...")
    return openai_client, SentenceTransformer('all-MiniLM-L6-v2')

def load_review_data(csv_path: str):
    """
    Read the review CSV and print a short summary of what it contains.

    Args:
        csv_path: Location of the CSV file holding the reviews.

    Returns:
        The reviews as a pandas DataFrame, exactly as read from disk.
    """
    print(f"📊 Loading review data from {csv_path}...")
    reviews = pd.read_csv(csv_path)

    # Headline numbers: row count, distinct products, column names.
    print(f"✅ Loaded {len(reviews)} reviews")
    print(f"📦 Products: {reviews['asin'].nunique()} unique ASINs")
    print(f"📝 Columns: {list(reviews.columns)}")

    # Two-row preview restricted to the preferred columns that are present.
    print("\n📋 Sample data:")
    preview_cols = []
    for wanted in ('asin', 'overall', 'summary', 'reviewText'):
        if wanted in reviews.columns:
            preview_cols.append(wanted)
    print(reviews.loc[:, preview_cols].iloc[:2])

    return reviews

def create_or_load_embeddings(df: pd.DataFrame,
                             embedding_model,
                             embeddings_path: str):
    """
    Create embeddings for review texts or load if exists

    A cache file is reused only when it holds exactly one embedding per row of
    ``df``. A cache with a different row count was built from other data and
    would pair reviews with the wrong vectors, so it is recomputed and
    overwritten.

    Args:
        df: Review data; must contain a 'reviewText' column.
        embedding_model: Object with an ``encode(texts, show_progress_bar=...,
            batch_size=...)`` method (e.g. a SentenceTransformer).
        embeddings_path: Path of the pickle cache file.

    Returns:
        The embeddings, one row per row of ``df``.
    """

    if os.path.exists(embeddings_path):
        print(f"📂 Loading existing embeddings from {embeddings_path}...")
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that this notebook created.
        with open(embeddings_path, 'rb') as f:
            embeddings = pickle.load(f)
        if len(embeddings) == len(df):
            print(f"✅ Loaded embeddings shape: {embeddings.shape}")
            return embeddings
        print(f"⚠️ Cached embeddings have {len(embeddings)} rows but the data has "
              f"{len(df)} reviews; recomputing...")

    print("🔄 Computing embeddings for review texts...")
    # Missing reviews become '' so every row gets an embedding.
    review_texts = df['reviewText'].fillna('').astype(str)
    embeddings = embedding_model.encode(
        review_texts.tolist(),
        show_progress_bar=True,
        batch_size=32
    )

    # Save for future use
    print(f"💾 Saving embeddings to {embeddings_path}...")
    with open(embeddings_path, 'wb') as f:
        pickle.dump(embeddings, f)
    print(f"✅ Created and saved embeddings shape: {embeddings.shape}")

    return embeddings

# ================================
# CORE RAG FUNCTIONS
# ================================

def retrieve_documents(query: str,
                      df: pd.DataFrame,
                      embeddings: np.ndarray,
                      embedding_model,
                      top_k: int = 5,
                      filter_asin: str = None):
    """
    Retrieve relevant documents for a query

    Args:
        query: User question.
        df: Review data with at least 'asin', 'reviewText' and 'overall' columns.
        embeddings: One embedding row per row of ``df``, in the same order.
        embedding_model: Object with an ``encode(list_of_str)`` method.
        top_k: Maximum number of documents to return.
        filter_asin: When given, only reviews of this ASIN are searched.

    Returns:
        List of result dicts ordered by decreasing similarity. 'reviewText' is a
        preview of at most 200 characters (followed by "..." only when the text
        was actually cut); 'full_reviewText' is the whole review. A missing
        review text is returned as ''.

    Raises:
        ValueError: If ``embeddings`` and ``df`` have different lengths.
    """

    print(f"🔍 Searching for: '{query}'")

    if len(embeddings) != len(df):
        # Misaligned inputs would attach similarities to the wrong reviews.
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but df has {len(df)} rows"
        )

    # Filter by ASIN if specified
    if filter_asin:
        # A plain boolean array indexes both the frame and the embedding matrix.
        mask = (df['asin'] == filter_asin).to_numpy()
        filtered_df = df[mask]
        filtered_embeddings = embeddings[mask]
        print(f"🎯 Filtering by ASIN: {filter_asin} ({len(filtered_df)} reviews)")
    else:
        # The frame is only read below, so no copy is needed.
        filtered_df = df
        filtered_embeddings = embeddings

    if len(filtered_df) == 0:
        print("❌ No documents found matching criteria")
        return []

    # Encode query and compute similarities
    query_embedding = embedding_model.encode([query])
    similarities = cosine_similarity(query_embedding, filtered_embeddings)[0]

    # Get top-k (descending similarity)
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Prepare results
    results = []
    for rank, idx in enumerate(top_indices, start=1):
        doc = filtered_df.iloc[idx]

        # A missing review is NaN (a float) in the frame and cannot be sliced.
        raw_text = doc['reviewText']
        full_text = '' if pd.isna(raw_text) else str(raw_text)
        preview = full_text[:200] + "..." if len(full_text) > 200 else full_text

        results.append({
            'rank': rank,
            'asin': doc['asin'],
            'reviewText': preview,  # Truncate for display
            'full_reviewText': full_text,
            'summary': doc.get('summary', 'N/A'),
            'overall': doc['overall'],
            'similarity': float(similarities[idx]),
            'helpful_yes': doc.get('helpful_yes', 0),
            'total_vote': doc.get('total_vote', 0)
        })

    print(f"📋 Retrieved {len(results)} relevant documents")
    return results

def generate_answer(query: str,
                   retrieved_docs: List[Dict],
                   openai_client):
    """
    Ask the chat model to answer ``query`` from the retrieved reviews.

    Args:
        query: The user's question.
        retrieved_docs: Result dicts as produced by retrieve_documents.
        openai_client: Client exposing ``chat.completions.create``.

    Returns:
        The model's reply text. When ``retrieved_docs`` is empty a fixed
        "no information" message is returned without calling the API, and when
        the API call raises, an "Error: ..." string describing the failure.
    """
    if not retrieved_docs:
        return "❌ No relevant information found to answer your query."

    def _format_review(doc: Dict) -> str:
        # One review rendered as a block that starts with a blank line and
        # ends with a '---' separator.
        return "\n".join((
            "",
            f"Product: {doc['asin']}",
            f"Rating: {doc['overall']}/5 stars",
            f"Review Summary: {doc['summary']}",
            f"Review: {doc['full_reviewText']}",
            f"Helpfulness: {doc['helpful_yes']}/{doc['total_vote']} found helpful",
            f"Relevance Score: {doc['similarity']:.3f}",
            "---",
        ))

    context = "\n".join([_format_review(doc) for doc in retrieved_docs])

    # Full user prompt: question, review context, answering instructions.
    user_prompt = f"""You are a helpful shopping assistant. Answer the user's question based on the product review information provided.

User Question: {query}

Product Review Context:
{context}

Instructions:
- Provide a specific, helpful answer based on the reviews
- Mention relevant ratings and user experiences
- Highlight both positive and negative aspects if mentioned
- Be honest about what the reviews do/don't cover
- Keep your response concise but informative

Answer:"""

    chat_messages = [
        {"role": "system", "content": "You are a knowledgeable shopping assistant."},
        {"role": "user", "content": user_prompt},
    ]

    try:
        print("🤖 Generating answer with OpenAI...")
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",  # Use gpt-4o for better results if available
            messages=chat_messages,
            max_tokens=400,
            temperature=0.7
        )
        return completion.choices[0].message.content
    except Exception as exc:
        # Any failure is reported in the returned text rather than raised.
        print(f"❌ Error generating answer: {exc}")
        return f"Error: Could not generate answer - {str(exc)}"

def complete_rag_query(query: str,
                      df: pd.DataFrame,
                      embeddings: np.ndarray,
                      embedding_model,
                      openai_client,
                      top_k: int = 5,
                      filter_asin: str = None):
    """
    Run retrieval and answer generation for one question, printing progress.

    Args:
        query: The user's question.
        df: Review data passed through to retrieve_documents.
        embeddings: Review embeddings passed through to retrieve_documents.
        embedding_model: Model used to embed the query.
        openai_client: Client passed through to generate_answer.
        top_k: Number of reviews to retrieve.
        filter_asin: Optional ASIN restricting the search to one product.

    Returns:
        Dict with 'query', 'answer' and 'docs'; when documents were found it
        also carries 'num_products', the number of distinct ASINs among them.
    """
    divider = "=" * 60
    print(divider)
    print(f"🎯 QUERY: {query}")
    print(divider)

    # Step 1: retrieval; nothing found means there is nothing to generate from.
    hits = retrieve_documents(
        query, df, embeddings, embedding_model, top_k, filter_asin
    )
    if not hits:
        return {"query": query, "answer": "No relevant information found.", "docs": []}

    # Step 2: show the three best matches.
    print("\n📋 TOP RETRIEVED DOCUMENTS:")
    for hit in hits[:3]:
        headline = " | ".join((
            f"\n#{hit['rank']} - Product {hit['asin']}",
            f"Rating: {hit['overall']}/5",
            f"Similarity: {hit['similarity']:.3f}",
        ))
        print(headline)
        print(f"Preview: {hit['reviewText']}")

    # Step 3: generation.
    print("\n🤖 GENERATING ANSWER...")
    answer = generate_answer(query, hits, openai_client)

    print("\n💡 ANSWER:")
    print(answer)

    distinct_products = {hit['asin'] for hit in hits}
    return {
        "query": query,
        "answer": answer,
        "docs": hits,
        "num_products": len(distinct_products)
    }

# ================================
# MAIN EXECUTION
# ================================

# Initialize pipeline: the OpenAI client and the sentence-embedding model.
client, embedding_model = setup_rag_pipeline()

# Load data
# NOTE: this rebinds the notebook-level `df` that an earlier cell loaded from
# '/amazon_review.csv'.
df = load_review_data(CSV_PATH)

# Create/load embeddings (also rebinds the notebook-level `embeddings`).
embeddings = create_or_load_embeddings(df, embedding_model, EMBEDDINGS_PATH)

print("\n🎉 Pipeline ready! You can now run queries.")

# ================================
# EXAMPLE QUERIES
# ================================

# Test queries: sample questions for exercising the pipeline.
test_queries = [
    "Are these shoes good for hiking?",
    "Do these headphones work well on flights?",
    "What do people say about battery life?",
    "How is the comfort and fit?",
    "Is this product durable?",
    "What are the main complaints?"
]

print(f"\n🧪 RUNNING EXAMPLE QUERIES...")
print("=" * 60)

# Run a few example queries. Each call goes through complete_rag_query, which
# prints the retrieved reviews and the generated answer itself, so `result` is
# not used further inside the loop.
for query in test_queries[:3]:  # Run first 3 queries
    result = complete_rag_query(
        query=query,
        df=df,
        embeddings=embeddings,
        embedding_model=embedding_model,
        openai_client=client,
        top_k=5
    )
    print("\n" + "="*60 + "\n")

print("🏁 Demo completed!")
print("\n💡 To run your own queries, use:")
print("complete_rag_query('Your question here?', df, embeddings, embedding_model, client)")

# ================================
# INTERACTIVE SECTION
# ================================

def interactive_query():
    """
    Prompt for a question (and optionally an ASIN) and run it through the pipeline.

    Uses the notebook-level ``df``, ``embeddings``, ``embedding_model`` and
    ``client`` objects.

    Returns:
        The result dict produced by complete_rag_query.
    """
    question = input("\n🔍 Enter your question about the products: ")

    # An empty or whitespace-only reply means "do not filter by product".
    asin_reply = input("🎯 Filter by specific product ASIN? (press enter to skip): ").strip()

    return complete_rag_query(
        query=question,
        df=df,
        embeddings=embeddings,
        embedding_model=embedding_model,
        openai_client=client,
        filter_asin=asin_reply or None
    )

# Uncomment the line below to enable interactive mode
# interactive_query()

In [None]:
!pip install pandas>=1.5.0 numpy>=1.21.0 sentence-transformers>=2.2.0 chromadb>=0.4.0 tqdm>=4.64.0 openpyxl>=3.0.10


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import json
from typing import List, Dict, Any, Optional
import logging
from tqdm import tqdm
import os
from datetime import datetime
import hashlib

# Configure logging: INFO level via basicConfig, plus the module-level `logger`
# that the ReviewEmbeddingSystem methods below write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ReviewEmbeddingSystem:
    """
    A comprehensive system for extracting embeddings from product reviews
    and storing them in ChromaDB for RAG applications.
    """

    def __init__(self,
                 model_name: str = "all-MiniLM-L6-v2",
                 collection_name: str = "product_reviews",
                 persist_directory: str = "./chroma_db"):
        """
        Initialize the embedding system.

        Args:
            model_name: Name of the sentence transformer model
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector database
        """
        self.model_name = model_name
        self.collection_name = collection_name
        self.persist_directory = persist_directory

        # Initialize sentence transformer model
        logger.info(f"Loading sentence transformer model: {model_name}")
        self.model = SentenceTransformer(model_name)

        # Initialize ChromaDB client
        self.client = chromadb.PersistentClient(path=persist_directory)

        # Create or get collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"description": "Product reviews embeddings for RAG"}
        )

        logger.info(f"Initialized ChromaDB collection: {collection_name}")

    def load_reviews_data(self, file_path: str) -> pd.DataFrame:
        """
        Load reviews data from various file formats.

        The format is chosen from the file extension, compared without regard
        to case ('.csv', '.json', '.xlsx', '.xls').

        Args:
            file_path: Path to the reviews dataset file

        Returns:
            pandas DataFrame containing the reviews data

        Raises:
            ValueError: If the extension is not one of the supported formats.
        """
        try:
            extension_key = file_path.lower()
            if extension_key.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif extension_key.endswith('.json'):
                df = pd.read_json(file_path)
            elif extension_key.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file_path)
            else:
                raise ValueError("Unsupported file format. Use CSV, JSON, or Excel files.")

            logger.info(f"Loaded {len(df)} reviews from {file_path}")
            return df

        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def preprocess_reviews(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocess the reviews data for embedding generation.

        Adds a 'combined_text' column (summary followed by review text) and a
        'doc_id' column (MD5 of reviewer id, ASIN and review time). Rows whose
        combined text is empty are dropped, as are rows repeating an earlier
        doc_id.

        Args:
            df: Raw DataFrame containing reviews; expects the columns
                'Review Text', 'Summary', 'Reviewer Name', 'Reviewer ID',
                'ASIN' and 'Unix Review Time'

        Returns:
            Preprocessed DataFrame
        """
        # Create a copy to avoid modifying original data
        processed_df = df.copy()

        # Handle missing values
        processed_df['Review Text'] = processed_df['Review Text'].fillna('')
        processed_df['Summary'] = processed_df['Summary'].fillna('')
        processed_df['Reviewer Name'] = processed_df['Reviewer Name'].fillna('Anonymous')

        # Create combined text for embedding (review text + summary)
        processed_df['combined_text'] = (
            processed_df['Summary'].astype(str) + " " +
            processed_df['Review Text'].astype(str)
        ).str.strip()

        # Filter out empty reviews; copy so the assignment below writes to an
        # independent frame rather than a view of the filtered one.
        processed_df = processed_df[processed_df['combined_text'].str.len() > 0].copy()

        # Create unique document IDs. Built column-wise so that an empty frame
        # yields an empty column (a row-wise apply returns a DataFrame there).
        processed_df['doc_id'] = [
            hashlib.md5(f"{reviewer_id}_{asin}_{review_time}".encode()).hexdigest()
            for reviewer_id, asin, review_time in zip(
                processed_df['Reviewer ID'],
                processed_df['ASIN'],
                processed_df['Unix Review Time']
            )
        ]

        # Chroma rejects a batch that contains the same id twice, so repeated
        # (reviewer, product, time) rows are reduced to their first occurrence.
        rows_before = len(processed_df)
        processed_df = processed_df.drop_duplicates(subset='doc_id', keep='first')
        duplicates_dropped = rows_before - len(processed_df)
        if duplicates_dropped:
            logger.warning(f"Dropped {duplicates_dropped} reviews with duplicate document IDs")

        logger.info(f"Preprocessed {len(processed_df)} reviews")
        return processed_df

    def generate_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed
            batch_size: Batch size for processing

        Returns:
            NumPy array of embeddings, one row per text (an empty array when
            ``texts`` is empty)
        """
        logger.info(f"Generating embeddings for {len(texts)} texts")

        embeddings = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
            batch = texts[i:i + batch_size]
            batch_embeddings = self.model.encode(batch, show_progress_bar=False)
            embeddings.extend(batch_embeddings)

        return np.array(embeddings)

    def prepare_metadata(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
        """
        Prepare metadata for each review.

        Numeric fields that are missing are stored as 0 (0.0 for the rating).

        Args:
            df: DataFrame containing review data

        Returns:
            List of metadata dictionaries, in row order
        """
        metadata_list = []

        for _, row in df.iterrows():
            metadata = {
                "reviewer_id": str(row['Reviewer ID']),
                "asin": str(row['ASIN']),
                "reviewer_name": str(row['Reviewer Name']),
                "helpful_votes": int(row['Helpful']) if pd.notna(row['Helpful']) else 0,
                "overall_rating": float(row['Overall Rating']) if pd.notna(row['Overall Rating']) else 0.0,
                "summary": str(row['Summary']),
                "unix_review_time": int(row['Unix Review Time']) if pd.notna(row['Unix Review Time']) else 0,
                "review_time": str(row['Review Time']),
                "day_difference": int(row['Day Difference']) if pd.notna(row['Day Difference']) else 0,
                "helpful_yes": int(row['Helpful Yes']) if pd.notna(row['Helpful Yes']) else 0,
                "total_votes": int(row['Total Votes']) if pd.notna(row['Total Votes']) else 0,
                "text_length": len(str(row['Review Text'])),
                "combined_text_length": len(str(row['combined_text']))
            }
            metadata_list.append(metadata)

        return metadata_list

    def store_embeddings(self,
                        df: pd.DataFrame,
                        embeddings: np.ndarray,
                        batch_size: int = 100) -> None:
        """
        Store embeddings and metadata in ChromaDB.

        Records are upserted, so storing the same reviews again (for example
        after re-running the notebook against the persisted database) replaces
        the existing entries instead of colliding with them.

        Args:
            df: DataFrame containing review data
            embeddings: NumPy array of embeddings, one row per row of ``df``
            batch_size: Batch size for storing data

        Raises:
            ValueError: If ``embeddings`` and ``df`` have different lengths.
        """
        logger.info("Storing embeddings in ChromaDB")

        # Prepare data for storage
        documents = df['combined_text'].tolist()
        ids = df['doc_id'].tolist()

        if len(embeddings) != len(documents):
            raise ValueError(
                f"Got {len(embeddings)} embeddings for {len(documents)} documents"
            )

        metadata_list = self.prepare_metadata(df)

        # Store in batches
        for i in tqdm(range(0, len(documents), batch_size), desc="Storing embeddings"):
            end_idx = min(i + batch_size, len(documents))

            self.collection.upsert(
                documents=documents[i:end_idx],
                embeddings=embeddings[i:end_idx].tolist(),
                metadatas=metadata_list[i:end_idx],
                ids=ids[i:end_idx]
            )

        logger.info(f"Successfully stored {len(documents)} embeddings")

    def process_and_store(self,
                         file_path: str,
                         batch_size: int = 32,
                         store_batch_size: int = 100) -> Dict[str, Any]:
        """
        Complete pipeline to process reviews and store embeddings.

        Args:
            file_path: Path to the reviews dataset
            batch_size: Batch size for embedding generation
            store_batch_size: Batch size for storing in database

        Returns:
            Dictionary with processing statistics
        """
        start_time = datetime.now()

        # Load and preprocess data
        df = self.load_reviews_data(file_path)
        processed_df = self.preprocess_reviews(df)

        # Generate embeddings
        embeddings = self.generate_embeddings(
            processed_df['combined_text'].tolist(),
            batch_size=batch_size
        )

        # Store in vector database
        self.store_embeddings(processed_df, embeddings, store_batch_size)

        end_time = datetime.now()
        processing_time = (end_time - start_time).total_seconds()

        stats = {
            "total_reviews_processed": len(processed_df),
            # With no reviews the array is one-dimensional and has no width.
            "embedding_dimension": embeddings.shape[1] if embeddings.ndim == 2 else 0,
            "processing_time_seconds": processing_time,
            "model_used": self.model_name,
            "collection_name": self.collection_name
        }

        logger.info(f"Processing completed in {processing_time:.2f} seconds")
        return stats

    def search_similar_reviews(self,
                              query: str,
                              n_results: int = 5,
                              filter_metadata: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Search for similar reviews based on a query.

        Args:
            query: Search query text
            n_results: Number of results to return
            filter_metadata: Optional metadata filters. A dict with several
                field conditions is combined with "$and" before it is passed
                to ChromaDB; an empty dict means no filter.

        Returns:
            Search results dictionary
        """
        # Generate query embedding
        query_embedding = self.model.encode([query])[0].tolist()

        # Chroma accepts exactly one top-level key in `where`, so several plain
        # field conditions are wrapped in an explicit $and.
        where = filter_metadata or None
        if where and len(where) > 1 and not any(str(key).startswith('$') for key in where):
            where = {"$and": [{key: value} for key, value in where.items()]}

        # Search in ChromaDB
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            where=where
        )

        return {
            "query": query,
            "results": results,
            "n_results": len(results['ids'][0]) if results['ids'] else 0
        }

    def get_collection_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the stored collection.

        Returns:
            Dictionary with collection statistics
        """
        count = self.collection.count()

        return {
            "collection_name": self.collection_name,
            "total_documents": count,
            "model_used": self.model_name,
            "persist_directory": self.persist_directory
        }

# RAG Query System
class ProductReviewRAG:
    """
    Retrieval step of a RAG pipeline over product reviews.

    Wraps an embedding system and turns a user question (plus optional
    rating / product filters) into a list of context reviews. No answer text
    is generated here; the caller receives the retrieved context only.
    """

    # Forward reference (string) so the class can be defined even when
    # ReviewEmbeddingSystem lives in a cell that has not been run yet.
    def __init__(self, embedding_system: "ReviewEmbeddingSystem"):
        # Any object exposing
        # search_similar_reviews(query=, n_results=, filter_metadata=) works.
        self.embedding_system = embedding_system

    @staticmethod
    def _build_where_clause(conditions: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Convert flat ``{field: condition}`` filters into a ChromaDB ``where``."""
        if not conditions:
            return None
        if len(conditions) == 1:
            return dict(conditions)
        # ChromaDB accepts exactly one top-level operator, so several field
        # conditions have to be combined explicitly with $and.
        return {"$and": [{field: cond} for field, cond in conditions.items()]}

    def query_reviews(self,
                     question: str,
                     n_context: int = 5,
                     rating_filter: Optional[float] = None,
                     product_filter: Optional[str] = None) -> Dict[str, Any]:
        """
        Query the review database and prepare context for RAG.

        Args:
            question: User question
            n_context: Number of context reviews to retrieve
            rating_filter: Minimum rating (inclusive); ``None`` disables it
            product_filter: Restrict to one ASIN; empty/``None`` disables it

        Returns:
            Dict with the ``question``, the retrieved ``context_reviews``,
            ``n_context_used`` and the flat ``filters_applied`` mapping
        """
        # Build metadata filter (flat form, also reported back to the caller)
        metadata_filter = {}
        # `is not None` so that a threshold of 0 / 0.0 is still honoured
        if rating_filter is not None:
            metadata_filter["overall_rating"] = {"$gte": rating_filter}
        if product_filter:
            metadata_filter["asin"] = product_filter

        # Search for relevant reviews
        search_results = self.embedding_system.search_similar_reviews(
            query=question,
            n_results=n_context,
            filter_metadata=self._build_where_clause(metadata_filter)
        )

        # Prepare context: results are nested per query, and we sent one query
        results = search_results['results']
        context_reviews = []
        if results['documents']:
            documents = results['documents'][0]
            metadatas = results['metadatas'][0]
            for doc, metadata in zip(documents, metadatas):
                # A record stored without metadata may come back as None
                metadata = metadata or {}
                context_reviews.append({
                    "review_text": doc,
                    "rating": metadata.get("overall_rating", "N/A"),
                    "product": metadata.get("asin", "N/A"),
                    "reviewer": metadata.get("reviewer_name", "Anonymous"),
                    "helpful_votes": metadata.get("helpful_votes", 0)
                })

        return {
            "question": question,
            "context_reviews": context_reviews,
            "n_context_used": len(context_reviews),
            "filters_applied": metadata_filter
        }

# Example usage and testing functions
def main_example():
    """
    Walk through the embedding pipeline end to end.

    Builds the vector store from a reviews file, prints the processing and
    collection statistics, then runs one semantic search and one
    rating-filtered RAG query. Any exception raised on the way is reported
    through ``logger.error`` instead of being propagated.
    """
    def _report(heading, payload):
        # Heading line followed by the payload as indented JSON
        print(heading)
        print(json.dumps(payload, indent=2))

    # Initialize the system
    embedding_system = ReviewEmbeddingSystem(
        model_name="all-MiniLM-L6-v2",  # Fast and efficient model
        collection_name="product_reviews_v1",
        persist_directory="./review_embeddings_db",
    )

    # Process reviews (replace with your actual file path)
    file_path = "your_reviews_dataset.csv"

    try:
        # Process and store embeddings, then show both statistics dicts
        _report("Processing Statistics:", embedding_system.process_and_store(file_path))
        _report("\nCollection Statistics:", embedding_system.get_collection_stats())

        # Example search
        hits = embedding_system.search_similar_reviews(
            query="product quality and durability",
            n_results=3,
        )
        print(f"\nFound {hits['n_results']} similar reviews")

        # Example RAG query, restricted to reviews with 4+ stars
        rag_response = ProductReviewRAG(embedding_system).query_reviews(
            question="What do customers say about product quality?",
            n_context=5,
            rating_filter=4.0,
        )

        print(f"\nRAG Response for: {rag_response['question']}")
        print(f"Found {rag_response['n_context_used']} relevant reviews")

    except Exception as exc:
        logger.error(f"Error in main example: {exc!s}")

# Run the demo when this code is executed directly (module name "__main__")
# rather than imported.
if __name__ == "__main__":
    main_example()

# Installation requirements (save as requirements.txt):
# NOTE: the block below is a bare string expression, not a comment; as the
# last statement of a notebook cell its repr is echoed as the cell output.
"""
pandas>=1.5.0
numpy>=1.21.0
sentence-transformers>=2.2.0
chromadb>=0.4.0
tqdm>=4.64.0
openpyxl>=3.0.10
"""

ERROR:__main__:Error loading data: [Errno 2] No such file or directory: 'your_reviews_dataset.csv'
ERROR:__main__:Error in main example: [Errno 2] No such file or directory: 'your_reviews_dataset.csv'


'\npandas>=1.5.0\nnumpy>=1.21.0\nsentence-transformers>=2.2.0\nchromadb>=0.4.0\ntqdm>=4.64.0\nopenpyxl>=3.0.10\n'

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Text to embed: the `Reviews` column of the current `df`.
# SentenceTransformer.encode expects a string or a list of strings; handing it
# a raw Series with numeric / NaN values fails inside the tokenizer with
# "TypeError: 'float' object is not subscriptable". Missing values become ""
# (not dropped) so that embeddings[i] still lines up with df.iloc[i].
sentences = df['Reviews'].fillna('').astype(str).tolist()

# Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode sentences into embeddings (384-dimensional)
embeddings = model.encode(sentences)

# Importing Dependencies

In [None]:
# Importing dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load amazon_shoes_data.csv from an absolute path; this (re)binds the
# notebook-wide name `df` to the new frame.
df = pd.read_csv('/content/amazon_shoes_data.csv')
# Last expression of the cell, so the (rows, columns) tuple is displayed.
df.shape

(73, 5)

In [None]:
df.head()

Unnamed: 0,Name,Price,Reviews,Rating,Last Download
0,,775,104128,3.9 out of 5 stars,50+ bought in past month
1,,1519,98,3.6 out of 5 stars,100+ bought in past month
2,,1519,354,3.4 out of 5 stars,M.R.P:
3,,498,5,4.0 out of 5 stars,50+ bought in past month
4,,799,825,3.4 out of 5 stars,700+ bought in past month


In [None]:
# Rebind instead of mutating in place, and tolerate the column already being
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns='Name', errors='ignore')

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Text to embed: the `Reviews` column of the shoes frame.
# SentenceTransformer.encode expects a string or a list of strings; handing it
# the raw Series (numeric / NaN values) is what raised
# "TypeError: 'float' object is not subscriptable". Missing values become ""
# (not dropped) so that embeddings[i] still lines up with df.iloc[i].
sentences = df['Reviews'].fillna('').astype(str).tolist()

# Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode sentences into embeddings (384-dimensional)
embeddings = model.encode(sentences)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

TypeError: 'float' object is not subscriptable

In [None]:
import random

# Build 50 synthetic products, each with 8 descriptive attributes plus
# 42 random Yes/No flags (feature_8 .. feature_49) for 50 keys in total.
products = []
for i in range(50):
    # Keyword arguments are evaluated left to right, so the random draws
    # happen in the same order as the keys appear.
    product = dict(
        title=f"Product {i}",
        brand=random.choice(["boAt", "Noise", "Sony", "Realme"]),
        price=random.randint(500, 5000),
        category=random.choice(["Earphones", "Headphones", "TWS", "Speakers"]),
        battery_life=random.choice(["6hr", "8hr", "10hr", "12hr"]),
        connectivity=random.choice(["Bluetooth 5.0", "Bluetooth 4.2", "Wired"]),
        noise_cancellation=random.choice(["Yes", "No"]),
        color=random.choice(["Black", "White", "Red", "Blue"]),
    )

    # Add 42 more dummy features
    product.update(
        {f"feature_{j}": random.choice(["Yes", "No"]) for j in range(8, 50)}
    )

    products.append(product)


In [None]:
# Preview a couple of records instead of dumping all 50 fifty-key dicts;
# as the last expression of the cell the slice is displayed directly.
products[:2]

[{'title': 'Product 0', 'brand': 'Noise', 'price': 4276, 'category': 'Headphones', 'battery_life': '6hr', 'connectivity': 'Wired', 'noise_cancellation': 'Yes', 'color': 'Red', 'feature_8': 'No', 'feature_9': 'No', 'feature_10': 'Yes', 'feature_11': 'No', 'feature_12': 'No', 'feature_13': 'Yes', 'feature_14': 'Yes', 'feature_15': 'No', 'feature_16': 'No', 'feature_17': 'Yes', 'feature_18': 'No', 'feature_19': 'No', 'feature_20': 'Yes', 'feature_21': 'No', 'feature_22': 'No', 'feature_23': 'Yes', 'feature_24': 'No', 'feature_25': 'No', 'feature_26': 'No', 'feature_27': 'No', 'feature_28': 'Yes', 'feature_29': 'No', 'feature_30': 'Yes', 'feature_31': 'No', 'feature_32': 'Yes', 'feature_33': 'Yes', 'feature_34': 'Yes', 'feature_35': 'No', 'feature_36': 'Yes', 'feature_37': 'No', 'feature_38': 'No', 'feature_39': 'Yes', 'feature_40': 'Yes', 'feature_41': 'No', 'feature_42': 'No', 'feature_43': 'Yes', 'feature_44': 'Yes', 'feature_45': 'No', 'feature_46': 'Yes', 'feature_47': 'No', 'feature_

In [None]:
pip install sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

def get_product_embedding(product):
    """Embed one product dict.

    The title, brand, category, price, battery life and connectivity values
    are joined with single spaces (in that order) and passed to the
    module-level ``model``; whatever ``model.encode`` returns is returned.
    """
    fields = ('title', 'brand', 'category', 'price', 'battery_life', 'connectivity')
    text = " ".join(f"{product[name]}" for name in fields)
    return model.encode(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# AMAZON REVIEWS DATASET - Review Embeddings

In [None]:
import faiss

# Embed every product once; `product_vectors` stays a plain list as before
product_vectors = [get_product_embedding(p) for p in products]

# FAISS needs one contiguous float32 matrix, one row per product
product_matrix = np.asarray(product_vectors, dtype='float32')

# Take the dimension from the embeddings themselves instead of hard-coding
# 384, so switching the embedding model cannot desynchronise the index.
dimension = product_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)

# Build index
index.add(product_matrix)

In [None]:
def search_products(query, top_k=5):
    """Return up to ``top_k`` products whose embeddings are closest to ``query``.

    Uses the module-level ``model``, ``index`` and ``products``. Results are
    ordered as returned by the index search. An empty list is returned when
    ``top_k`` is not positive or the index holds no vectors.
    """
    # Never ask for more neighbours than the index actually holds
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_emb = model.encode(query)
    D, I = index.search(np.array([query_emb]).astype('float32'), k)

    # FAISS marks missing neighbours with label -1; without this filter
    # products[-1] would silently return the last product.
    return [products[i] for i in I[0] if i >= 0]

In [None]:
def generate_answer(query):
    """Format the products matching ``query`` as a bulleted text summary.

    The first line names the query; every match returned by
    ``search_products`` adds one "- title (brand, price) - category,
    Battery: ..." line.
    """
    matches = search_products(query)
    parts = [f"Top results for '{query}':\n"]
    parts.extend(
        f"\n- {p['title']} ({p['brand']}, ₹{p['price']}) - {p['category']}, Battery: {p['battery_life']}"
        for p in matches
    )
    return "".join(parts)

In [None]:
import random
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Step 1: Simulate 50-feature product data
# (8 descriptive attributes + 42 random Yes/No flags per product)
products = []
for i in range(50):
    # Keyword arguments are evaluated left to right, so the random draws
    # happen in the same order as the keys appear.
    product = dict(
        title=f"Product {i}",
        brand=random.choice(["boAt", "Noise", "Sony", "Realme"]),
        price=random.randint(500, 5000),
        category=random.choice(["Earphones", "Headphones", "TWS", "Speakers"]),
        battery_life=random.choice(["6hr", "8hr", "10hr", "12hr"]),
        connectivity=random.choice(["Bluetooth 5.0", "Bluetooth 4.2", "Wired"]),
        noise_cancellation=random.choice(["Yes", "No"]),
        color=random.choice(["Black", "White", "Red", "Blue"]),
    )
    # Add remaining 42 features
    product.update(
        {f"feature_{j}": random.choice(["Yes", "No"]) for j in range(8, 50)}
    )
    products.append(product)

# Step 2: Load embedding model
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Generate embeddings
# One encode call per product, on a short text built from its main fields
print("Generating product embeddings...")
product_vectors = np.array([
    model.encode(
        f"{p['title']} {p['brand']} {p['category']} ₹{p['price']} battery:{p['battery_life']} connect:{p['connectivity']}"
    )
    for p in products
]).astype('float32')

# Step 4: Build FAISS index
# The index dimension is taken from the embedding matrix itself
dimension = product_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(product_vectors)

# Step 5: Search function
def search_products(query, top_k=5):
    """Return up to ``top_k`` products whose embeddings are closest to ``query``.

    Uses the module-level ``model``, ``index`` and ``products``. Results are
    ordered as returned by the index search. An empty list is returned when
    ``top_k`` is not positive or the index holds no vectors.
    """
    # Never ask for more neighbours than the index actually holds
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_vec = model.encode(query)
    D, I = index.search(np.array([query_vec]).astype('float32'), k)

    # FAISS marks missing neighbours with label -1; without this filter
    # products[-1] would silently return the last product.
    return [products[i] for i in I[0] if i >= 0]

# Step 6: Generate and print answer
def generate_answer(query):
    """Format the products matching ``query`` as a numbered text report.

    A header line names the query; each match returned by
    ``search_products`` contributes a "N. title (brand)" line followed by an
    indented line with category, price, battery life and connectivity.
    """
    matches = search_products(query)
    chunks = [f"\n🔍 Results for query: '{query}'\n"]
    for rank, p in enumerate(matches, start=1):
        chunks.append(f"\n{rank}. {p['title']} ({p['brand']})\n")
        chunks.append(
            f"   Category: {p['category']} | Price: ₹{p['price']} | Battery: {p['battery_life']} | Connectivity: {p['connectivity']}\n"
        )
    return "".join(chunks)

# Step 7: Run a test query
# input() blocks until a query is typed, so this cell needs an interactive
# session. `user_query` and `output` remain available as module-level names.
if __name__ == "__main__":
    user_query = input("💬 Enter your product query: ")
    output = generate_answer(user_query)
    print(output)


Loading embedding model...
Generating product embeddings...
💬 Enter your product query: wireless earphone under 1000

🔍 Results for query: 'wireless earphone under 1000'

1. Product 10 (Noise)
   Category: Headphones | Price: ₹2359 | Battery: 12hr | Connectivity: Bluetooth 5.0

2. Product 9 (Realme)
   Category: Earphones | Price: ₹4285 | Battery: 10hr | Connectivity: Bluetooth 5.0

3. Product 45 (Realme)
   Category: Earphones | Price: ₹1554 | Battery: 12hr | Connectivity: Bluetooth 5.0

4. Product 12 (Realme)
   Category: Earphones | Price: ₹2624 | Battery: 10hr | Connectivity: Bluetooth 5.0

5. Product 7 (Sony)
   Category: Headphones | Price: ₹1088 | Battery: 10hr | Connectivity: Bluetooth 5.0



In [None]:
import pandas as pd
import numpy as np
import json
from typing import List, Dict, Any, Optional
import logging
from tqdm import tqdm
import os
from datetime import datetime
import hashlib
import warnings
import sys
warnings.filterwarnings("ignore")

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Try to import required packages with fallback options
def setup_dependencies():
    """Verify that every required package can be imported.

    Each check prints one status line. All checks always run, so the
    returned list names every package group that failed, not just the
    first one.

    Returns:
        ``(True, None)`` when pandas/numpy, sentence-transformers and
        chromadb all import; otherwise ``(False, missing_packages)`` where
        ``missing_packages`` is the list of pip names that failed.
    """
    missing_packages = []

    # Check basic packages
    try:
        import pandas as pd
        import numpy as np
        print("✅ pandas and numpy are available")
    except ImportError as e:
        missing_packages.append("pandas numpy")
        print(f"❌ Error importing pandas/numpy: {e}")

    # Check sentence-transformers. No early return here: the success path
    # used to return immediately, which made the ChromaDB check unreachable
    # and ignored an earlier pandas/numpy failure.
    try:
        from sentence_transformers import SentenceTransformer
        print("✅ sentence-transformers is available")
    except ImportError as e:
        print(f"❌ sentence-transformers import error: {e}")
        missing_packages.append("sentence-transformers")

    # Check ChromaDB
    try:
        import chromadb
        print("✅ chromadb is available")
    except ImportError as e:
        missing_packages.append("chromadb")
        print(f"❌ chromadb import error: {e}")

    if missing_packages:
        return False, missing_packages
    return True, None

def fix_dependencies():
    """Print step-by-step commands for repairing the Python environment.

    Nothing is installed or changed; the function only writes the
    instructions to standard output.
    """
    # One entry per printed line group; a leading "\n" yields a blank
    # separator line before the section heading.
    instructions = (
        "\n🔧 DEPENDENCY FIX INSTRUCTIONS",
        "=" * 50,
        "The error is caused by conflicting urllib3 versions.",
        "Please run these commands in order:\n",
        "1️⃣ Uninstall conflicting packages:",
        "   pip uninstall urllib3 botocore boto3 s3fs aiobotocore -y",
        "\n2️⃣ Install compatible versions:",
        "   pip install urllib3==1.26.18",
        "   pip install botocore==1.31.85",
        "   pip install boto3==1.28.85",
        "\n3️⃣ Install sentence-transformers:",
        "   pip install sentence-transformers==2.2.2",
        "\n4️⃣ Install other requirements:",
        "   pip install chromadb==0.4.22 tqdm pandas numpy openpyxl",
        "\n🔄 Alternative method (create new environment):",
        "   conda create -n rag_env python=3.11",
        "   conda activate rag_env",
        "   pip install sentence-transformers==2.2.2 chromadb pandas numpy tqdm openpyxl",
        "\n⚠️  If issues persist, try this minimal version:",
        "   pip install --upgrade --force-reinstall sentence-transformers",
    )
    for line in instructions:
        print(line)

# Alternative RAG system based on TF-IDF / keyword matching
# (fallback for when sentence-transformers cannot be imported)
class SimpleRAGSystem:
    """
    Simplified RAG system that works without sentence-transformers.

    Reviews are vectorised with scikit-learn's TF-IDF when it can be
    imported; otherwise retrieval falls back to plain keyword matching.
    Intended for demonstration purposes.
    """

    # Accepted names for each metadata column. The first entry is the exact
    # name that is tried first; the rest are matched case-insensitively while
    # ignoring spaces/underscores (so 'overall', 'reviewerName', 'asin' and
    # 'overall_rating' are all recognised).
    _RATING_COLUMNS = ('Overall Rating', 'overall')
    _REVIEWER_COLUMNS = ('Reviewer Name',)
    _PRODUCT_COLUMNS = ('ASIN',)

    def __init__(self, collection_name: str = "simple_reviews"):
        """Initialize simple RAG system."""
        self.collection_name = collection_name
        self.reviews_data = []   # list of {'id', 'text', 'metadata'} dicts
        self.embeddings = None   # TF-IDF matrix, filled by _create_embeddings

        # Try to import scikit-learn for TF-IDF
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity
            self.vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
            self.use_tfidf = True
            print("✅ Using TF-IDF for embeddings (scikit-learn)")
        except ImportError:
            print("⚠️  scikit-learn not available, using basic keyword matching")
            self.use_tfidf = False

    @staticmethod
    def _normalize_name(name) -> str:
        """Lower-case a column name and keep only its letters and digits."""
        return ''.join(ch for ch in str(name).lower() if ch.isalnum())

    @classmethod
    def _resolve_column(cls, columns, names):
        """Return the column matching one of ``names``, or None if absent."""
        # The historical exact name always wins
        if names[0] in columns:
            return names[0]
        wanted = {cls._normalize_name(name) for name in names}
        for col in columns:
            if cls._normalize_name(col) in wanted:
                return col
        return None

    @staticmethod
    def _find_text_columns(columns):
        """Locate the review-text and summary columns (last match wins)."""
        text_col = summary_col = None
        for col in columns:
            col_lower = str(col).lower()
            if 'review' in col_lower and 'text' in col_lower:
                text_col = col
            elif 'summary' in col_lower:
                summary_col = col
        return text_col, summary_col

    @staticmethod
    def _cell_or_default(row, column, default):
        """Value of ``row[column]``, or ``default`` if absent or missing (NaN)."""
        if column is None:
            return default
        value = row[column]
        return value if pd.notna(value) else default

    @staticmethod
    def _read_reviews_file(file_path: str):
        """Read a CSV, JSON-lines or Excel file into a DataFrame."""
        lowered = file_path.lower()  # accept upper-case extensions too
        if lowered.endswith('.csv'):
            return pd.read_csv(file_path)
        if lowered.endswith('.json'):
            return pd.read_json(file_path, lines=True)
        if lowered.endswith(('.xlsx', '.xls')):
            return pd.read_excel(file_path)
        raise ValueError("Unsupported file format")

    def load_and_process_data(self, file_path: str):
        """Load a reviews file, keep usable reviews and build the embeddings.

        A review is kept when "<summary> <review text>" is longer than 10
        characters. Each kept review is stored in ``self.reviews_data`` with
        its rating, reviewer and product metadata.

        Args:
            file_path: Path to a .csv, .json (JSON lines) or .xlsx/.xls file

        Raises:
            ValueError: for an unsupported file extension (any other error
                from reading/processing is reported and re-raised as well)
        """
        print(f"📂 Loading dataset: {file_path}")

        try:
            df = self._read_reviews_file(file_path)
            print(f"✅ Loaded {len(df)} reviews")

            # Resolve the relevant columns once instead of once per row
            columns = list(df.columns)
            text_col, summary_col = self._find_text_columns(columns)
            rating_col = self._resolve_column(columns, self._RATING_COLUMNS)
            reviewer_col = self._resolve_column(columns, self._REVIEWER_COLUMNS)
            product_col = self._resolve_column(columns, self._PRODUCT_COLUMNS)

            processed_data = []
            for idx, row in df.iterrows():
                review_text = str(self._cell_or_default(row, text_col, ""))
                summary = str(self._cell_or_default(row, summary_col, ""))
                combined_text = f"{summary} {review_text}".strip()

                if len(combined_text) > 10:  # Only keep non-empty reviews
                    processed_data.append({
                        'id': f"review_{idx}",
                        'text': combined_text,
                        'metadata': {
                            'rating': self._cell_or_default(row, rating_col, 0),
                            'reviewer': str(self._cell_or_default(row, reviewer_col, 'Anonymous')),
                            'product': str(self._cell_or_default(row, product_col, 'Unknown')),
                        }
                    })

            self.reviews_data = processed_data
            print(f"✅ Processed {len(self.reviews_data)} valid reviews")

            # Create embeddings
            self._create_embeddings()

        except Exception as e:
            print(f"❌ Error processing data: {e}")
            raise

    def _create_embeddings(self):
        """Create embeddings using available method."""
        if not self.reviews_data:
            # Do not keep a matrix that belongs to a previously loaded dataset
            self.embeddings = None
            return

        texts = [review['text'] for review in self.reviews_data]

        if self.use_tfidf:
            print("🤖 Creating TF-IDF embeddings...")
            self.embeddings = self.vectorizer.fit_transform(texts)
            print(f"✅ Created embeddings with shape: {self.embeddings.shape}")
        else:
            print("✅ Using keyword-based matching (no embeddings)")

    def search_reviews(self, query: str, n_results: int = 5):
        """Search for relevant reviews.

        Args:
            query: Free-text query
            n_results: Maximum number of reviews to return; zero or a
                negative value yields no results

        Returns:
            ``{"error": ...}`` when no data is loaded; otherwise a dict with
            the ``query``, the ``results`` (each holding the ``review`` and
            its ``similarity``, best first) and the ``method`` that was used
            ('tfidf' or 'keyword')
        """
        if not self.reviews_data:
            return {"error": "No data loaded"}

        print(f"🔍 Searching for: '{query}'")

        use_vectors = self.use_tfidf and self.embeddings is not None

        if n_results <= 0:
            # Guard: a slice like [-0:] would otherwise return everything and
            # [:-1] would return all but the last match.
            results = []

        elif use_vectors:
            # TF-IDF based search
            from sklearn.metrics.pairwise import cosine_similarity

            query_vec = self.vectorizer.transform([query])
            similarities = cosine_similarity(query_vec, self.embeddings).flatten()

            # Indices of the n_results highest similarities, best first
            top_indices = similarities.argsort()[-n_results:][::-1]

            results = [
                {
                    'review': self.reviews_data[idx],
                    'similarity': float(similarities[idx])
                }
                for idx in top_indices
                if similarities[idx] > 0  # Only include relevant results
            ]

        else:
            # Keyword-based search: score = share of query words found in the
            # text (substring match)
            query_words = query.lower().split()
            scored_reviews = []

            for review in self.reviews_data:
                text_lower = review['text'].lower()
                score = sum(1 for word in query_words if word in text_lower)
                if score > 0:
                    scored_reviews.append({
                        'review': review,
                        'similarity': score / len(query_words)
                    })

            # Sort by score and take top results
            scored_reviews.sort(key=lambda x: x['similarity'], reverse=True)
            results = scored_reviews[:n_results]

        print(f"✅ Found {len(results)} relevant reviews")
        return {
            'query': query,
            'results': results,
            # Report the branch that actually ran
            'method': 'tfidf' if use_vectors else 'keyword'
        }

    def answer_question(self, question: str, n_context: int = 5):
        """Generate answer based on relevant reviews.

        Args:
            question: User question, also used as the search query
            n_context: Maximum number of reviews to use as context

        Returns:
            The search error dict when no data is loaded; otherwise a dict
            with the ``question``, a generated ``answer`` and the
            ``context_reviews`` (plus ``search_method`` when reviews matched)
        """
        search_results = self.search_reviews(question, n_context)

        if 'error' in search_results:
            return search_results

        if not search_results['results']:
            return {
                'question': question,
                'answer': "No relevant reviews found for your question.",
                'context_reviews': []
            }

        # Extract context
        context_reviews = []
        all_text = ""
        ratings = []

        for result in search_results['results']:
            review_data = result['review']
            text = review_data['text']
            rating = review_data['metadata']['rating']

            context_reviews.append({
                # Long reviews are shortened to 300 characters for display
                'text': text[:300] + "..." if len(text) > 300 else text,
                'rating': rating,
                'reviewer': review_data['metadata']['reviewer'],
                'similarity': result['similarity']
            })

            all_text += " " + text.lower()
            # Only positive numeric ratings take part in the average
            if isinstance(rating, (int, float)) and rating > 0:
                ratings.append(rating)

        # Generate simple answer
        answer = self._generate_simple_answer(question, all_text, ratings, len(context_reviews))

        return {
            'question': question,
            'answer': answer,
            'context_reviews': context_reviews,
            'search_method': search_results['method']
        }

    def _generate_simple_answer(self, question: str, all_text: str, ratings: List, n_reviews: int):
        """Generate a simple answer based on the context.

        Args:
            question: The user question (checked for topic keywords)
            all_text: Lower-cased concatenation of the context review texts
            ratings: Numeric ratings of the context reviews (may be empty)
            n_reviews: Number of context reviews

        Returns:
            The answer sentences joined by single spaces
        """
        answer_parts = [f"Based on analysis of {n_reviews} relevant reviews:"]

        # Average rating
        if ratings:
            avg_rating = sum(ratings) / len(ratings)
            answer_parts.append(f"Average rating: {avg_rating:.1f}/5.0")

        # Sentiment analysis (basic): count how many distinct cue words occur
        # anywhere in the text (substring match, each word counted once)
        positive_words = ["good", "great", "excellent", "amazing", "love", "perfect", "recommend", "satisfied"]
        negative_words = ["bad", "terrible", "awful", "hate", "worst", "horrible", "disappointed", "poor"]

        positive_count = sum(1 for word in positive_words if word in all_text)
        negative_count = sum(1 for word in negative_words if word in all_text)

        if positive_count > negative_count:
            answer_parts.append("Overall sentiment is positive.")
        elif negative_count > positive_count:
            answer_parts.append("Overall sentiment is negative.")
        else:
            answer_parts.append("Mixed sentiment in reviews.")

        # Question-specific insights
        question_lower = question.lower()
        if "quality" in question_lower:
            quality_mentions = all_text.count("quality")
            if quality_mentions > 0:
                answer_parts.append(f"Quality is mentioned {quality_mentions} times.")

        if "price" in question_lower or "cost" in question_lower:
            price_mentions = all_text.count("price") + all_text.count("cost") + all_text.count("expensive") + all_text.count("cheap")
            if price_mentions > 0:
                answer_parts.append(f"Price is discussed {price_mentions} times.")

        if "delivery" in question_lower or "shipping" in question_lower:
            shipping_mentions = all_text.count("delivery") + all_text.count("shipping") + all_text.count("fast")
            if shipping_mentions > 0:
                answer_parts.append(f"Delivery/shipping mentioned {shipping_mentions} times.")

        return " ".join(answer_parts)

def main():
    """Run the interactive product-review RAG demo.

    Flow:
      1. Check dependencies with ``setup_dependencies()``; if they are not OK,
         call ``fix_dependencies()`` and return.
      2. Try to import ``sentence_transformers`` and ``chromadb``. If both
         import, use the embedding-backed ``ProductReviewRAGSystem`` defined
         below; if the import or the construction fails, fall back to
         ``SimpleRAGSystem``.
      3. Prompt for a CSV path, index it, then answer questions in a loop until
         the user types ``quit``/``exit``/``q``, presses Ctrl+C, or stdin
         reaches end-of-file.

    Returns:
        None. All results are printed to stdout.
    """
    print("🎯 Product Reviews RAG System (Fixed Version)")
    print("=" * 60)

    # Check dependencies (the list of missing packages is not needed here).
    deps_ok, _missing = setup_dependencies()

    if not deps_ok:
        fix_dependencies()
        print("\n❌ Please fix dependencies first, then run the script again.")
        return

    # Import after dependency check. Any failure selects the simplified system:
    # a package can raise something other than ImportError while initialising,
    # and that must not abort the fallback path.
    try:
        from sentence_transformers import SentenceTransformer
        import chromadb
        use_full_system = True
        print("✅ All dependencies available - using full RAG system")
    except Exception as exc:
        use_full_system = False
        print("⚠️  Using simplified RAG system due to import issues")
        print(f"   Reason: {exc}")

    if use_full_system:
        try:
            class ProductReviewRAGSystem:
                """Full RAG system: SentenceTransformer encoder + persistent ChromaDB collection."""

                # Column names tried in order for each logical field. The first
                # name is the schema this class originally expected; the second
                # matches the review CSV loaded earlier in this notebook
                # (reviewText, summary, overall, reviewerName, asin).
                COLUMN_ALIASES = {
                    'text': ('Review Text', 'reviewText'),
                    'summary': ('Summary', 'summary'),
                    'rating': ('Overall Rating', 'overall'),
                    'reviewer': ('Reviewer Name', 'reviewerName'),
                    'product': ('ASIN', 'asin'),
                }

                def __init__(self, model_name="all-MiniLM-L6-v2", collection_name="reviews", persist_dir="./chroma_db"):
                    os.makedirs(persist_dir, exist_ok=True)
                    self.model = SentenceTransformer(model_name)
                    self.client = chromadb.PersistentClient(path=persist_dir)
                    self.collection = self.client.get_or_create_collection(name=collection_name)
                    print(f"✅ Full RAG system initialized with {model_name}")

                @staticmethod
                def _to_rating(value):
                    """Return ``value`` as a float, or None when it is missing, NaN or non-numeric."""
                    try:
                        rating = float(value)
                    except (TypeError, ValueError):
                        return None
                    return None if pd.isna(rating) else rating

                def build_database(self, file_path: str):
                    """Embed every usable review in the CSV and store it in the collection.

                    Args:
                        file_path: Path to a ``.csv`` file of reviews.

                    Raises:
                        ValueError: If the file is not a CSV, or if no row yields
                            a combined summary + text longer than 10 characters
                            (typically because the expected columns are absent).
                    """
                    if not file_path.endswith('.csv'):
                        raise ValueError("Only CSV supported in this demo")
                    df = pd.read_csv(file_path)

                    # Resolve each logical field to the first alias present in the file.
                    columns = {
                        field: next((name for name in aliases if name in df.columns), None)
                        for field, aliases in self.COLUMN_ALIASES.items()
                    }

                    def cell(row, field, default):
                        """Value of ``field`` in ``row``; ``default`` if the column or value is missing."""
                        column = columns[field]
                        if column is None:
                            return default
                        value = row[column]
                        # pandas reads empty cells as NaN; str(NaN) would
                        # otherwise index the literal text "nan".
                        return default if pd.isna(value) else value

                    texts = []
                    metadatas = []
                    ids = []

                    for idx, row in df.iterrows():
                        review_text = str(cell(row, 'text', ''))
                        summary = str(cell(row, 'summary', ''))
                        combined = f"{summary} {review_text}".strip()

                        # Skip rows too short to carry any useful signal.
                        if len(combined) <= 10:
                            continue

                        metadata = {
                            'reviewer': str(cell(row, 'reviewer', 'Anonymous')),
                            'product': str(cell(row, 'product', 'Unknown')),
                        }
                        # Store a rating only when a real number is available;
                        # query() reports 'N/A' for reviews without one.
                        rating = self._to_rating(cell(row, 'rating', None))
                        if rating is not None:
                            metadata['rating'] = rating

                        texts.append(combined)
                        metadatas.append(metadata)
                        ids.append(f"review_{idx}")

                    if not texts:
                        raise ValueError(
                            f"No usable reviews found in {file_path}. Expected columns such as "
                            "'Review Text'/'Summary' (or 'reviewText'/'summary')."
                        )

                    # Generate embeddings
                    print(f"🤖 Generating embeddings for {len(texts)} reviews...")
                    embeddings = self.model.encode(texts, show_progress_bar=True)

                    # Store in ChromaDB. The collection is persistent and the ids
                    # are deterministic, so upsert keeps re-runs idempotent.
                    print("💾 Storing in vector database...")
                    batch_size = 100
                    for start in range(0, len(texts), batch_size):
                        end = min(start + batch_size, len(texts))
                        self.collection.upsert(
                            documents=texts[start:end],
                            embeddings=embeddings[start:end].tolist(),
                            metadatas=metadatas[start:end],
                            ids=ids[start:end]
                        )

                    print(f"✅ Successfully stored {len(texts)} reviews!")

                def query(self, question: str, n_results: int = 5):
                    """Return the reviews closest to ``question`` plus a short summary.

                    Args:
                        question: Free-text question to embed and search with.
                        n_results: Maximum number of reviews to retrieve.

                    Returns:
                        dict with keys ``question``, ``context_reviews`` (list of
                        dicts with ``text``, ``rating``, ``reviewer``) and ``answer``.
                    """
                    query_embedding = self.model.encode([question])[0].tolist()
                    results = self.collection.query(
                        query_embeddings=[query_embedding],
                        n_results=n_results
                    )

                    # Format response
                    context_reviews = []
                    if results['documents'] and results['documents'][0]:
                        hit_metadatas = results['metadatas'][0] if results['metadatas'] else []
                        for i, doc in enumerate(results['documents'][0]):
                            # A hit may have no metadata entry at all.
                            metadata = (hit_metadatas[i] if i < len(hit_metadatas) else None) or {}
                            context_reviews.append({
                                'text': doc[:300] + "..." if len(doc) > 300 else doc,
                                'rating': metadata.get('rating', 'N/A'),
                                'reviewer': metadata.get('reviewer', 'Anonymous')
                            })

                    if context_reviews:
                        ratings = [
                            r['rating'] for r in context_reviews
                            if isinstance(r['rating'], (int, float)) and not pd.isna(r['rating'])
                        ]
                        # Averaging an empty list would produce NaN; report N/A instead.
                        average = f"{sum(ratings) / len(ratings):.1f}" if ratings else "N/A"
                        answer = f"Found {len(context_reviews)} relevant reviews. Average rating: {average}"
                    else:
                        answer = "No relevant reviews found."

                    return {
                        'question': question,
                        'context_reviews': context_reviews,
                        'answer': answer
                    }

            # Use full system
            rag_system = ProductReviewRAGSystem()

        except Exception as e:
            print(f"❌ Error initializing full system: {e}")
            use_full_system = False

    if not use_full_system:
        # Use simplified system
        rag_system = SimpleRAGSystem()

    # Get dataset path (a closed stdin or Ctrl+C here ends the session cleanly).
    try:
        dataset_path = input("\n📁 Enter path to your reviews dataset (CSV file): ").strip()
    except (EOFError, KeyboardInterrupt):
        print("\n👋 Goodbye!")
        return

    if not os.path.exists(dataset_path):
        print(f"❌ File not found: {dataset_path}")
        return

    # Build/load database
    try:
        if use_full_system:
            rag_system.build_database(dataset_path)
        else:
            rag_system.load_and_process_data(dataset_path)

        print("\n✅ Database ready!")

    except Exception as e:
        print(f"❌ Error building database: {e}")
        return

    # Interactive querying
    print("\n" + "=" * 50)
    print("🤖 RAG System Ready! Ask questions about the reviews.")
    print("Type 'quit' to exit.")
    print("=" * 50)

    while True:
        try:
            question = input("\n❓ Your question: ").strip()

            if question.lower() in ['quit', 'exit', 'q']:
                print("👋 Goodbye!")
                break

            if not question:
                continue

            # Get answer
            if use_full_system:
                response = rag_system.query(question)
            else:
                response = rag_system.answer_question(question)

            # Display results
            print(f"\n🎯 Question: {response['question']}")
            print(f"🤖 Answer: {response['answer']}")

            if response.get('context_reviews'):
                print(f"📚 Based on {len(response['context_reviews'])} reviews")

                show_context = input("🔍 Show context reviews? (y/n): ").strip().lower()
                if show_context == 'y':
                    for i, review in enumerate(response['context_reviews'][:3], 1):
                        print(f"\n   Review {i}:")
                        print(f"   ⭐ Rating: {review['rating']}")
                        print(f"   👤 Reviewer: {review.get('reviewer', 'N/A')}")
                        print(f"   📝 Text: {review['text']}")

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop: once stdin is closed every later
            # input() fails too, so treating it as a per-question error would
            # spin forever.
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")

# Entry point: start the interactive demo only when this code runs as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()