## 1. Install and Import the Required Libraries

In [1]:
# Install all the required libraries
!pip install -U -q openai chromadb sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.7/20.7 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.0/488.0 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.1/103.1 kB[0m

In [3]:
# Import all the required Libraries

from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import openai
import ast
import time
import requests
from chromadb import Client, Settings
from sentence_transformers import SentenceTransformer, CrossEncoder
from typing import List, Dict, Any, Tuple
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction 
from bs4 import BeautifulSoup
import re
import hashlib
from datetime import datetime, timedelta
import numpy as np
from openai import OpenAI

## Layer 1: The Embedding Layer (Processing, Cleaning, Chunking)

### Data Processing and Cleaning

In [4]:
# Myntra fashion data set path
file_path = "/kaggle/input/myntra-fashion-product-dataset/Fashion Dataset v2.csv"

In [36]:
# --- GLOBAL CONFIGURATION (Simulating OpenAI RAG Stack) ---
# Embedding Model (Bi-Encoder): OpenAI's model for quality semantic retrieval
EMBEDDING_MODEL_NAME = 'text-embedding-ada-002' 
# Cross-Encoder (Re-ranker): HuggingFace model for post-retrieval ranking
RE_RANKER_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2' 
# LLM (Generator): OpenAI's powerful model for synthesis
LLM_MODEL_NAME = 'gpt-3.5-turbo' 

CHROMA_COLLECTION_NAME = "fashion_products_reranked"
CHROMA_CACHE_COLLECTION_NAME = "cache_fashion_products_reranked"
CHROMA_PATH = "chroma_db_reranked"

# API Key Placeholder: Replace 'YOUR_API_KEY_HERE' with your actual OpenAI API key.
# The script relies on this key for the OpenAIEmbeddingFunction.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY_HERE")

# Global cache for storing final, re-ranked search results to avoid re-execution
QUERY_CACHE: Dict[str, Tuple[List[Dict[str, Any]], str]] = {}

In [6]:
 def clean_text(text):
     """Cleans product text by removing HTML tags and extra whitespace."""
     if pd.isna(text):
         return ""
         
     soup = BeautifulSoup(text, 'html.parser')
     cleaned = soup.get_text()
     cleaned = re.sub(r'\s+', ' ', cleaned)
     return cleaned.strip()

In [7]:
def preprocess_data(csv_path):
    """Clean and preprocess the dataset"""
    df = pd.read_csv(csv_path)
    # Clean description
    df['clean_description'] = df['description'].apply(clean_text)
    
    # Fill missing values
    df['ratingCount'] = df['ratingCount'].fillna(0)
    df['avg_rating'] = df['avg_rating'].fillna(0.0)
    
    # Create a comprehensive text field for embedding
    df['full_text'] = df.apply(create_full_text, axis=1)
    
    return df

In [8]:
def create_full_text(row):
    """Combine all relevant fields into a single searchable text"""
    parts = [
        f"Product: {row['name']}",
        f"Brand: {row['brand']}",
        f"Price: ₹{row['price']}",
        f"Color: {row['colour']}",
        f"Type: {row['products']}",
        f"Rating: {row['avg_rating']:.1f}/5 ({int(row['ratingCount'])} reviews)",
        f"Description: {row['clean_description']}"
    ]
    return " | ".join(parts)

In [9]:
data = preprocess_data(file_path)

In [10]:
data.head()

Unnamed: 0,p_id,name,products,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes,clean_description,full_text
0,17048614,Khushal K Women Black Ethnic Motifs Printed Ku...,"Kurta, Palazzos, Dupatta",5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...",Black printed Kurta with Palazzos with dupatta...,Product: Khushal K Women Black Ethnic Motifs P...
1,16524740,InWeave Women Orange Solid Kurta with Palazzos...,"Kurta, Palazzos, Floral Print Dupatta",5899.0,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...",Orange solid Kurta with Palazzos with dupattaK...,Product: InWeave Women Orange Solid Kurta with...
2,16331376,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,"Kurta, Trousers, Dupatta",4899.0,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...",Navy blue embroidered Kurta with Trousers with...,Product: Anubhutee Women Navy Blue Ethnic Moti...
3,14709966,Nayo Women Red Floral Printed Kurta With Trous...,"Kurta, Trouser, Dupatta",3699.0,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...",Red printed kurta with trouser and dupattaKurt...,Product: Nayo Women Red Floral Printed Kurta W...
4,11056154,AHIKA Women Black & Green Printed Straight Kurta,Kurta,1350.0,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size...","Black and green printed straight kurta, has a ...",Product: AHIKA Women Black & Green Printed Str...


### Data Chunking Strategy

In [11]:
 def chunking_product_based(df: pd.DataFrame) -> List[Dict[str, Any]]:
     """
     Strategy: Product-based chunking
     Each product is one semantic unit        
     """
     chunks = []
     for idx, row in df.iterrows():
         chunks.append({
             'chunk_id': f"prod_{row['p_id']}",
             'product_id': row['p_id'],
             'text': row['full_text'],
             'metadata': {
                 'name': row['name'],
                 'brand': row['brand'],
                 'price': float(row['price']),
                 'colour': row['colour'],
                 'products': row['products'],
                 'rating': float(row['avg_rating']),
                 'image_url': row['img'],
                 'chunk_strategy': 'product_based'
             }
         })
     
     return chunks

In [12]:
def chunking_fixed_size(df: pd.DataFrame, chunk_size: int = 512) -> List[Dict[str, Any]]:
    """
    Strategy 1: Fixed-size chunking
    Splits long descriptions into fixed-size chunks
    Pros: Simple, predictable
    Cons: May break semantic meaning
    """
    chunks = []
    for idx, row in df.iterrows():
        text = row['full_text']
        
        # Split into chunks
        for i in range(0, len(text), chunk_size):
            chunk_text = text[i:i + chunk_size]
            chunks.append({
                'chunk_id': f"{row['p_id']}_chunk_{i // chunk_size}",
                'product_id': row['p_id'],
                'text': chunk_text,
                'metadata': {
                    'name': row['name'],
                    'brand': row['brand'],
                    'price': float(row['price']),
                    'colour': row['colour'],
                    'products': row['products'],
                        'rating': float(row['avg_rating']),
                        'image_url': row['img'],
                        'chunk_strategy': 'fixed_size'
                    }
            })
        
    return chunks

In [13]:
def chunking_semantic_sections(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """
    Strategy 3: Semantic section-based chunking
    Splits products into semantic sections (basic info, pricing, description)
    """
    chunks = []
    for idx, row in df.iterrows():
        product_id = row['p_id']
        
        # Chunk 1: Basic product info
        basic_info = f"Product: {row['name']} | Brand: {row['brand']} | Type: {row['products']} | Color: {row['colour']}"
        chunks.append({
            'chunk_id': f"{product_id}_basic",
            'product_id': product_id,
            'text': basic_info,
            'metadata': {
                'name': row['name'],
                'brand': row['brand'],
                'price': float(row['price']),
                'colour': row['colour'],
                'products': row['products'],
                'rating': float(row['avg_rating']),
                'image_url': row['img'],
                'chunk_type': 'basic_info',
                'chunk_strategy': 'semantic_sections'
            }
        })
            
        # Chunk 2: Pricing and ratings
        pricing_info = f"{row['name']} by {row['brand']} | Price: ₹{row['price']} | Rating: {row['avg_rating']:.1f}/5 based on {int(row['ratingCount'])} customer reviews"
        chunks.append({
            'chunk_id': f"{product_id}_pricing",
            'product_id': product_id,
            'text': pricing_info,
            'metadata': {
                'name': row['name'],
                'brand': row['brand'],
                'price': float(row['price']),
                'colour': row['colour'],
                'products': row['products'],
                'rating': float(row['avg_rating']),
                'image_url': row['img'],
                'chunk_type': 'pricing',
                'chunk_strategy': 'semantic_sections'
            }
        })
            
        # Chunk 3: Detailed description
        if row['clean_description']:
            desc_text = f"{row['name']} - {row['clean_description']}"
            chunks.append({
                'chunk_id': f"{product_id}_description",
                'product_id': product_id,
                'text': desc_text,
                'metadata': {
                    'name': row['name'],
                    'brand': row['brand'],
                    'price': float(row['price']),
                    'colour': row['colour'],
                    'products': row['products'],
                    'rating': float(row['avg_rating']),
                    'image_url': row['img'],
                    'chunk_type': 'description',
                    'chunk_strategy': 'semantic_sections'
                }
            })
        
        return chunks

In [14]:
# Chunking Strategy based on product
product_based_chunks = chunking_product_based(data)
print(f"Created {len(product_based_chunks)} Product based chunks")

Created 14214 Product based chunks


In [15]:
# Chunking Stragedy based semantic sections
semantic_sections_chunks = chunking_semantic_sections(data)
print(f"Created {len(semantic_sections_chunks)} semantic sections based chunks")

Created 3 semantic sections based chunks


In [16]:
# Chunking Stragedy based on fixed size
fixed_sized_chunks = chunking_fixed_size(data)
print(f"Created {len(fixed_sized_chunks)} fixed sized based chunks")

Created 17816 fixed sized based chunks


### Embedding Generation and ChromaDB Vector database operations

In [17]:
import chromadb

# Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

In [18]:
# OpenAI embedding function
embedding_function = OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name=EMBEDDING_MODEL_NAME
)

In [19]:
# Create or get collection
collection = chroma_client.get_or_create_collection(
    name=CHROMA_COLLECTION_NAME,
    embedding_function=embedding_function,
    metadata={"description": "Fashion product embeddings"}
)

In [20]:
 def add_chunks_to_db(chunks: List[Dict[str, Any]]) -> None:
    """Add chunks to ChromaDB"""
    print(f"Adding {len(chunks)} chunks to ChromaDB...")
    
    ids = [chunk['chunk_id'] for chunk in chunks]
    documents = [chunk['text'] for chunk in chunks]
    metadatas = [chunk['metadata'] for chunk in chunks]
    
    # Add in batches to avoid API limits
    batch_size = 1000
    for i in range(0, len(chunks), batch_size):
        batch_ids = ids[i:i+batch_size]
        batch_docs = documents[i:i+batch_size]
        batch_meta = metadatas[i:i+batch_size]
        
        collection.add(
            ids=batch_ids,
            documents=batch_docs,
            metadatas=batch_meta
        )
        print(f"Processed {min(i+batch_size, len(chunks))}/{len(chunks)} chunks")
    
    print("All chunks added to ChromaDB")

In [21]:
# # Add the chunks and metadata to the collection along with generic integer IDs.
add_chunks_to_db(product_based_chunks)

print(f"Vector database ready: {collection.count()} chunks")

Adding 14214 chunks to ChromaDB...
Processed 1000/14214 chunks
Processed 2000/14214 chunks
Processed 3000/14214 chunks
Processed 4000/14214 chunks
Processed 5000/14214 chunks
Processed 6000/14214 chunks
Processed 7000/14214 chunks
Processed 8000/14214 chunks
Processed 9000/14214 chunks
Processed 10000/14214 chunks
Processed 11000/14214 chunks
Processed 12000/14214 chunks
Processed 13000/14214 chunks
Processed 14000/14214 chunks
Processed 14214/14214 chunks
All chunks added to ChromaDB
Vector database ready: 14214 chunks


In [22]:
# Let's take a look at the first few entries in the collection

collection.get(
    ids = ['prod_17048614','prod_16524740','prod_16331376'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['prod_17048614', 'prod_16524740', 'prod_16331376'],
 'embeddings': array([[-0.02576004,  0.00104004,  0.0129606 , ...,  0.00408628,
          0.01067067, -0.01533783],
        [-0.00019932, -0.00751893, -0.0128093 , ..., -0.01083864,
          0.02223276, -0.01213477],
        [-0.02945477, -0.00898833,  0.01080448, ..., -0.01870312,
          0.0123961 , -0.01075165]]),
 'documents': ["Product: Khushal K Women Black Ethnic Motifs Printed Kurta with Palazzos & With Dupatta | Brand: Khushal K | Price: ₹5099.0 | Color: Black | Type: Kurta, Palazzos, Dupatta | Rating: 4.4/5 (4522 reviews) | Description: Black printed Kurta with Palazzos with dupatta Kurta design: Ethnic motifs printed Anarkali shape Regular style Mandarin collar, three-quarter regular sleeves Calf length with flared hem Viscose rayon machine weave fabric Palazzos design: Printed Palazzos Elasticated waistband Slip-on closure Dupatta Length 2.43 meters Width: 88 cmThe model (height 5'8) is wearing a size S100% Ray

In [23]:
# Cache Collection
cache_collection = chroma_client.get_or_create_collection(
    name=CHROMA_CACHE_COLLECTION_NAME, 
    embedding_function=embedding_function,
    metadata={"description": "Persistent cache for search results"}
)

In [24]:
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents', 'embeddings'],
 'data': None,
 'metadatas': []}

## Layer 2: The Search Layer (Retrieval, Re-ranking, & Caching)

### Search Module with Caching

In [25]:
# Caches search results in a persistent ChromaDB collection.
# Each cache entry is uniquely identified by a hash of (query, n_results).

def make_key(query: str, n_results: int) -> str:
    cache_key = json.dumps({"query": query, "n_results": n_results})
    return hashlib.sha256(cache_key.encode()).hexdigest()

def get_cache_collection(query: str, n_results: int) -> Dict[str, Any] or None:
    key = make_key(query, n_results)
    results = cache_collection.get(ids=[key], include=["metadatas", "documents"])
    
    if results and results["ids"]:
        metadata = results["metadatas"][0]
        # Parse timestamp from metadata for TTL check
        last_accessed = datetime.fromisoformat(metadata.get("timestamp", "1970-01-01T00:00:00"))
        if datetime.now() - last_accessed < timedelta(hours=1):
            # Return the cached search (stored as document JSON)
            return json.loads(results["documents"][0])
        # remove stale cache
        cache_collection.delete(ids=[key])
    return None

def set_cache_collection(query: str, n_results: int, search_result: Dict[str, Any]) -> None:
    key = make_key(query, n_results)
    timestamp = datetime.now().isoformat()
    metadata = {
        "query": query,
        "n_results": n_results,
        "timestamp": timestamp
    }
    cache_collection.add(documents=[json.dumps(search_result)], metadatas=[metadata], ids=[key])


def clear():
    # Clear all cache entries
    all_ids = cache_collection.get(include=[])["ids"]
    if all_ids:
        cache_collection.delete(ids=all_ids)

### Search and re-ranking operations

In [26]:
 # Initialise the cross encoder model
cross_encoder = CrossEncoder(RE_RANKER_MODEL_NAME) #max_length=512

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [27]:
# Handles search and re-ranking operations

def search(query: str, n_results: int = 10, use_cache: bool = True) -> Dict[str, Any]:
        """
        Search ChromaDB with caching
        
        Args:
            query: Search query
            n_results: Number of results to retrieve
            use_cache: Whether to use cache
            
        Returns:
            Dictionary with search results
        """
        # Check cache first
        if use_cache:
            cached_results = get_cache_collection(query, n_results)
            if cached_results is not None:
                return cached_results
        
        # Perform search
        print(f"\nSearching for: '{query}'")
        results = collection.query(
            query_texts=[query],
            n_results=n_results
        )
        
        # Format results
        formatted_results = {
            'query': query,
            'n_results': n_results,
            'ids': results['ids'][0],
            'documents': results['documents'][0],
            'metadatas': results['metadatas'][0],
            'distances': results['distances'][0]
        }
        
        # Cache results
        if use_cache:
            set_cache_collection(query, n_results, formatted_results)
        
        return formatted_results

In [28]:
def rerank_results(query: str, search_results: Dict[str, Any], top_k: int = 3) -> List[Dict[str, Any]]:
    """
    Re-rank search results using cross-encoder
    
    Args:
        query: Original search query
        search_results: Results from vector search
        top_k: Number of top results to return after re-ranking
        
    Returns:
        List of re-ranked results with scores
    """
    documents = search_results['documents']
    metadatas = search_results['metadatas']
    ids = search_results['ids']
    
    # Create query-document pairs
    pairs = [(query, doc) for doc in documents]
    
    # Get cross-encoder scores
    print(f"Re-ranking {len(documents)} results...")
    scores = cross_encoder.predict(pairs)
    
    # Sort by score (descending)
    ranked_indices = np.argsort(scores)[::-1]
        
    # Get top_k results
    reranked_results = []
    for idx in ranked_indices[:top_k]:
        reranked_results.append({
            'rank': len(reranked_results) + 1,
            'chunk_id': ids[idx],
            'document': documents[idx],
            'metadata': metadatas[idx],
            'rerank_score': float(scores[idx]),
            'original_distance': search_results['distances'][idx]
        })
    
    return reranked_results

In [29]:
def search_products(query: str, retrieve_n: int = 10, top_k: int = 3, 
                      use_cache: bool = True) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
    """
    Complete search pipeline: retrieve -> re-rank
    
    Args:
        query: Search query
        retrieve_n: Number of results to retrieve initially
        top_k: Number of top results after re-ranking
        use_cache: Whether to use cache
        
    Returns:
        Tuple of (search_results, reranked_results)
    """
    # Step 1: Vector search
    search_results = search(query, n_results=retrieve_n, use_cache=use_cache)
    
    # Step 2: Re-rank
    reranked_results = rerank_results(query, search_results, top_k=top_k)
    
    return search_results, reranked_results

## Layer 3: The Generation Layer (LLM Simulation)
### Retrieval Augmented Generation
Now that we have the final top search results, we can pass it to an GPT 3.5 along with the user query and a well-engineered prompt, to generate a direct answer to the query for product recommendations.

In [30]:
# Handles LLM-based generation for product recommendations
def create_recommendation_prompt(query: str, products: List[Dict[str, Any]], 
                                include_examples: bool = True) -> str:
    """
    Create comprehensive prompt for product recommendation
    
    Args:
        query: User's search query
        products: List of re-ranked product results
        include_examples: Whether to include few-shot examples
        
    Returns:
        Complete prompt string
    """
    
    # Few-shot examples (optional)
    few_shot_examples = ""
    if include_examples:
        few_shot_examples = """
## Examples:

**Query:** "I need a formal black dress for office wear"
**Recommendation:**
Based on your search for formal black office wear, I recommend:

1. **Professional Black A-Line Dress by Van Heusen** (₹2,499)
   - Perfect for office settings with its sophisticated A-line silhouette
   - Features a classic mandarin collar and three-quarter sleeves
   - Made from comfortable cotton blend material
   - Rating: 4.3/5 (highly rated by professionals)
   - Ideal for: Daily office wear, business meetings, formal presentations

2. **Elegant Black Sheath Dress by Allen Solly** (₹3,199)
   - Timeless sheath design that flatters all body types
   - Knee-length with comfortable stretch fabric
   - Easy to style with blazers or statement jewelry
   - Rating: 4.5/5 (customer favorite)
   - Ideal for: Corporate events, client meetings, office parties

These options combine professionalism with comfort and style, perfect for your office wardrobe.

---

**Query:** "Summer cotton kurta under 2000"
**Recommendation:**
Great choice for comfortable summer wear! Here are my top picks under ₹2,000:

1. **Lightweight Cotton Kurta by Libas** (₹1,299)
   - 100% breathable cotton perfect for hot weather
   - Beautiful floral print in vibrant colors
   - Straight cut with comfortable fit
   - Rating: 4.4/5 (loved for comfort)
   - Ideal for: Casual outings, work from home, everyday wear

2. **Printed Cotton Anarkali by Sangria** (₹1,799)
   - Airy Anarkali style with flared bottom
   - Soft cotton fabric with ethnic prints
   - Comes with matching dupatta
   - Rating: 4.2/5
   - Ideal for: Festive occasions, casual parties, shopping trips

Both options offer excellent value and will keep you cool and stylish all summer!

---
"""
        
        # Build product context
        product_context = "\n\n".join([
            f"""**Product {i+1}:**
- Name: {p['metadata']['name']}
- Brand: {p['metadata']['brand']}
- Price: ₹{p['metadata']['price']}
- Color: {p['metadata']['colour']}
- Type: {p['metadata']['products']}
- Rating: {p['metadata']['rating']}/5
- Full Description: {p['document']}
- Relevance Score: {p['rerank_score']:.4f}"""
            for i, p in enumerate(products)
        ])
        
        prompt = f"""You are an expert fashion consultant and personal stylist working for Myntra, India's leading fashion e-commerce platform. Your role is to provide thoughtful, personalized product recommendations based on customer queries.

## Instructions:

1. **Analyze the Query**: Understand the customer's specific needs, preferences, and context
2. **Review Products**: Carefully examine the top-ranked products provided below
3. **Provide Recommendations**: Write a comprehensive, friendly recommendation that includes:
   - A warm, personalized introduction addressing their needs
   - Detailed recommendations for each product (2-3 products)
   - Specific reasons why each product suits their query
   - Practical styling tips or use-case suggestions
   - Price-value analysis when relevant
   - Any important considerations (size, fit, care instructions)

4. **Tone & Style**:
   - Be conversational yet professional
   - Show genuine enthusiasm for fashion
   - Use second person ("you", "your") to make it personal
   - Be specific and actionable, not generic
   - Highlight unique features and benefits

5. **Format**:
   - Use clear sections with headings
   - Use bullet points for easy scanning
   - Include emojis sparingly for visual appeal (optional)
   - Keep each product recommendation detailed but concise

{few_shot_examples}

## Customer Query:
"{query}"

## Top Matching Products:
{product_context}

## Your Recommendation:
"""
        
    return prompt

In [31]:
def generate_recommendation(query: str, reranked_products: List[Dict[str, Any]], 
                            temperature: float = 0.7, max_tokens: int = 1000) -> str:
    """
    Generate product recommendation using GPT
    
    Args:
        query: User's search query
        reranked_products: Top products after re-ranking
        temperature: Generation temperature (0-1)
        max_tokens: Maximum tokens to generate
        
    Returns:
        Generated recommendation text
    """
    prompt = create_recommendation_prompt(query, reranked_products)
    
    print(f"\nGenerating recommendation with {LLM_MODEL_NAME}...")

    client = OpenAI(api_key=OPENAI_API_KEY)
    response = client.chat.completions.create(
        model=LLM_MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are an expert fashion consultant providing personalized product recommendations."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
        
    recommendation = response.choices[0].message.content
        
    return recommendation

In [32]:
def complete_pipeline(query: str, retrieve_n: int = 10, top_k: int = 3) -> dict:
    """
    Run complete search and recommendation pipeline
    
    Returns:
        Dictionary containing all results
    """
    print("\n" + "=" * 60)
    print(f"QUERY: {query}")
    print("=" * 60)
    
    # Search and re-rank
    search_results, reranked_results = search_products(
        query=query,
        retrieve_n=retrieve_n,
        top_k=top_k
    )
    
    # Generate recommendation
    recommendation = generate_recommendation(query, reranked_results)
    
    return {
        'query': query,
        'initial_results_count': len(search_results['documents']),
        'top_k_results': reranked_results,
        'recommendation': recommendation
    }

In [33]:
def display_results(results: dict) -> None:
    """Display results in a formatted way"""
    print("\n" + "=" * 60)
    print("TOP RETRIEVED CHUNKS (After Re-ranking)")
    print("=" * 60)
    
    for result in results['top_k_results']:
        print(f"\n[Rank {result['rank']}] Score: {result['rerank_score']:.4f}")
        print(f"Product: {result['metadata']['name']}")
        print(f"Brand: {result['metadata']['brand']}")
        print(f"Price: ₹{result['metadata']['price']}")
        print(f"Color: {result['metadata']['colour']}")
        print(f"Rating: {result['metadata']['rating']}/5")
        print(f"Description: {result['document']}")
        print("-" * 60)
    
    print("\n" + "=" * 60)
    print("LLM GENERATED RECOMMENDATION")
    print("=" * 60)
    print(results['recommendation'])
    print("=" * 60)

In [34]:
# Test queries

test_queries = [
        "Black ethnic kurta with palazzo for women under 6000",
        "Formal office wear dress with good rating",
        "Traditional Indian wedding outfit in red or maroon"
    ]

# Run complete pipeline for each query
for query in test_queries:
    results = complete_pipeline(query, retrieve_n=10, top_k=3)
    display_results(results)
    print("\n\n")


QUERY: Black ethnic kurta with palazzo for women under 6000

Searching for: 'Black ethnic kurta with palazzo for women under 6000'
Re-ranking 10 results...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Generating recommendation with gpt-3.5-turbo...

TOP RETRIEVED CHUNKS (After Re-ranking)

[Rank 1] Score: 5.4974
Product: Khushal K Women Black Ethnic Motifs Printed Kurta with Palazzos & With Dupatta
Brand: Khushal K
Price: ₹5099.0
Color: Black
Rating: 4.4183989385227775/5
Description: Product: Khushal K Women Black Ethnic Motifs Printed Kurta with Palazzos & With Dupatta | Brand: Khushal K | Price: ₹5099.0 | Color: Black | Type: Kurta, Palazzos, Dupatta | Rating: 4.4/5 (4522 reviews) | Description: Black printed Kurta with Palazzos with dupatta Kurta design: Ethnic motifs printed Anarkali shape Regular style Mandarin collar, three-quarter regular sleeves Calf length with flared hem Viscose rayon machine weave fabric Palazzos design: Printed Palazzos Elasticated waistband Slip-on closure Dupatta Length 2.43 meters Width: 88 cmThe model (height 5'8) is wearing a size S100% RayonMachine wash
------------------------------------------------------------

[Rank 2] Score: 5.3453
Product: A

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Generating recommendation with gpt-3.5-turbo...

TOP RETRIEVED CHUNKS (After Re-ranking)

[Rank 1] Score: -3.4164
Product: MANGO Women Beige SARAH Regular-Fit Double-Breasted Formal Blazer
Brand: MANGO
Price: ₹8990.0
Color: Beige
Rating: 4.4/5
Description: Product: MANGO Women Beige SARAH Regular-Fit Double-Breasted Formal Blazer | Brand: MANGO | Price: ₹8990.0 | Color: Beige | Type: Blazer, Formal | Rating: 4.4/5 (10 reviews) | Description: Beige solid formal blazer, has a notched lapel collar, long sleeves, double-breasted with button closure, two pocketsRegular FitThe model (height 5'8") is wearing a size SFabric: 50% Polyester, 30% Viscose, 19% Wool, 1% Elastane Lining: 50% Polyester, 50% Viscose Piping: 50% Polyester, 50% ViscoseDry Clean
------------------------------------------------------------

[Rank 2] Score: -3.6286
Product: MANGO Women Brown Checked GALATA Regular-Fit Double-Breasted Formal Blazer
Brand: MANGO
Price: ₹7990.0
Color: Brown
Rating: 4.444444444444445/5
Descri

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Generating recommendation with gpt-3.5-turbo...

TOP RETRIEVED CHUNKS (After Re-ranking)

[Rank 1] Score: -0.8735
Product: SWAGG INDIA Women Maroon Ethnic Motifs Chikankari Embroidered Kurta with Palazzos
Brand: SWAGG INDIA
Price: ₹3999.0
Color: Maroon
Rating: 4.8/5
Description: Product: SWAGG INDIA Women Maroon Ethnic Motifs Chikankari Embroidered Kurta with Palazzos | Brand: SWAGG INDIA | Price: ₹3999.0 | Color: Maroon | Type: Kurta, Palazzos | Rating: 4.8/5 (15 reviews) | Description: Maroon embroidered Kurta with PalazzosKurta design: Ethnic motifs Chikankari embroideredStraight shapeRegular styleRound neck, three-quarter regular sleevesSequinned detailKnee length with straight hemPoly georgette machine weave fabricPalazzos design: Solid PalazzosElasticated waistbandSlip-on closure Kurta : Chiffon georgetteBottom: Chiffon georgetteHand-washThe model (height 5'8) is wearing a size M
------------------------------------------------------------

[Rank 2] Score: -2.0809
Product: Vedic