In [6]:
import os
import time
import logging
import re

import pandas as pd
import faiss
from dotenv import load_dotenv

from IPython.display import Image, display
from langchain.docstore.document import Document
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq


In [7]:
# Send INFO-and-above records to a log file, each line stamped with time and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="semantic_search.log",
)

# Logger used by the rest of the notebook.
logger = logging.getLogger(__name__)


In [8]:
# Embedding model setup.
# `model_name` is handed unchanged to HuggingFaceEmbeddings. The value below is a
# relative path, so it presumably resolves against the notebook's working
# directory — TODO confirm the checkpoint exists there before running.
model_name = "models/fine-tuned-sbert-triplet_20241224_171119_20241226_123849"  # Replace with your model path or name

# Build the embedding wrapper; it is later passed to build_faiss_vector_store.
embeddings = HuggingFaceEmbeddings(model_name=model_name)
# Announce the chosen model both in the cell output and through the logger.
print(f"HuggingFaceEmbeddings initialized with model: {model_name}")
logger.info(f"HuggingFaceEmbeddings initialized with model: {model_name}")


HuggingFaceEmbeddings initialized with model: models/fine-tuned-sbert-triplet_20241224_171119_20241226_123849


In [9]:

# Load variables via python-dotenv, then read the Groq key from the environment.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Fail fast when the key is missing or empty instead of failing later at call time.
if GROQ_API_KEY is None or GROQ_API_KEY == "":
    raise ValueError("GROQ_API_KEY not found. Please set it in the .env file.")



In [10]:
# Chat model client used by the retrieval chain.
# Replies are capped at 200 tokens and sampled with temperature 0.7.
llm = ChatGroq(
    model_name="Llama3-8b-8192",  # Example model name
    groq_api_key=GROQ_API_KEY,
    max_tokens=200,
    temperature=0.7,
)
logger.info("ChatGroq LLM initialized successfully.")

In [30]:
# Example catalogue, written one record per row.
# Products: three jeans listings with gender and base colour attributes.
df_products = pd.DataFrame([
    {
        "id": 16508,
        "name": "Locomotive Men Washed Blue Jeans",
        "description": "Casual jeans for men, washed style, perfect for everyday wear.",
        "gender": "Men",
        "baseColour": "Blue",
    },
    {
        "id": 11349,
        "name": "Lee Men Blue Chicago Fit Jeans",
        "description": "Men's blue Chicago Fit Jeans, comfortable and stylish for summer.",
        "gender": "Men",
        "baseColour": "Blue",
    },
    {
        "id": 51499,
        "name": "Denizen Women Blue Jeans",
        "description": "Women's blue jeans from Denizen, casual and versatile.",
        "gender": "Women",
        "baseColour": "Blue",
    },
])

# Services: two painting offers, each tagged with a category.
df_services = pd.DataFrame([
    {
        "id": 7001,
        "serviceName": "Home Painting Service",
        "serviceDescription": "Professional home interior and exterior painting solutions.",
        "category": "Home Improvement",
    },
    {
        "id": 7002,
        "serviceName": "Car Painting Service",
        "serviceDescription": "Automotive painting service for a new look or repairs.",
        "category": "Automotive",
    },
])

logger.info(f"Loaded {len(df_products)} products and {len(df_services)} services.")


In [12]:
# NOTE(review): this cell repeats the example-data cell directly above it
# verbatim (same frames, same values, same log line). Running both just
# rebuilds identical objects; one of the two cells can be deleted.
#
# Example: Products — three jeans listings, one value per row in each column list.
df_products = pd.DataFrame({
    "id": [16508, 11349, 51499],
    "name": [
        "Locomotive Men Washed Blue Jeans",
        "Lee Men Blue Chicago Fit Jeans",
        "Denizen Women Blue Jeans"
    ],
    "description": [
        "Casual jeans for men, washed style, perfect for everyday wear.",
        "Men's blue Chicago Fit Jeans, comfortable and stylish for summer.",
        "Women's blue jeans from Denizen, casual and versatile."
    ],
    "gender": ["Men", "Men", "Women"],
    "baseColour": ["Blue", "Blue", "Blue"]
})

# Example: Services — two painting offers with a category each.
df_services = pd.DataFrame({
    "id": [7001, 7002],
    "serviceName": [
        "Home Painting Service",
        "Car Painting Service"
    ],
    "serviceDescription": [
        "Professional home interior and exterior painting solutions.",
        "Automotive painting service for a new look or repairs."
    ],
    "category": ["Home Improvement", "Automotive"]
})

# Record how many rows of each kind were built.
logger.info(f"Loaded {len(df_products)} products and {len(df_services)} services.")


In [None]:
def create_product_documents(df: pd.DataFrame):
    """
    Turn every row of a product frame into a ``Document``.

    The text of each document is "Product ID: <id>. Name: <name>. <description>".
    The metadata carries ``docType="product"`` plus the row's id, name and
    description, and ``gender`` / ``baseColour`` (``None`` when the column is
    absent from the frame).

    Args:
        df: Frame with at least ``id``, ``name`` and ``description`` columns.

    Returns:
        A list with one ``Document`` per row, in row order.
    """
    def to_document(record):
        # The mandatory columns are read by key, so a missing one raises KeyError.
        product_id, title, blurb = record['id'], record['name'], record['description']
        return Document(
            page_content=f"Product ID: {product_id}. Name: {title}. {blurb}",
            metadata={
                "docType": "product",
                "id": product_id,
                "name": title,
                "description": blurb,
                # Optional columns fall back to None.
                "gender": record.get("gender", None),
                "baseColour": record.get("baseColour", None),
            },
        )

    return [to_document(record) for _, record in df.iterrows()]

def create_service_documents(df: pd.DataFrame):
    """
    Turn every row of a service frame into a ``Document``.

    The text of each document is
    "Service ID: <id>. Name: <serviceName>. <serviceDescription>". The metadata
    carries ``docType="service"`` plus the row's id, name and description, and
    ``category`` (``None`` when the column is absent from the frame).

    Args:
        df: Frame with at least ``id``, ``serviceName`` and
            ``serviceDescription`` columns.

    Returns:
        A list with one ``Document`` per row, in row order.
    """
    def to_document(record):
        # The mandatory columns are read by key, so a missing one raises KeyError.
        service_id = record['id']
        title = record['serviceName']
        blurb = record['serviceDescription']
        return Document(
            page_content=f"Service ID: {service_id}. Name: {title}. {blurb}",
            metadata={
                "docType": "service",
                "id": service_id,
                "serviceName": title,
                "serviceDescription": blurb,
                # Optional column falls back to None.
                "category": record.get("category", None),
            },
        )

    return [to_document(record) for _, record in df.iterrows()]

# Convert both example frames, then pool the results (products first) for indexing.
product_docs = create_product_documents(df_products)
service_docs = create_service_documents(df_services)

all_docs = [*product_docs, *service_docs]
logger.info(f"Created {len(all_docs)} total documents (products + services).")



In [14]:

# ----------------------------------------------------------
# 5. Build FAISS Vector Store (Index & Docstore)
# ----------------------------------------------------------
def build_faiss_vector_store(embedding_function):
    """
    Create an empty FAISS vector store sized for ``embedding_function``.

    The embedding width is discovered by embedding the literal query "test".
    A ``faiss.IndexFlatL2`` of that width is then wrapped, together with a
    fresh ``InMemoryDocstore`` and an empty id mapping, in a ``FAISS`` store.
    Documents are added afterwards (see add_documents_to_store).

    Args:
        embedding_function: Object exposing ``embed_query(text)``.

    Returns:
        The newly created, still empty, ``FAISS`` vector store.

    Raises:
        Exception: Whatever the probe or the store construction raises; the
            error is logged and then re-raised unchanged.
    """
    # Step 1: probe the embedder once to learn the vector length.
    try:
        embedding_dim = len(embedding_function.embed_query("test"))
        logger.info(f"Embedding dimension: {embedding_dim}")
    except Exception as probe_error:
        logger.error(f"Error determining embedding dimension: {probe_error}")
        raise

    # Step 2: assemble the store around a flat L2 index of that width.
    try:
        vector_store = FAISS(
            embedding_function=embedding_function,
            index=faiss.IndexFlatL2(embedding_dim),
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )
        logger.info(f"FAISS vector store initialized (dimension={embedding_dim}).")
        return vector_store
    except Exception as init_error:
        logger.error(f"Error initializing FAISS store: {init_error}")
        raise

def add_documents_to_store(documents, vector_store, batch_size=100):
    """
    Add documents to the vector store in batches.

    Args:
        documents: Documents to add. Any iterable is accepted (list, tuple,
            generator, ...); it is materialised once so it can be sliced.
        vector_store: Object exposing ``add_documents(batch)``.
        batch_size: Maximum number of documents per ``add_documents`` call.
            Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value would
            otherwise produce an empty range and silently add nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialise so that len() and slicing also work for generators.
    documents = list(documents)
    total = len(documents)

    for start in range(0, total, batch_size):
        batch = documents[start:start + batch_size]
        vector_store.add_documents(batch)
        # Positions are reported 1-based.
        logger.info(f"Added documents {start + 1} to {start + len(batch)} out of {total}")


# Create the empty store first, then load every document, at most 100 per call.
vector_store = build_faiss_vector_store(embedding_function=embeddings)
add_documents_to_store(documents=all_docs, vector_store=vector_store, batch_size=100)



In [15]:
# Sanity check: read how many vectors the FAISS index currently holds and
# report the count through the logger as well as in the cell output.
num_vectors = vector_store.index.ntotal
logger.info(f"FAISS index contains {num_vectors} vectors.")
print(f"FAISS index contains {num_vectors} vectors.")

FAISS index contains 5 vectors.


In [16]:
# 6. Define the map_reduce prompts (the chain itself is built in create_qa_chain)
# ----------------------------------------------------------
# map_prompt is handed to the chain as "question_prompt" in create_qa_chain.
# It declares two placeholders, {context} and {question}; presumably the chain
# fills {context} with one retrieved document at a time — TODO confirm.
map_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You have the following chunk of data (could be a product or service):
{context}

User question: {question}

- Summarize any relevant items here, referencing ID and name.
- If nothing is relevant, say so.
""",
)

# reduce_prompt is handed to the chain as "combine_prompt" in create_qa_chain.
# It declares {summaries} and {question} and asks for one merged answer.
reduce_prompt = PromptTemplate(
    input_variables=["summaries", "question"],
    template="""
We have partial answers from multiple chunks:
{summaries}

Combine them into a single, cohesive answer to: "{question}"

Requirements:
1) Start with a short summary referencing relevant products or services (by ID and name).
2) Provide bullet points referencing IDs and names.
3) If no relevant items are found, say "No relevant items found."
""",
)

In [18]:
def create_qa_chain(llm, vector_store, k=10):
    """
    Build a map_reduce ``RetrievalQA`` chain on top of ``vector_store``.

    Args:
        llm: Language model handed to ``RetrievalQA.from_chain_type``.
        vector_store: Store whose ``as_retriever`` supplies the retriever.
        k: Number of documents requested from the retriever per query,
            forwarded as ``search_kwargs={"k": k}``. Defaults to 10, the value
            that was previously hard-coded, so existing calls are unaffected.

    Returns:
        The chain, configured to return its source documents alongside the
        answer and to use the module-level ``map_prompt`` / ``reduce_prompt``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    retriever = vector_store.as_retriever(search_kwargs={"k": k})
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="map_reduce",
        retriever=retriever,
        # Callers read response["source_documents"], so keep them in the result.
        return_source_documents=True,
        chain_type_kwargs={
            "question_prompt": map_prompt,    # applied to each retrieved chunk
            "combine_prompt": reduce_prompt,  # merges the per-chunk answers
        },
    )
    logger.info("map_reduce RetrievalQA chain created successfully.")
    return chain

# Chain instance that semantic_search_tool invokes.
qa_chain = create_qa_chain(llm=llm, vector_store=vector_store)


In [28]:
def _format_source_document(doc):
    """Render one retrieved document's metadata as a Markdown snippet.

    Products and services each have their own layout. A document whose
    ``docType`` is anything else yields an empty string.
    """
    meta = doc.metadata
    doc_type = meta.get("docType", "N/A")
    doc_id = meta.get("id", "N/A")

    if doc_type == "product":
        return (
            f"**Product ID:** {doc_id}\n"
            f"**Product Name:** {meta.get('name', 'N/A')}\n"
            f"- **Description:** {meta.get('description', '')}\n"
            f"- **Gender:** {meta.get('gender', 'N/A')}\n"
            f"- **Colour:** {meta.get('baseColour', 'N/A')}\n\n"
        )

    if doc_type == "service":
        return (
            f"**Service ID:** {doc_id}\n"
            f"**Service Name:** {meta.get('serviceName', 'N/A')}\n"
            f"- **Category:** {meta.get('category', 'N/A')}\n"
            f"- **Description:** {meta.get('serviceDescription', '')}\n\n"
        )

    return ""


def _run_search_once(query):
    """Invoke ``qa_chain`` a single time and format answer plus source metadata."""
    response = qa_chain.invoke(query)

    # A dict response carries the answer and the retrieved documents;
    # anything else is treated as the bare answer with no sources.
    if isinstance(response, dict):
        llm_result = response.get("result", "No result from LLM.")
        source_docs = response.get("source_documents", [])
    else:
        llm_result = response
        source_docs = []

    if not source_docs:
        return f"**LLM Result:** {llm_result}\n\nNo relevant documents found."

    header = f"**LLM Result:** {llm_result}\n\n"
    return header + "".join(_format_source_document(doc) for doc in source_docs)


def semantic_search_tool(query, max_retries=5):
    """
    Answer ``query`` with the map_reduce retrieval chain and list its sources.

    1) Calls qa_chain.invoke(query) to run the map_reduce retrieval QA.
    2) Appends product/service metadata (docType=product or service).

    An exception whose text contains "429" is treated as rate limiting and is
    retried with exponential back-off (1, 2, 4, ... seconds between attempts).
    No sleep follows the final attempt, since nothing would run after it.
    Any other exception ends the call immediately.

    Args:
        query: The user's question, passed unchanged to the chain.
        max_retries: Maximum number of attempts. Defaults to 5.

    Returns:
        A formatted string: the LLM answer followed by the source metadata, or
        an "An error occurred..." message when the call could not be completed.
        Errors are reported in the returned string rather than raised.
    """
    for attempt in range(max_retries):
        try:
            return _run_search_once(query)
        except Exception as e:
            logger.error(f"Attempt {attempt + 1} - Error: {e}")
            if "429" not in str(e):
                return f"An error occurred: {e}"
            # Back off only when another attempt will actually follow.
            if attempt + 1 < max_retries:
                sleep_time = 2 ** attempt
                logger.info(f"Rate limited. Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)

    return "An error occurred after multiple attempts."

In [29]:
# Demo: run one free-text query through the search tool and show the report.
user_query = "Tell me about painting"
print(f"Query: {user_query}\n")
# semantic_search_tool returns one formatted string (the LLM answer followed by
# the metadata of each retrieved document), so it can be printed directly.
answer = semantic_search_tool(user_query)
print(answer)


Query: Tell me about painting

**LLM Result:** Here's the combined answer:

As I've analyzed the provided data, here's a summary of relevant information related to painting:

The Car Painting Service (Service ID: 7002, Name: Car Painting Service) offers a new look or repairs, while the Home Painting Service (Service ID: 7001, Name: Home Painting Service) provides "Professional home interior and exterior painting solutions." However, I couldn't find any relevant information related to painting in the product data, including Denizen Women Blue Jeans (Product ID: 51499).

No relevant items found.

**Service ID:** 7002
**Service Name:** Car Painting Service
- **Category:** Automotive
- **Description:** Automotive painting service for a new look or repairs.

**Service ID:** 7001
**Service Name:** Home Painting Service
- **Category:** Home Improvement
- **Description:** Professional home interior and exterior painting solutions.

**Product ID:** 16508
**Product Name:** Locomotive Men Washed 