In [None]:
from langchain_core.documents import Document

documents = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Goldfish are popular pets for beginners, requiring relatively simple care.",
        metadata={"source": "fish-pets-doc"},
    ),
    Document(
        page_content="Parrots are intelligent birds capable of mimicking human speech.",
        metadata={"source": "bird-pets-doc"},
    ), 
    Document(
        page_content="Rabbits are social animals that need plenty of space to hop around.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

In [3]:
documents

[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(metadata={'source': 'fish-pets-doc'}, page_content='Goldfish are popular pets for beginners, requiring relatively simple care.'),
 Document(metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.')]

In [4]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
from langchain_groq import ChatGroq
llm=ChatGroq(model="Llama3-8b-8192")
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x0000015DD78E3790>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x0000015DD790D9C0>, model_name='Llama3-8b-8192', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [6]:
# Set HF cache
import os
os.environ['HF_HOME'] = 'D:/huggingface_cache'

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [8]:
# Vector stores 
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(documents, embeddings)
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x15d94fe7670>

In [9]:
vectorstore.similarity_search("cat")

[Document(id='c09f3dfd-8af3-4cd3-bd16-90f573ce8342', metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(id='96727245-80f6-4ff4-9828-00e2746c4ab9', metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(id='b670b57f-ba6d-468b-9a9b-483b37a64c70', metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
 Document(id='2536202c-3bdc-4cfd-862e-afee2eeafea5', metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.')]

In [10]:
# Async query
await vectorstore.asimilarity_search("cat")

[Document(id='c09f3dfd-8af3-4cd3-bd16-90f573ce8342', metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(id='96727245-80f6-4ff4-9828-00e2746c4ab9', metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(id='b670b57f-ba6d-468b-9a9b-483b37a64c70', metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
 Document(id='2536202c-3bdc-4cfd-862e-afee2eeafea5', metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.')]

In [11]:
## Retreviers
from typing import List
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

retriver=RunnableLambda(vectorstore.similarity_search).bind(k=1)
retriver.batch(['cat', 'dog'])

[[Document(id='c09f3dfd-8af3-4cd3-bd16-90f573ce8342', metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')],
 [Document(id='96727245-80f6-4ff4-9828-00e2746c4ab9', metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.')]]

In [13]:
retriever=vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={'k':1}
)
retriever.batch(["cat", "dog"])

[[Document(id='c09f3dfd-8af3-4cd3-bd16-90f573ce8342', metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')],
 [Document(id='96727245-80f6-4ff4-9828-00e2746c4ab9', metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.')]]

In [15]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

message="""
Answer this question using provided context only
{question}

- context:
{context} 
"""

prompt=ChatPromptTemplate(["human", message])

rag_chain = {"context":retriever, "question":RunnablePassthrough()}|prompt|llm

response = rag_chain.invoke("Tell me about dogs")

In [17]:
response.content

'Unfortunately, the provided context does not mention dogs. The context only talks about humans.'

In [None]:
# Import necessary libraries for document handling, LLM, embeddings, vector store, and RAG chain
from langchain_core.documents import Document
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from typing import List
from dotenv import load_dotenv
import os

# Dry Run: Load environment variables
# - load_dotenv() reads .env file containing GROQ_API_KEY for ChatGroq
# - Example: .env file has GROQ_API_KEY="abc123"
# - Output: Environment variable GROQ_API_KEY is set for use
load_dotenv()

# Dry Run: Set HuggingFace cache directory
# - Sets HF_HOME to 'D:/huggingface_cache' for storing embedding model files
# - Ensures embeddings are cached locally to avoid repeated downloads
# - Example: Directory 'D:/huggingface_cache' is used for model storage
os.environ['HF_HOME'] = 'D:/huggingface_cache'

# Dry Run: Define documents
# - Creates a list of 5 Document objects, each with page_content and metadata
# - Example document for dogs:
#   - page_content: "Dogs are great companions, known for their loyalty and friendliness."
#   - metadata: {"source": "mammal-pets-doc"}
# - Output: List of 5 Document objects stored in 'documents'
documents = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Goldfish are popular pets for beginners, requiring relatively simple care.",
        metadata={"source": "fish-pets-doc"},
    ),
    Document(
        page_content="Parrots are intelligent birds capable of mimicking human speech.",
        metadata={"source": "bird-pets-doc"},
    ),
    Document(
        page_content="Rabbits are social animals that need plenty of space to hop around.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

# Dry Run: Initialize language model
# - Creates ChatGroq instance with model "Llama3-8b-8192"
# - Uses GROQ_API_KEY from environment for authentication
# - Example: llm is configured to use Llama3-8b-8192 via Groq API
# - Output: llm is a ChatGroq object ready for text generation
llm = ChatGroq(model="Llama3-8b-8192")

# Dry Run: Initialize embeddings
# - Creates HuggingFaceEmbeddings with model "sentence-transformers/all-MiniLM-L6-v2"
# - This model converts text to 384-dimensional vectors for similarity search
# - Example: Text "Dogs are great companions" -> vector [0.1, -0.2, ..., 0.3]
# - Output: embeddings is an object for generating text embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Dry Run: Create vector store
# - Chroma.from_documents converts documents to embeddings and stores them
# - Each document's page_content is embedded using embeddings model
# - Example: "Dogs are great companions..." -> vector [0.1, -0.2, ..., 0.3] stored with metadata
# - Output: vectorstore is a Chroma object containing 5 document embeddings
vectorstore = Chroma.from_documents(documents, embeddings)

# Dry Run: Define context formatting function
# - Takes a list of Document objects and extracts page_content
# - Joins page_content strings with newlines
# - Example: Input [Document(page_content="Dogs are great...")] -> "Dogs are great..."
# - Output: A string containing concatenated document content
def format_context(documents: List[Document]) -> str:
    return "\n".join(doc.page_content for doc in documents)

# Dry Run: Create retriever
# - vectorstore.as_retriever creates a retriever for similarity search
# - search_type="similarity" uses cosine similarity
# - search_kwargs={'k': 1} returns top 1 matching document
# - Example: Query "Tell me about dogs" -> retrieves Document about dogs
# - Output: retriever is configured to fetch 1 document per query
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={'k': 1})

# Dry Run: Define prompt template
# - Creates a ChatPromptTemplate with a human message
# - Template instructs LLM to use only provided context, avoiding external knowledge
# - Placeholders {question} and {context} will be filled dynamically
# - Example: Template with question="Tell me about dogs", context="Dogs are great..."
#   - Output: Formatted prompt string for LLM
message = """
Answer the question using only the provided context. Do not use any external knowledge or make assumptions beyond the context.

Question: {question}

Context: {context}
"""
prompt = ChatPromptTemplate.from_template(message)

# Dry Run: Create RAG chain
# - Combines retriever, context formatting, question passthrough, prompt, and LLM
# - Structure:
#   - "context": retriever | format_context -> Retrieves document, converts to string
#   - "question": RunnablePassthrough() -> Passes query as-is
#   - Pipes to prompt, then to llm
# - Example: Input "Tell me about dogs" -> Chain processes through steps below
rag_chain = {
    "context": retriever | format_context,
    "question": RunnablePassthrough()
} | prompt | llm

# Dry Run: Invoke chain with query "Tell me about dogs"
# - Step 1: Retriever
#   - Query: "Tell me about dogs"
#   - Converts query to embedding: [0.12, -0.18, ..., 0.29]
#   - Searches vectorstore for closest document using cosine similarity
#   - Finds: Document(page_content="Dogs are great companions, known for their loyalty and friendliness.", metadata={"source": "mammal-pets-doc"})
#   - Output: [Document(...)]
# - Step 2: Format Context
#   - Input: [Document(page_content="Dogs are great...")]
#   - format_context: Joins page_content -> "Dogs are great companions, known for their loyalty and friendliness."
#   - Output: String "Dogs are great..."
# - Step 3: Chain Inputs
#   - Creates dictionary:
#     {
#         "context": "Dogs are great companions, known for their loyalty and friendliness.",
#         "question": "Tell me about dogs"
#     }
# - Step 4: Prompt Formatting
#   - Fills template with context and question
#   - Output:
#     """
#     Answer the question using only the provided context. Do not use any external knowledge or make assumptions beyond the context.
#
#     Question: Tell me about dogs
#
#     Context: Dogs are great companions, known for their loyalty and friendliness.
#     """
# - Step 5: LLM Invocation
#   - Prompt sent to ChatGroq (Llama3-8b-8192)
#   - LLM processes prompt, follows instruction to use only context
#   - Generates response: "Dogs are great companions, known for their loyalty and friendliness."
# - Output: response.content contains the LLM's answer
response = rag_chain.invoke("Tell me about dogs")

# Dry Run: Print response
# - response.content is the LLM's output
# - Expected Output: "Dogs are great companions, known for their loyalty and friendliness."
print(response.content)