In [1]:
!pip uninstall -y langchain langchain-openai langchain-community chromadb
!pip install -q langchain langchain-openai langchain-community chromadb pypdf rank_bm25

Found existing installation: langchain 1.2.10
Uninstalling langchain-1.2.10:
  Successfully uninstalled langchain-1.2.10
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m612.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m492.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m307.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m168.6 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.5/21.5 MB[0m [31m76.6 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.3/331.3 kB[0m [31m56.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m66.6 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

### **Hybrid Search in RAG**
(Vector Search + Keyword Search using LangChain + ChromaDB)

Hybrid search combines:

- Semantic similarity (vector embeddings)
- Keyword-based search (BM25 / lexical matching)

This improves retrieval quality because:

- Vector search captures meaning
- Keyword search captures exact terms
- Together → higher recall + better precision

In [2]:
import os
from getpass import getpass

from openai import OpenAI

# Never hardcode the key (and never overwrite a configured one with an empty
# string): reuse OPENAI_API_KEY if it is already set, otherwise prompt for it
# without echoing the value into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# The client reads OPENAI_API_KEY from the environment.
client = OpenAI()

# Smoke test: one short chat completion to confirm the key and model work
# before building the retrieval pipeline.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": "Give me a short definition of DNN."}
    ]
)

print(response.choices[0].message.content)

A Deep Neural Network (DNN) is a type of artificial neural network with multiple layers of interconnected nodes or neurons, designed to model complex patterns and relationships in data. DNNs form the foundation of many modern machine learning applications, particularly in areas like image and speech recognition, natural language processing, and more.


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [4]:
from google.colab import files

# Open the Colab upload widget; the result maps each uploaded filename to its
# contents.
uploaded = files.upload()

# Fail with a clear message instead of a bare IndexError when the upload
# dialog was cancelled or no file was selected.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a PDF.")

# Only the first uploaded file is used.
pdf_path = next(iter(uploaded))
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split the loaded documents into overlapping chunks so that text cut at a
# chunk boundary still appears intact in a neighbouring chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200
)

chunks = text_splitter.split_documents(documents)

Saving LangChain In Detail.pdf to LangChain In Detail.pdf


In [5]:
# Embedding model used to vectorise the chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed the chunks and store them in a Chroma collection, with
# ./chroma_db given as the persistence directory.
vectorstore = Chroma.from_documents(
    persist_directory="./chroma_db",
    embedding=embeddings,
    documents=chunks,
)

# Semantic retriever over the vector store, configured with k=4.
vector_retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

In [6]:
# Keyword (BM25) retriever built over the same chunks; k=4 mirrors the
# setting used for the vector retriever.
bm25_retriever = BM25Retriever.from_documents(chunks, k=4)

In [7]:
def hybrid_retrieval(query):
    """Retrieve documents for ``query`` from both retrievers and merge them.

    The vector retriever's results come first, followed by the BM25
    retriever's results. Documents sharing the same ``page_content`` are
    collapsed into a single entry: it keeps the position of the first
    occurrence and the document object of the last occurrence.

    Args:
        query: The search string passed to both retrievers.

    Returns:
        A list of documents with unique ``page_content``.
    """
    semantic_hits = vector_retriever.invoke(query)
    lexical_hits = bm25_retriever.invoke(query)

    # Keyed by text: re-assigning an existing key keeps its original slot
    # in the dict while swapping in the later document.
    by_text = {}
    for hit in [*semantic_hits, *lexical_hits]:
        by_text[hit.page_content] = hit

    return list(by_text.values())

In [8]:
# Prompt text for the answer step. {context} receives the retrieved chunks
# and {question} the user's query; the model is told to answer only from
# the supplied context.
RAG_PROMPT_TEMPLATE = """
You are an expert assistant.

Use ONLY the context below to answer.
If not found, say you don't know.

Context:
{context}

Question:
{question}

Answer:
"""

prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

In [9]:
# Chat model that writes the final answer; temperature is set to 0.
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

In [10]:
def format_docs(docs):
    """Join the ``page_content`` of each document into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The texts separated by a blank line; an empty string when ``docs``
        is empty.
    """
    texts = [doc.page_content for doc in docs]
    return "\n\n".join(texts)

def _retrieve_context(question):
    """Run hybrid retrieval for the question and format the hits as text."""
    return format_docs(hybrid_retrieval(question))


# Pipeline: the incoming question is fanned out into the prompt variables
# (retrieved context + the question itself), rendered by the prompt, sent to
# the model, and the reply is parsed to a plain string.
prompt_inputs = {
    "context": _retrieve_context,
    "question": RunnablePassthrough(),
}

hybrid_chain = prompt_inputs | prompt | llm | StrOutputParser()

In [12]:
# Run the hybrid chain end to end on a sample question and print the answer.
query = "Explain the key points."
response = hybrid_chain.invoke(query)
print(response)

The key points from the provided context are as follows:

1. **Prompt Systems**: The context discusses the importance of prompts in guiding language models (LLMs) to produce desired outputs. Prompts can be dynamic, role-based, or few-shot, and they significantly influence the model's responses.

2. **LangChain Components**: The text outlines various components of LangChain, including:
   - **Chains**: Mechanisms that structure LLM tasks into pipelines, enabling the creation of agents that can perform actions beyond simple conversation.
   - **Indexes**: Allow LLM applications to connect with external knowledge sources, enhancing their ability to provide informed responses.
   - **Memory**: Addresses the stateless nature of LLMs by enabling them to remember previous interactions, which is crucial for maintaining context in conversations.

3. **Memory Types**: LangChain offers different memory mechanisms, such as:
   - **ConversationBufferMemory**: Stores the entire conversation history.