In [None]:
# 🔹 Step 1: Load PDF and Split into Text Chunks

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Location of the source PDF, relative to the notebook's working directory
doc_path = "my_pdf.pdf"

# Build the loader first, then read the PDF into LangChain documents
loader = PyPDFLoader(file_path=doc_path)
docs = loader.load()

In [None]:
# Chunking setup: pieces of up to 200 characters, with 30 characters
# shared between neighbouring pieces.
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=30)

# Break every loaded document into chunks
chunks = splitter.split_documents(docs)

# Preview the first two chunks
chunks[:2]


In [None]:
# ⚙️ Embeddings and Vector Store

In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file (if present) before reading them
load_dotenv()

# Hugging Face API token; stop right away when it is missing or empty
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None or HF_TOKEN == "":
    raise ValueError("HF_TOKEN not found in .env file or environment")

In [None]:
# 🔤 Step 4: Initialize HuggingFace Embeddings
# HuggingFaceInferenceAPIEmbeddings is provided by langchain_community, not by
# langchain_huggingface; importing it from the latter raises ImportError.
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# Remote embedding client: texts are embedded through the Hugging Face
# Inference API with the given model, authenticated with HF_TOKEN.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN,
    model_name="BAAI/bge-base-en-v1.5",
)


In [None]:
# 💾 Step 5: Create a Chroma vector store
from langchain_chroma import Chroma

# Embed every chunk and index the vectors in a Chroma collection
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

# Dense retriever returning the 3 closest chunks per query
top_k_search = {"k": 3}
vectorstore_retriever = vectorstore.as_retriever(search_kwargs=top_k_search)
vectorstore_retriever


In [None]:
# 🔍 Hybrid Search: Combining Vector and Keyword Retrieval

In [None]:
# 📦 Step 6: Install and import BM25 for keyword-based retrieval
# %pip install rank_bm25

from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# Sparse (keyword) retriever built over the same chunks; returns 3 documents per query
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k = 3

# Hybrid retriever: weights follow the order of `retrievers`
# (0.3 for the vector retriever, 0.7 for the BM25 keyword retriever)
ensemble_retriever = EnsembleRetriever(
    weights=[0.3, 0.7],
    retrievers=[vectorstore_retriever, keyword_retriever],
)


In [None]:
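# 🧮 Hybrid search formula reference:
# hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score
# Here alpha is the weight of the dense (vector) retriever; the ensemble retriever in this
# notebook uses weights=[0.3, 0.7] for [vector, keyword], i.e. alpha = 0.3.
# Note: LangChain's EnsembleRetriever applies these weights through Reciprocal Rank Fusion
# over the result ranks rather than by mixing raw scores, so the formula is conceptual.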

In [None]:
# 🧩 Model Setup — Loading a 4-bit Quantized Language Model

In [None]:
# 📦 Step 7: Install dependencies for quantized model loading
# %pip install bitsandbytes
# %pip install accelerate

In [None]:
# ⚡ Step 8: Load Zephyr-7B-Beta model in 4-bit quantized mode
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from langchain_huggingface import HuggingFacePipeline

# Function to load a quantized model efficiently
def load_quantized_model(model_name: str):
    """
    Return a causal language model loaded with 4-bit quantization.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        The model produced by ``AutoModelForCausalLM.from_pretrained``.
    """
    # NF4 4-bit weights with double quantization; computation runs in bfloat16
    quantization_settings = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    quantization = BitsAndBytesConfig(**quantization_settings)

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization,
    )


In [None]:
# Function to initialize a tokenizer
def initialize_tokenizer(model_name: str):
    """
    Build the tokenizer that belongs to ``model_name``.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        The loaded tokenizer with ``bos_token_id`` forced to 1.
    """
    tok = AutoTokenizer.from_pretrained(
        model_name,
        return_token_type_ids=False,
    )
    # Pin the beginning-of-sentence token id explicitly
    tok.bos_token_id = 1
    return tok


In [None]:
# 🔧 Step 9: Initialize model and tokenizer
# One checkpoint name drives both loaders so tokenizer and model always match
model_name = "HuggingFaceH4/zephyr-7b-beta"

# Tokenizer first, then the 4-bit quantized model (evaluated left to right)
tokenizer, model = (
    initialize_tokenizer(model_name),
    load_quantized_model(model_name),
)


In [None]:
# 🧠 Building the Text Generation Pipeline

In [None]:
# ⚙️ Step 10: Create a text generation pipeline
# The result gets its own name on purpose: assigning it to `pipeline` would
# overwrite the imported `transformers.pipeline` factory, and re-running this
# cell would then call the Pipeline object with a task string and fail.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,          # upper bound on total sequence length
    do_sample=True,           # sampled decoding, so answers vary between runs
    top_k=5,                  # sample only from the 5 most likely next tokens
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Wrap the pipeline with LangChain's LLM interface
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [None]:
# 🔗 Building Retrieval-Augmented QA Chains

In [None]:
# 🔍 Step 11: Create normal (vector-based) and hybrid RAG chains
from langchain.chains import RetrievalQA

# Two "stuff" RetrievalQA chains that share the same LLM and differ only in
# their retriever:
#   normal_chain -> vector-only retrieval
#   hybrid_chain -> dense + sparse (ensemble) retrieval
normal_chain, hybrid_chain = (
    RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=chain_retriever)
    for chain_retriever in (vectorstore_retriever, ensemble_retriever)
)


In [None]:
# 💬 Testing the RAG System

In [None]:
# 🧩 Step 12: Query using vector-based retriever
# Run the vector-only chain, then print just the generated answer text
response1 = normal_chain.invoke(
    "What is Abstractive Question Answering?"
)
answer_text = response1.get("result")
print(answer_text)


In [None]:
# 🧠 Step 13: Query using hybrid (vector + keyword) retriever
# Same question as the vector-only run, so the two answers can be compared
response2 = hybrid_chain.invoke(
    "What is Abstractive Question Answering?"
)
hybrid_answer_text = response2.get("result")
print(hybrid_answer_text)
