## Simple RAG implementation using LangChain and HuggingFace

In [2]:
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import pipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

  from .autonotebook import tqdm as notebook_tqdm


#### Make global variables

In [3]:
# PDF_URL = "https://www.upl-ltd.com/images/people/downloads/Leave-Policy-India.pdf"
PDF_URL = "/Users/anupeshkumar.verma/Downloads/Personal/Anupesh_Resume.pdf"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"                 # "all-MiniLM-L6-v2"
LLM_MODEL_ID = "google/flan-t5-base"                                            # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

### Load and split PDF into chunks

In [None]:
def loadAndSplitPDF():
    documents = PyPDFLoader(PDF_URL).load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    docs = text_splitter.split_documents(documents)
    return docs

### Create vector embeddings and store in FAISS vector store

In [5]:
def createStoreVectorEmbeddings():
    docs = loadAndSplitPDF()
    embeddings = HuggingFaceEmbeddings(model_name = EMBEDDING_MODEL_NAME)
    vector_store = FAISS.from_documents(docs, embeddings)
    return vector_store

### Select model and create pipeline and huggingface wrapper

In [6]:
def loadLLM():
    pipe = pipeline(
        "text2text-generation",
        model=LLM_MODEL_ID,
        tokenizer=LLM_MODEL_ID,
        max_new_tokens=256,     # Reduced max_new_tokens for conciseness
        # temperature=0.1,       # Lower temperature for less randomness and more factual answers
        do_sample=False,
        device = -1             # set to -1 if no GPU available
    )
    rag_llm = HuggingFacePipeline(pipeline=pipe)
    return rag_llm


### Create a RetrievalQA chain combining the vector store retriever and the LLM
- k indicates the number of top documents to retrieve that are most similar to the query. In this case, k=2 means the retriever will return the 2 most relevant documents from the vector store.

In [7]:
def createRetrievalQAChain():
    vector_store = createStoreVectorEmbeddings()
    rag_llm = loadLLM()

    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    rag_chain = RetrievalQA.from_chain_type(
        llm=rag_llm,
        chain_type="stuff",  # simple concatenation of retrieved docs
        retriever=retriever,
        return_source_documents=True,
    )
    return rag_chain

In [8]:
def generateResponse(query):
    rag_chain = createRetrievalQAChain()
    response = rag_chain.invoke({"query": query})
    print(f"Question: {query}\nAnswer: {response["result"]}")

### Ask queries

In [9]:
# Query RAG
query = "What types of leaves are covered in this policy?"
query = "What is the current role of Anupesh ?"
generateResponse("Hi")
 

Device set to use cpu


Question: Hi
Answer: Hi Anupesh Kumar Verma Anupesh Kumar Verma Anupesh Kumar Verma121@gmail.com Anupesh Kumar Verma121@gmail.com /ne+91 9794371985 7Portfolio | /nednLinkedIn | /gtbGitHub Education • Motilal Nehru National Institute of Technology Allahabad (NIT Allahabad) 2024 Bachelor of Technology (B.Tech) CGPA: 7.9/10 Electronics and Communication Engineering Experience • Data Engineer Trainee | Personify Health Jan 2025 - Present  Collaboration: Working closely with data analysts and other stakeholders to understand data requirements and deliver actionable insights.
