In [1]:
!pip install langchain langchain_community langchain_huggingface faiss-cpu pypdf
!pip install huggingface_hub
!pip install streamlit






In [2]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [3]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Folder on the mounted Drive that holds the source PDFs.
DATA_PATH = "/content/drive/MyDrive/dataveda/data"


def load_pdf_files(data):
    """Load the PDFs found in a directory.

    Args:
        data: Path of the directory to scan. Files are selected with the
            ``*.pdf`` glob and each one is read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()


documents = load_pdf_files(data=DATA_PATH)

In [5]:
# Report how many entries the loader returned (labelled as PDF pages).
loaded_page_count = len(documents)
print("Length of PDF pages: ", loaded_page_count)


Length of PDF pages:  5515


In [6]:

# Step 2: Create Chunks
def create_chunks(extracted_data):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: The documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` using
        ``chunk_size=500`` and ``chunk_overlap=50``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_documents(extracted_data)


text_chunks = create_chunks(extracted_data=documents)
print("Length of Text Chunks: ", len(text_chunks))

Length of Text Chunks:  35240


In [7]:

# Step 3: Create Vector Embeddings

def get_embedding_model():
    """Return a ``HuggingFaceEmbeddings`` built for the all-MiniLM-L6-v2 model."""
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


embedding_model = get_embedding_model()



# Step 4: Store embeddings in FAISS
# Directory name the index is saved under.
DB_FAISS_PATH = "vectorestore"

# Build the FAISS store from the chunks using the embedding model, then save it.
db = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)
db.save_local(folder_path=DB_FAISS_PATH)

In [8]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: it ends up in version control and
# in every shared copy. Any token that was ever committed here must be revoked
# from the Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable; if it is not set, ask
# for it interactively (getpass does not echo the value into the cell output).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [9]:
!pip install transformers accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline





In [10]:
# Download the files under original/ of the Llama-3.2-3B-Instruct repo into a
# local Llama-3.2-3B-Instruct directory.
# NOTE(review): the recorded run of this cell failed with
# "403 Client Error: Forbidden" while fetching original/consolidated.00.pth.
# Presumably the logged-in account has not been granted access to this
# repository - confirm on the model page.
# NOTE(review): none of the cells shown read the local Llama-3.2-3B-Instruct
# directory; the QA chain goes through HuggingFaceEndpoint with the repo id.
# Confirm whether this download is still needed.
!huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --include "original/*" --local-dir Llama-3.2-3B-Instruct

Fetching 4 files:   0% 0/4 [00:00<?, ?it/s]Fetching 4 files:   0% 0/4 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_http.py", line 406, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.11/dist-packages/requests/models.py", line 1024, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/0cb88a4f764b7a12671c53f0838cd831a0843b95/original/consolidated.00.pth

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cli", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py", line 57, in main
    service.run()
  File "/usr/local/lib/python3.11/dist-

In [11]:
import os

from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

## Uncomment the following lines if you're not using pipenv as your virtual environment manager
#from dotenv import load_dotenv, find_dotenv
#load_dotenv(find_dotenv())



In [12]:

# Step 1: Setup LLM (Llama-3.2-3B-Instruct with HuggingFace)
# Hub repository id of the model to query.
HUGGINGFACE_REPO_ID = "meta-llama/Llama-3.2-3B-Instruct"
# Access token taken from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

In [13]:

def load_llm(huggingface_repo_id, temperature=0.5, max_new_tokens=512):
    """Build a HuggingFaceEndpoint text-generation LLM for a Hub repository.

    Args:
        huggingface_repo_id: Hub repository id of the model to query.
        temperature: Sampling temperature passed to the endpoint (default 0.5).
        max_new_tokens: Upper bound on generated tokens (default 512).

    Returns:
        The configured ``HuggingFaceEndpoint`` instance.
    """
    # The access token and the generation limit are dedicated
    # HuggingFaceEndpoint fields. Entries placed in `model_kwargs` are forwarded
    # as generation parameters instead, so the token must not travel there and
    # the limit must be an integer `max_new_tokens`, not a string "max_length".
    # HF_TOKEN may be None; the token field is optional.
    return HuggingFaceEndpoint(
        repo_id=huggingface_repo_id,
        task="text-generation",
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        huggingfacehub_api_token=HF_TOKEN,
    )


In [14]:


# Step 2: Connect LLM with FAISS and Create chain

# Prompt text for the QA chain. It contains two placeholders, {context} and
# {question}, and instructs the model to answer only from the given context and
# to say it does not know rather than invent an answer.
CUSTOM_PROMPT_TEMPLATE = """
Use the pieces of information provided in the context to answer user's question.
If you dont know the answer, just say that you dont know, dont try to make up an answer.
Dont provide anything out of the given context

Context: {context}
Question: {question}

Start the answer directly. No small talk please.
"""

def set_custom_prompt(custom_prompt_template):
    """Wrap a template string in a ``PromptTemplate``.

    The template is declared with the input variables ``context`` and
    ``question``.
    """
    expected_inputs = ["context", "question"]
    return PromptTemplate(
        template=custom_prompt_template,
        input_variables=expected_inputs,
    )

# Load Database
DB_FAISS_PATH = "vectorestore"
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# SECURITY: allow_dangerous_deserialization=True lets load_local unpickle the
# saved store. Only load index directories you created yourself; never point
# this at files from an untrusted source.
db = FAISS.load_local(
    folder_path=DB_FAISS_PATH,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)




In [15]:
# Create QA chain
# Assemble the pieces first (same order as they were evaluated inline).
qa_llm = load_llm(HUGGINGFACE_REPO_ID)
qa_retriever = db.as_retriever(search_kwargs={'k':3})  # k=3 retrieved chunks per query
qa_prompt = set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)

# "stuff" chain type; source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=qa_llm,
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={'prompt': qa_prompt},
)

In [29]:
# Read the question to ask the QA chain from the notebook's input prompt.
user_query=input("Write Query Here: ")


Write Query Here: athrveda meaning 


In [30]:
# Run the retrieval-QA chain on the user's question. The chain is invoked with
# the question under the 'query' key; the response is indexed by "result" and
# "source_documents" when displayed.
chain_inputs = {'query': user_query}
response = qa_chain.invoke(chain_inputs)



In [31]:
# Print the answer followed by the retrieved source documents.
for label, field in (("RESULT: ", "result"), ("SOURCE DOCUMENTS: ", "source_documents")):
    print(label, response[field])

RESULT:  Atharveda
SOURCE DOCUMENTS:  [Document(id='f3b5fbf3-862d-45ee-aadb-14c224a255fa', metadata={'producer': 'iText® 5.5.2 ©2000-2014 iText Group NV (ONLINE PDF SERVICES; licensed version)', 'creator': 'PDFMerge! (http://www.pdfmerge.com)', 'creationdate': '2015-01-24T20:53:01-05:00', 'moddate': '2018-02-01T21:25:35-08:00', 'source': '/content/drive/MyDrive/dataveda/data/Atharva Veda Vol 2 of 2.pdf', 'total_pages': 2428, 'page': 0, 'page_label': '1'}, page_content='|| AUM || \nATHARVA-VEDA \nअथर्वर्ेद: \nVol. I \n \n(With Original Sanskrit Text, Transliteration & \nLucid English Translation in the Aarsh Tradition \nof Maharshi Yaska and Swami Dayanan\nda) \n \nEnglish translation by \nDr. Tulsi Ram M.A., Ph.D. (London, U.K.) \n(Professor, Administrator, Researcher and Writer) \n \n \n \n \n \nExclusive Digital Distributor: \nAGNIVEER'), Document(id='e44c5ab3-8c1e-4691-96d7-0782ba6ff869', metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-03-1

In [32]:
!pip install gradio



In [33]:
import gradio as gr

def qa_interface(user_query):
    """Answer one question through the QA chain and format it as text.

    Args:
        user_query: The question text entered in the UI.

    Returns:
        A string holding the chain's answer followed by the string form of the
        retrieved source documents.
    """
    answer = qa_chain.invoke({'query': user_query})
    return "RESULT: {}\n\nSOURCE DOCUMENTS: {}".format(
        answer["result"], answer["source_documents"]
    )

# Text-in / text-out Gradio UI around qa_interface, then start serving it.
iface = gr.Interface(
    fn=qa_interface,
    inputs="text",
    outputs="text",
)
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3628f93df5362e7635.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


