In [5]:
# Install the notebook's core dependencies. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel; `-q` keeps the download log short.
# NOTE(review): versions are unpinned and `-U` upgrades on every run — consider pinning.
%pip install -q -U langchain langchain-community langchain-core langchain-text-splitters pypdf chromadb gradio


Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.29-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.

Task 1 - Loading the documents

In [6]:
from langchain.document_loaders import PyPDFLoader

In [7]:
# Absolute path of the PDF to load.
pdf_path = "/content/A_Comprehensive_Review_of_Low_Rank_Adaptation_in_Large_Language_Models_for_Efficient_Parameter_Tuning-1.pdf"

# Read the PDF with PyPDFLoader; the length of `documents` is reported below
# as the number of pages loaded.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Preview the first 500 characters of the first loaded entry.
first_page_preview = documents[0].page_content[:500]
print("First page content :\n")
print(first_page_preview)
print(f"\n Total pages loaded: {len(documents)}")

First page content :

A Comprehensive Review of Low-Rank
Adaptation in Large Language Models for
Efficient Parameter Tuning
September 10, 2024
Abstract
Natural Language Processing (NLP) often involves pre-training large
models on extensive datasets and then adapting them for specific tasks
through fine-tuning. However, as these models grow larger, like GPT-3
with 175 billion parameters, fully fine-tuning them becomes computa-
tionally expensive. We propose a novel method called LoRA (Low-Rank
Adaptation) that signifi

 Total pages loaded: 11


**Task 2 ( text splitter )**

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
# Splitter configured with a chunk size of 1000 and an overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Break the loaded documents into chunks.
splitted_docs = text_splitter.split_documents(documents)

# Report how many chunks were produced and preview the first one.
first_chunk_preview = splitted_docs[0].page_content[:500]
print(f" Total chunks created: {len(splitted_docs)}")
print("\n Sample chunk (first one ):\n ")
print(first_chunk_preview)

 Total chunks created: 38

 Sample chunk (first one ):
 
A Comprehensive Review of Low-Rank
Adaptation in Large Language Models for
Efficient Parameter Tuning
September 10, 2024
Abstract
Natural Language Processing (NLP) often involves pre-training large
models on extensive datasets and then adapting them for specific tasks
through fine-tuning. However, as these models grow larger, like GPT-3
with 175 billion parameters, fully fine-tuning them becomes computa-
tionally expensive. We propose a novel method called LoRA (Low-Rank
Adaptation) that signifi


**Task 3 ( generate embeddings )**

In [11]:
# `%pip` installs into the environment of the running kernel.
%pip install -q sentence-transformers

In [12]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model: the all-MiniLM-L6-v2 sentence-transformers checkpoint.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed the text of every chunk.
chunk_texts = [doc.page_content for doc in splitted_docs]
docs_embeddings = embedding_model.embed_documents(chunk_texts)

# Report the number of vectors, their dimensionality, and a short sample.
print(f"Number of embeddings: {len(docs_embeddings)}")
print(f"size of each embedding vector: {len(docs_embeddings[0])}")
print(f"\n Sample embedding (frist 10 numbers):\n {docs_embeddings[0][:10]}")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of embeddings: 38
size of each embedding vector: 384

 Sample embedding (frist 10 numbers):
 [-0.046186283230781555, -0.09967904537916183, 0.015307209454476833, 0.048012156039476395, 0.06438607722520828, 0.039436426013708115, -0.0393260158598423, 0.018974635750055313, 0.0039568329229950905, -0.06212152540683746]


**Task 4 ( Chroma Vector DB )**

In [13]:
# `%pip` installs into the environment of the running kernel.
%pip install -q chromadb

In [14]:
from langchain.vectorstores import Chroma

In [15]:
# Build a Chroma collection named "research_paper" from the chunks, using
# `embedding_model` as the embedding function.
# NOTE(review): re-running this cell in the same session may add the same chunks
# to the collection again — confirm before relying on the count.
vectordb = Chroma.from_documents(
    documents=splitted_docs,
    embedding=embedding_model,
    collection_name="research_paper",
)

# Count stored entries through the public `get()` API rather than reaching into
# the private `_collection` attribute.
print("Number of documents in vector DB : ", len(vectordb.get()["ids"]))

Number of documents in vector DB :  38


**Task 5 (Retriever)**

In [17]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [18]:
# Create a similarity-search retriever over the vector DB that returns 3 chunks.
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Test: fetch the top 3 chunks for a query.
query = "what this paper is talking about ?"
# `invoke` is the supported entry point; `get_relevant_documents` is deprecated
# and emits a deprecation warning.
retrieved_docs = retriever.invoke(query)

# Show the first 500 characters of each retrieved chunk.
for i, doc in enumerate(retrieved_docs):
    print(f"--- chunk {i+1} ---")
    print(doc.page_content[:500])
    print("\n")

--- chunk 1 ---
ing it as a low-rank decomposition, W0 + ∆W = W0 + BA, where B ∈ Rd×r,
A ∈ Rr×k, and the rank r ≪ min(d, k). During training, W0 is fixed, and A
and B are the trainable parameters. Both W0 and ∆W = BA are multiplied
with the input, and their respective outputs are summed element-wise. Thus,
for h = W0x, our updated forward pass becomes:
h = W0x + ∆W x= W0x + BAx
We illustrate this reparametrization in Figure 1. We initializeA with random
Gaussian values and set B to zero, meaning ∆ W = BA is zer


--- chunk 2 ---
The principles outlined here apply generally to dense layers in neural networks,
although we focus on specific weights in Transformer language models, as these
models serve as the central example in our experiments.
4.1 Low-Rank Parameterized Update Matrices
Neural networks contain numerous dense layers that perform matrix multipli-
cation, and the weight matrices in these layers typically have a full rank. When
adapting to a particular task, it shows that pre-

  retrieved_docs = retriever.get_relevant_documents(query)


**Task 6 ( Gradio QA Bot )**



In [23]:
# `%pip` installs into the environment of the running kernel.
%pip install -q transformers sentence-transformers gradio langchain


In [24]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Checkpoint used for answer generation.
model_name = "google/flan-t5-large"

# Load the tokenizer and the seq2seq model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Text-to-text generation pipeline built from the loaded model and tokenizer.
hf_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)

# Expose the pipeline through LangChain's LLM wrapper.
llm = HuggingFacePipeline(pipeline=hf_pipeline)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=hf_pipeline)


In [25]:
from langchain.chains import RetrievalQA

# RetrievalQA chain that answers with `llm` over chunks fetched by `retriever`.
# Source documents are returned with each result so the chunks used can be shown.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)


In [27]:
def answer_question_safe(question):
    """Answer a question using chunks fetched by the notebook's retriever.

    The retrieved chunks are joined into a single context string, truncated,
    and placed in a prompt that is sent to the notebook-level ``llm``.

    Args:
        question: The question text (as entered in the Gradio textbox).

    Returns:
        The value returned by ``llm`` for the constructed prompt, or a fixed
        message when the question is blank or no chunks are retrieved.
    """
    # A blank question gives the retriever nothing meaningful to search for.
    if not question or not question.strip():
        return "Please enter a question."

    # `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
    docs = retriever.invoke(question)
    if not docs:
        return "No relevant information found in the PDF."

    # Combine chunks but cap the total length to avoid model overflow.
    max_context_chars = 3000
    context = " ".join(doc.page_content for doc in docs)[:max_context_chars]

    # Construct prompt for Hugging Face model
    prompt = f"Answer the question based on the context below.\n\nContext: {context}\n\nQuestion: {question}\nAnswer:"

    # Calling the LLM object directly (`llm(prompt)`) is deprecated; use `invoke`.
    return llm.invoke(prompt)


In [28]:
# `gr` is not imported in any other visible cell, so import it here to keep the
# notebook runnable from a fresh kernel.
import gradio as gr

# Single-textbox UI: the question is passed to `answer_question_safe` and the
# returned text is displayed.
iface = gr.Interface(
    fn=answer_question_safe,
    inputs=gr.Textbox(lines=2, placeholder="Ask a question..."),
    outputs="text",
    title="Research Paper QA Bot",
    description="Ask questions from your uploaded PDF",
)

iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://474163e0b8af547c07.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


