STEP 1: Install required libraries

In [1]:
!pip install -q langchain langchain-core langchain-community \
langchain-huggingface langchain-text-splitters \
pypdf sentence_transformers chromadb \
huggingface_hub transformers accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.6/329.6 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m71.8 MB/s[0m eta [36m0:00:0

STEP 2: Import libraries

In [2]:
from google.colab import files
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import os



STEP 3: Upload document (PDF or TXT)


In [3]:
uploaded = files.upload()

filename = list(uploaded.keys())[0]

if filename.endswith(".pdf"):
    loader = PyPDFLoader(filename)
elif filename.endswith(".txt"):
    loader = TextLoader(filename)
else:
    raise ValueError("Only PDF or TXT files supported")

documents = loader.load()
print(f"Loaded {len(documents)} document(s)")

Saving Major Project phase-2 Documentation-new-1.pdf to Major Project phase-2 Documentation-new-1.pdf
Loaded 53 document(s)


STEP 4: Split document into chunks

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150
)

texts = text_splitter.split_documents(documents)
print(f"Split into {len(texts)} chunks")

Split into 108 chunks


STEP 5: Create embeddings + vector database


In [5]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings
)

retriever = vectordb.as_retriever()
print("Vector DB ready")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector DB ready


STEP 6: Load local LLM (FLAN-T5)

In [6]:
model_id = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256
)

llm = HuggingFacePipeline(pipeline=pipe)
print("LLM loaded")


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


LLM loaded


STEP 7: Create RAG chain


In [7]:
template = """Use the following context to answer the question.

Context:
{context}

Question:
{question}

Answer:"""

prompt = PromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain ready")


RAG chain ready


STEP 8: Ask user questions

In [8]:
while True:
    query = input("\nAsk a question (type 'exit' to stop): ")

    if query.lower() == "exit":
        break

    answer = rag_chain.invoke(query)
    print("\nAnswer:", answer)



Ask a question (type 'exit' to stop): what is there in document?


Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors



Answer: iv.

Ask a question (type 'exit' to stop): project name?

Answer: GNITS

Ask a question (type 'exit' to stop): what is DAPP?

Answer: a digital applications or programs that run on a decentralized network, rather than a single computer or server

Ask a question (type 'exit' to stop): full form of DAPP?

Answer: Decentralized Application

Ask a question (type 'exit' to stop): teechnology used?

Answer: [iii]

Ask a question (type 'exit' to stop): ganache?

Answer: a private Ethereum blockchain environment that allows to you emulate the Ethereum blockchain so that you can interact with smart contracts in your own private blockchain

Ask a question (type 'exit' to stop): eit

Answer: is a decentralized blockchain platform that establishes a peer -to- peer network that securely executes and verifies application code called smart contracts

Ask a question (type 'exit' to stop): exit
