STEP 1: Install required libraries

In [None]:
!pip install -q langchain langchain-core langchain-community \
langchain-huggingface langchain-text-splitters \
pypdf sentence_transformers chromadb \
huggingface_hub transformers accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.6/330.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.4/566.4 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

STEP 2: Import libraries

In [None]:
from google.colab import files
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import os



STEP 3: Upload document (PDF or TXT)


In [8]:
uploaded = files.upload()

filename = list(uploaded.keys())[0]

if filename.endswith(".pdf"):
    loader = PyPDFLoader(filename)
elif filename.endswith(".txt"):
    loader = TextLoader(filename)
else:
    raise ValueError("Only PDF or TXT files supported")

documents = loader.load()
print(f"Loaded {len(documents)} document(s)")

Saving Major Project phase-2 Documentation-new-1.pdf to Major Project phase-2 Documentation-new-1.pdf
Loaded 53 document(s)


STEP 4: Split document into chunks

Eploring Fixed Size chunking

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150
)

texts = text_splitter.split_documents(documents)
print(f"Split into {len(texts)} chunks")

Split into 108 chunks


### Exploring Semantic Chunking

Let's try a different chunking strategy focusing on semantic coherence.

In [10]:
from langchain_text_splitters import SentenceTransformersTokenTextSplitter

# Initialize the semantic text splitter
semantic_text_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=0,
    tokens_per_chunk=256 # Adjust as needed, based on typical sentence length and model limits
)

# Split the documents using the semantic splitter
semantic_texts = semantic_text_splitter.split_documents(documents)

print(f"Split into {len(semantic_texts)} semantic chunks")

# Optionally, you can now create a new vector database with these semantic chunks
# semantic_vectordb = Chroma.from_documents(
#     documents=semantic_texts,
#     embedding=embeddings
# )
# semantic_retriever = semantic_vectordb.as_retriever()
# print("Semantic Vector DB ready")

Split into 95 semantic chunks


STEP 5: Create embeddings + vector database


In [17]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vectordb = Chroma.from_documents(
    documents=semantic_texts, #changethe variable name associated with which chunking method u have used
    embedding=embeddings
)

retriever = vectordb.as_retriever()
print("Vector DB ready with semantic chunks")

Vector DB ready with semantic chunks


STEP 6: Load local LLM (FLAN-T5)

In [19]:
model_id = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256
)

llm = HuggingFacePipeline(pipeline=pipe)
print("LLM loaded")


Device set to use cpu


LLM loaded


STEP 7: Create RAG chain


In [20]:
template = """Use the following context to answer the question. If the answer is not in the context, say "I don't know." Don't try to make up an answer.

Context:
{context}

Question:
{question}

Answer:"""

prompt = PromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain ready with updated prompt")

RAG chain ready with updated prompt


STEP 8: Ask user questions

In [21]:
while True:
    query = input("\nAsk a question (type 'exit' to stop): ")

    if query.lower() == "exit":
        break

    answer = rag_chain.invoke(query)
    print("\nAnswer:", answer)



Ask a question (type 'exit' to stop): what isuse of metamask?


Token indices sequence length is longer than the specified maximum sequence length for this model (726 > 512). Running this sequence through the model will result in indexing errors



Answer: It is a cryptocurrency wallet that enables users to store ether and other ethereum request for comments 20 tokens. it is a browser plugin that serves as an ethereum wallet, and is installed like any other browser plugin. it can also be used to interact with decentralized applications ( dapp ). metamask is a reliable tool with an easy - to - navigate user interface and consistent customer support. metamask helps you to access your funds without any hassle, as it does not require you to manage private keys at each and every transaction that you undertake. instead, it automatically signs all transactions and pops up a confirmation window when you mak e a payment. however, you have to remember a set of words that will prove your identity. this wallet is widely used by people who want to make secure cryptocurrency payments. it is safe to use because it works with the ethereum blockchain. this eliminates the need to download entire blockchains, minimizes the risk of malware, and pro

### Understanding Semantic Chunking Methods

Semantic chunking aims to split text into coherent, meaningful segments, ensuring that each chunk contains a complete thought or idea. This is often preferred over fixed-size chunking (like `RecursiveCharacterTextSplitter`) because it helps preserve the context that an LLM needs to understand and answer questions accurately.

Here are some common semantic chunking methods:

1.  **Sentence-based Chunking:**
    *   **How it splits:** The simplest form, where each sentence forms a chunk. It's granular but might break apart ideas that span multiple sentences if the context window is very small.
    *   **Pros:** Preserves sentence integrity, easy to implement.
    *   **Cons:** Can create many small chunks, potentially losing broader context.

2.  **Paragraph-based Chunking:**
    *   **How it splits:** Splits documents into chunks based on paragraph breaks. This often results in more semantically coherent chunks than sentence-based.
    *   **Pros:** Generally good at keeping related ideas together.
    *   **Cons:** Paragraphs can be very long or very short, leading to uneven chunk sizes.

3.  **Recursive Splitting with Semantic Boundaries:**
    *   **How it splits:** This method is more advanced. It starts with large chunks (e.g., by paragraph or header) and then recursively splits them into smaller pieces if they exceed a certain size. The key difference from plain `RecursiveCharacterTextSplitter` is that it might use language models or embedding similarity to identify natural break points (e.g., points of lowest semantic similarity) within a larger chunk.
    *   **Pros:** Flexible, can adapt to different document structures, aims for optimal semantic coherence.
    *   **Cons:** More complex to implement, can be computationally intensive if involving LLMs or embeddings for splitting decisions.

4.  **Token-based Splitting with Semantic Awareness (e.g., `SentenceTransformersTokenTextSplitter`):**
    *   **How it splits:** This method, like the one we used (`SentenceTransformersTokenTextSplitter`), is designed to create chunks that respect token limits while trying to maintain semantic integrity. It often leverages underlying Sentence Transformer models to understand sentence boundaries and create chunks based on a maximum number of tokens, prioritizing full sentences or sub-sentences.
    *   **Pros:** Efficiently handles token limits, good balance between chunk size and semantic coherence, directly compatible with embedding models.
    *   **Cons:** Still primarily driven by token count, so it might not always perfectly capture very long, complex semantic units across strict token boundaries.

### How `SentenceTransformersTokenTextSplitter` Splits Our Data

In our notebook, the `SentenceTransformersTokenTextSplitter` was configured with `tokens_per_chunk=256`. This means it attempts to create chunks that are roughly 256 tokens long, while trying to respect sentence boundaries. Let's inspect some of these semantic chunks to see how they differ from the fixed-size chunks.

In [22]:
# Print the first 3 semantic chunks
print("First 3 Semantic Chunks (using SentenceTransformersTokenTextSplitter):")
for i, chunk in enumerate(semantic_texts[:3]):
    print(f"\n--- Semantic Chunk {i+1} ---")
    print(f"Length: {len(chunk.page_content.split())} words / {len(chunk.page_content)} characters")
    print(chunk.page_content)

# For comparison, print the first 3 fixed-size chunks (if available)
if 'texts' in locals() and len(texts) > 0:
    print("\n\nFirst 3 Fixed-Size Chunks (using RecursiveCharacterTextSplitter):")
    for i, chunk in enumerate(texts[:3]):
        print(f"\n--- Fixed-Size Chunk {i+1} ---")
        print(f"Length: {len(chunk.page_content.split())} words / {len(chunk.page_content)} characters")
        print(chunk.page_content)
else:
    print("\nFixed-size chunks (variable 'texts') not available for comparison.")

First 3 Semantic Chunks (using SentenceTransformersTokenTextSplitter):

--- Semantic Chunk 1 ---
Length: 115 words / 757 characters
i a project phase - ii report on decentralized smart contract certificate system using ethereum blockchain technology submitted to the department of computer science & engineering, gnits in the partial fulfillment of the academic requirement for the award of b. tech ( cse ) under jntuh by mothukuri ashritha ( 19251a0598 ) saieni alankruthi ( 19251a05b0 ) thummanapalli preethi ( 19251a05b8 ) akhila athinarapu ( 20255a0507 ) under the guidance of dr. raghavender k. v. associate professor, department of cse department of computer science and engineering g. narayanamma institute of technology & science ( autonomous ) ( for women ) shaikpet, hyderabad - 500104. affiliated to jawaharlal nehru technological university hyderabad hyderabad 500085 may, 2023

--- Semantic Chunk 2 ---
Length: 158 words / 1036 characters
ii g. narayanamma institute of technology & scie