# Data Preparation


*   Install required packages
*   Read text from PDF documents
*   Divide PDF documents into smaller chunks









In [None]:
!pip install langchain --quiet
!pip install langchain-community --quiet
!pip install python-dotenv --quiet

In [None]:
!pip install pypdf --quiet
!pip install pypdf2 --quiet

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from dotenv import load_dotenv
import os
import pickle

In [None]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Define the folder path in Google Drive
folder_path = '/content/drive/My Drive/Book_Class_10/'

In [None]:
def is_pdf(file_path):
    """Return True when ``file_path`` is an existing regular file with a PDF extension.

    The extension comparison is case-insensitive, so ``.pdf``, ``.PDF`` and
    mixed-case variants such as ``.Pdf`` are all accepted.

    Args:
        file_path: Path of the candidate file.

    Returns:
        bool: True if the path is a regular file whose extension is ``.pdf``
        (ignoring case), otherwise False.
    """
    extension = os.path.splitext(file_path)[1]
    return os.path.isfile(file_path) and extension.lower() == '.pdf'

In [None]:
def get_pdf_text(folder_path):
    """Load every PDF found directly inside ``folder_path`` and return its pages.

    The folder is listed non-recursively; each entry accepted by ``is_pdf`` is
    loaded with ``PyPDFLoader.load_and_split()`` and the results are collected
    into one list. Despite the function name, the return value is that list of
    loaded documents, not a single text string.

    The list of PDF file names and their count are printed for inspection.

    Args:
        folder_path: Directory that contains the PDF files.

    Returns:
        list: All documents returned by ``load_and_split()`` for the PDFs that
        could be read, in directory-listing order.
    """
    pdf_docs = [
        entry
        for entry in os.listdir(folder_path)
        if is_pdf(os.path.join(folder_path, entry))
    ]
    print(pdf_docs)
    print(len(pdf_docs))

    pages = []
    for pdf in pdf_docs:
        try:
            loader = PyPDFLoader(os.path.join(folder_path, pdf))
            pages.extend(loader.load_and_split())
        except Exception as e:
            # Deliberate best effort: one unreadable PDF is reported and
            # skipped so the remaining files are still processed.
            print(f"Error reading {pdf}: {e}")
    return pages

In [None]:
def get_text_chunks(text):
    """Split already-loaded documents into chunks with ``CharacterTextSplitter``.

    The splitter is configured with ``chunk_size=1000`` and ``chunk_overlap=0``.

    Args:
        text: The documents to split (for example the list returned by
            ``get_pdf_text``); it is passed straight to ``split_documents``.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    # Return the splitter's own output. Returning the global name `pages`
    # raises NameError on a fresh kernel and otherwise hands back stale,
    # unsplit data from an earlier cell.
    return text_splitter.split_documents(text)

In [None]:
# def get_text_chunks(text):
#     # Initialize splitters
#     char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
#     recursive_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
#                                                              chunk_overlap=100,
#                                                              separators=["\n\n", "\n", ". ", " ", ""]  # Ensure proper sentence splitting
#                                                              )

#     # First convert text to documents if it's raw text
#     if isinstance(text, str):
#         docs = char_text_splitter.create_documents([text])
#     else:
#         docs = char_text_splitter.split_documents(text)

#     # Further split the documents
#     pages = recursive_text_splitter.split_documents(docs)
#     return pages


In [None]:
# # Create pickle file of all the text chunks
# def create_pickle(folder_name):

#     text = []
#     pages = []

#     text = get_pdf_text(folder_name)
#     pages += get_text_chunks(text)

#     print(len(pages))

#     with open('sci_book_class10', 'wb') as fp:
#         pickle.dump(pages, fp)

In [None]:
def create_pickle(folder_name, pickle_file):
    """Load the PDFs in a folder, chunk them, and pickle the chunks into that folder.

    Args:
        folder_name: Directory containing the PDFs; the pickle is written into
            this same directory.
        pickle_file: File name of the pickle to create inside ``folder_name``.

    Returns:
        None. The chunk list is written to disk and progress is printed.
    """
    # Load the PDF pages, then split them into chunks.
    text = get_pdf_text(folder_name)
    pages = get_text_chunks(text)

    print(f"Total chunks created: {len(pages)}")

    # Join with os.path.join so the file lands inside the folder whether or
    # not folder_name ends with a path separator (plain concatenation would
    # produce e.g. ".../Book_Class_10Book_Class_10_chunks.pkl").
    pickle_path = os.path.join(folder_name, pickle_file)
    with open(pickle_path, 'wb') as fp:
        pickle.dump(pages, fp)

    print(f"Saved chunks to {pickle_file}")


In [None]:
# pages = create_pickle(folder_path)
# Extract folder name
folder_name = os.path.basename(os.path.normpath(folder_path.rstrip('/')))

print(folder_name)  # Output: 'Book_Class_10'

Book_Class_10


In [None]:
## Create the pickle file (comment this cell out to skip re-creating it)
pickle_file = f'{folder_name}_chunks.pkl'
create_pickle(folder_path, pickle_file)

['jesc1ps.pdf', 'jesc1an.pdf', 'jesc101.pdf', 'jesc102.pdf', 'jesc103.pdf', 'jesc104.pdf', 'jesc105.pdf', 'jesc106.pdf', 'jesc107.pdf', 'jesc108.pdf', 'jesc109.pdf', 'jesc111.pdf', 'jesc110.pdf', 'jesc112.pdf', 'jesc113.pdf']
15
Total chunks created: 652
Saved chunks to Book_Class_10_chunks.pkl


In [None]:
print(pickle_file)

Book_Class_10_chunks.pkl


In [None]:
print(folder_path+pickle_file)

/content/drive/My Drive/Book_Class_10/Book_Class_10_chunks.pkl


In [None]:
# Reload the chunk list from the pickle path used by the create_pickle call
# above (folder_path + pickle_file).
# pickle.load can execute arbitrary code embedded in the file, so only load
# pickles that this notebook produced itself.
with open (folder_path+pickle_file, 'rb') as fp:
    pages = pickle.load(fp)

In [None]:
# with open (folder_path+'_chunks.pkl', 'rb') as fp:
#     pages = pickle.load(fp)

In [None]:
len(pages)

652

In [None]:
print(pages[1])

page_content='First Edition
December 2006 Agrahayana 1928
Reprinted
November 2007, January 2009,
December 2009, November 2010,
January 2012, November 2012,
October 2013, December 2014,
December 2015, February 2017,
January 2018, January 2019,
August 2019, January 2021 and
November 2021
Revised Edition
October 2022, Kartika 1944
Reprinted
March 2024 Chaitra 1946
PD 700T  SU
© National Council of Educational
Research and Training, 2006, 2022
`     210.00
ALL RIGHTS RESERVED
q No part of this publication may be reproduced, stored in a retrieval system or
transmitted, in any form or by any means, electronic, mechanical, photocopying,
recording or otherwise without the prior permission of the publisher.
q This book is sold subject to the condition that it shall not, by way of trade,  be lent,
re-sold, hired out or otherwise disposed of without the publisher’s consent, in any
form of binding or cover other than that in which it is published.' metadata={'producer': 'GPL Ghostscript 8.15', 'cr

# Sentence Embeddings and Vector Store (Chroma; Pinecone optional)


*   Install required packages
*   Get access token from Hugging Face
*   Create the vector store (Chroma is used here; the equivalent Pinecone index code is kept commented out)
*   Store embeddings in the Chroma vector store





In [None]:
!pip install sentence-transformers --quiet
!pip install chromadb --quiet
# !pip install pinecone-client --quiet
# !pip install langchain_pinecone --quiet

In [None]:
!pip install langchainhub --quiet

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
# from pinecone import Pinecone
# from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Chroma
from google.colab import userdata
import time

In [None]:
# Hugging Face model id of the sentence-embedding model used for the chunks.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
# Run the embedding model on the CPU.
model_kwargs = {'device':'cpu'}
# LangChain embedding wrapper; it is handed to the vector store when the
# chunks are indexed.
embeddings = HuggingFaceEmbeddings(
 model_name=modelPath,
 model_kwargs=model_kwargs
)

In [None]:
# os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API_KEY')
# pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [None]:
# index_name = 'gemma-book'

In [None]:
# existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# if index_name not in existing_indexes:
#     pc.create_index(
#         name=index_name,
#         dimension=768,
#         metric="cosine"
#     )
#     while not pc.describe_index(index_name).status["ready"]:
#         time.sleep(1)

# index = pc.Index(index_name)

In [None]:
# docsearch = PineconeVectorStore.from_documents(pages, embeddings, index_name=index_name)

In [None]:
docsearch = Chroma.from_documents(pages, embeddings)

# RAG Model and Query Processing


*   Get access tokens from Hugging Face
*   Initialize the tokenizer with the model
*   Create a text generation pipeline
*   Initialize the LLM with the pipeline and model arguments
*   The final step is to generate answers using both the vector store and the LLM: the chain embeds the input question, retrieves the matching context from the vector store, and feeds that context to the LLM to generate the answer








In [None]:
!pip install -U langchain-huggingface --quiet

In [None]:
from huggingface_hub import notebook_login
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
import torch
import re


In [None]:
os.environ['HUGGINGFACE_HUB_TOKEN'] = userdata.get('HUGGINGFACE_HUB_TOKEN')
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Initialize the tokenizer with the model

# Load the google/gemma-2b-it causal-LM checkpoint. No torch_dtype is passed
# here, so the library's default dtype is used.
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
# model = AutoModelForCausalLM.from_pretrained("google/gemma-3-4b-it")
# Load the tokenizer that matches the checkpoint above.
# NOTE(review): padding/truncation/max_length are given to from_pretrained
# rather than at tokenization time — confirm they have the intended effect.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", padding=True, truncation=True, max_length=512)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
# Create a text generation pipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # max_new_tokens=512,
    # NOTE(review): `model` is an already-instantiated object, and model_kwargs
    # are presumably only forwarded to from_pretrained when the model is given
    # by name — so this bfloat16 setting may have no effect. Confirm.
    model_kwargs={"torch_dtype": torch.bfloat16},
    # Hard-coded GPU placement; presumably fails on a runtime without CUDA.
    device="cuda"
)

Device set to use cuda


In [None]:
# Initialize the LLM with pipeline and model arguments

# Expose the transformers pipeline as a LangChain LLM so it can be composed
# into the chains below.
llm = HuggingFacePipeline(
    pipeline=pipe,
    # NOTE(review): generation settings are presumably expected in
    # `pipeline_kwargs`; confirm that this temperature actually reaches
    # generation when a ready-made pipeline is supplied.
    model_kwargs={"temperature": 0.2},
)

In [None]:
def extract_answer(response):
    """Return the text that follows the first ``Assistant:`` marker in ``response``.

    Everything after the first occurrence of ``Assistant:`` (across newlines)
    is returned with surrounding whitespace stripped. When the marker is not
    present, ``response`` is returned unchanged.
    """
    found = re.search(r"Assistant:(.*)", response, re.DOTALL)
    if found is None:
        return response
    return found.group(1).strip()

In [None]:
retriever = docsearch.as_retriever(search_kwargs={'k': 7})

In [None]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import RetrievalQA

# retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 4, 'fetch_k': 20})
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the ``page_content`` of every document in ``docs`` with blank lines between them."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Retrieval chain: the incoming question is sent to the retriever, whose
# documents are joined into one string by format_docs, and is also passed
# through unchanged. Both values fill the prompt, which is then fed to the LLM.
# No output parser is attached, so the chain returns the LLM's raw output.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)



In [None]:
rag_chain.invoke("What are acids?")

"Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: What are acids? \nContext: Carbon and its Compounds 73\n4.4.2 Properties of Ethanoic Acid\nEthanoic acid is commonly called acetic acid and\nbelongs to a group of acids called carboxylic\nacids. 5-8% solution of acetic acid in water is\ncalled vinegar and is used widely as a preservative\nin pickles. The melting point of pure ethanoic acid\nis 290 K and hence it often freezes during winter\nin cold climates. This gave rise to its name glacial\nacetic acid.\nThe group of organic compounds called\ncarboxylic acids are obviously characterised by\ntheir acidic nature. However, unlike mineral acids\nlike HCl, which are completely ionised, carboxylic\nacids are weak acids.\nActivity 4.7Activity 4.7Activity 4.7Activity 4.7Activity 4.7\nn Co

In [None]:
# retriever = docsearch.as_retriever(search_kwargs={"k": 7})

# Prompt for the "stuff" RetrievalQA chain. {context} receives the retrieved
# chunks and {question} the user query. The trailing "Assistant:" marker must
# stay as-is: extract_answer() searches for it to cut the answer out of the
# generated text.
prompt_template = """Answer the question based only on the following context:\n

{context}

Always generate the precise answer based on context and do not make up an answer.
Only show the answer and nothing else.

Question: {question}

A:
"""


PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
    )
# Hand the custom prompt to the underlying document-combining chain.
chain_type_kwargs = {"prompt": PROMPT}

# Question-answering chain: retrieve chunks, stuff them into PROMPT, call the LLM.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, chain_type_kwargs=chain_type_kwargs)


In [None]:
question = "What are acids."
extract_answer(qa.invoke(question)['result'])


In [None]:
qa.invoke(question)['result']