# RAG BASED MODEL

In [5]:
import fitz # PyMuPDF

# Open the course PDF with PyMuPDF; the resulting handle is iterated page by page
# (and closed) by the text-extraction cell that follows.
doc = fitz.open("Introduction to Software Engineering.pdf")

In [8]:
# Collect the text of every page, each followed by a newline, with a single join
# rather than repeated string concatenation. The try/finally guarantees the
# document handle is released even if extraction fails on some page.
try:
    all_text = "".join(page.get_text() + "\n" for page in doc)
finally:
    doc.close()

In [9]:
# Show only a preview: printing the whole extracted document floods the notebook
# output and bloats the saved file.
print(all_text[:1000])

Introduction to Software Engineering
Definition of software:
•
Generally, people equate the term software with computer programs. However, a 
broader definition is: Software is not just the programs but also all associated 
documentation and configuration data that is needed to make these programs 
operate correctly.
•
A software system usually consists of a number of separate programs, configuration 
files, which are used to set up these programs, system documentation, which 
describes the structure of the system, and user documentation, which explains 
how to use the system and web sites for users to download recent product 
information.
•
Software engineers are concerned with developing software products, i.e., 
software which can be sold to a customer.

Software and its Types
Types of software product
Generic products
• These are stand-alone systems that are produced by a development organization and sold on the 
open market to any customer who is able to buy them.
•  Examples of t

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure a recursive splitter. Separators are listed from coarsest to finest:
# blank line, line break, full stop, single space.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_overlap=200,
    chunk_size=1000,
)

# Cut the extracted document text into overlapping chunks
chunks = splitter.split_text(all_text)

In [13]:
# Use a distinct loop variable: reusing the name `chunks` would rebind the list
# to its own last element once the loop finishes.
for i, chunk in enumerate(chunks):
    print(f"----Chunk {i}----")
    print(chunk)
    

----Chunk 0----
Introduction to Software Engineering
Definition of software:
•
Generally, people equate the term software with computer programs. However, a 
broader definition is: Software is not just the programs but also all associated 
documentation and configuration data that is needed to make these programs 
operate correctly.
•
A software system usually consists of a number of separate programs, configuration 
files, which are used to set up these programs, system documentation, which 
describes the structure of the system, and user documentation, which explains 
how to use the system and web sites for users to download recent product 
information.
•
Software engineers are concerned with developing software products, i.e., 
software which can be sold to a customer.
----Chunk 1----
Software and its Types
Types of software product
Generic products
• These are stand-alone systems that are produced by a development organization and sold on the 
open market to any customer who is abl

In [19]:
import fitz  # PyMuPDF


def load_and_chunk_pdf(pdf_path):
    """Extract one text chunk per non-blank page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``str`` or
            path-like).

    Returns:
        A list of dicts, one for every page whose extracted text is not
        blank, each holding ``file_name`` (base name of ``pdf_path``),
        ``page_num`` (1-based page number) and ``text`` (the page text).
    """
    import os  # local import: the notebook's own `import os` sits in a later cell

    # basename handles the platform's path separators and path-like objects,
    # unlike splitting on "/" by hand.
    file_name = os.path.basename(pdf_path)
    chunks = []

    # The context manager closes the document even if a page fails to extract.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            # Skip pages that contain only whitespace (e.g. image-only slides).
            if text.strip():
                chunks.append({
                    "file_name": file_name,
                    "page_num": page_num,
                    "text": text,
                })
    return chunks

In [20]:
# Build the page-level chunks for the course PDF, then display the tail of the
# list (at most the last five entries) as a quick sanity check.
pdf_chunks = load_and_chunk_pdf("Introduction to Software Engineering.pdf")
pdf_chunks[-5:]

[{'file_name': 'Introduction to Software Engineering.pdf',
  'page_num': 50,
  'text': 'Software Engineering Ethics\n•\nIn any situation where different people have different views and objectives, you are likely to \nbe faced with ethical dilemmas.\n•\n For example, if you disagree, in principle, with the policies of more senior management in \nthe company, how should you react? \n•\n Clearly, this depends on the particular individuals and the nature of the disagreement.\n•\n Is it best to argue a case for your position from within the organization or to resign in \nprinciple?\n•\n If you feel that there are problems with a software project, when do you reveal these to \nmanagement?\n•\nIf you discuss these while they are just a suspicion, you may be overreacting to a situation; if \nyou leave it too late, it may be impossible to resolve the difficulties. \nChapter 1 Introduction\n30/10/2014\n50\n'},
 {'file_name': 'Introduction to Software Engineering.pdf',
  'page_num': 51,
  'text':

In [22]:
from sentence_transformers import SentenceTransformer
import faiss
import json
import os

In [37]:
# Embedding model, plus the on-disk locations of the FAISS index and of the
# chunk metadata list whose positions correspond to the index's vector ids.
model = SentenceTransformer('all-MiniLM-L6-v2')
index_file = "my_index.faiss"
text = "texts.json"

#Load or create index
# Reuse the persisted index only when its companion metadata file exists too;
# an index without its texts cannot be mapped back to chunks.
if os.path.exists(index_file) and os.path.exists(text):
    index = faiss.read_index(index_file)
    with open(text, 'r') as f:
        texts = json.load(f)
else:
    # Take the vector size from the model instead of hard-coding 384, so that
    # swapping the embedding model does not silently break the index.
    index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
    texts = []


In [38]:
#Load texts
# Reuse the saved metadata only when the index actually holds vectors. Pairing a
# fresh, empty index with an old texts.json would make search ids point at the
# wrong entries once new vectors are added.
if os.path.exists(text) and index.ntotal > 0:
    with open(text, 'r') as f:
        texts = json.load(f)
else:
    texts = []

In [39]:
def add_text(new_items:list[dict]):
    """Embed new chunks, add them to the index, and persist index and metadata.

    Items that are already stored (same keys and values) are skipped, so
    re-running the cell does not add duplicate vectors to the persisted index.

    Args:
        new_items: Chunk dicts; each must have a ``"text"`` entry, which is the
            string that gets embedded.

    Returns:
        None. Mutates the module-level ``index`` and ``texts`` in place and
        rewrites ``index_file`` and the ``text`` JSON file when anything new
        was added.
    """
    # Nothing to embed: encoding an empty batch does not yield a 2-D array
    # that the index can accept.
    if not new_items:
        return

    # Canonical JSON form of each stored item, used to detect duplicates.
    known = {json.dumps(item, sort_keys=True, ensure_ascii=False) for item in texts}
    fresh = []
    for item in new_items:
        key = json.dumps(item, sort_keys=True, ensure_ascii=False)
        if key not in known:
            known.add(key)
            fresh.append(item)
    if not fresh:
        return

    #Extract texts and embeddings
    raw_texts = [item["text"] for item in fresh]
    embeddings = model.encode(raw_texts, normalize_embeddings=True).astype('float32')

    # Keep vector ids and list positions aligned: add to both in the same order.
    index.add(embeddings)
    texts.extend(fresh)

    faiss.write_index(index, index_file)
    with open(text, 'w') as f:
        json.dump(texts, f, ensure_ascii=False, indent=4)


# Embed the page-level chunks extracted from the PDF and persist them.
add_text(pdf_chunks)

# Retrieve

In [40]:
def retrive(query, k=5):
    """Return the stored chunks closest to ``query`` in embedding space.

    Args:
        query: Natural-language question to embed and search for.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` entries of ``texts``, in the order produced by the index
        search. Fewer than ``k`` are returned when the index holds fewer
        vectors.
    """
    query_embedding = model.encode([query], normalize_embeddings=True).astype('float32')
    _, neighbor_ids = index.search(query_embedding, k)
    # FAISS fills missing neighbours with id -1; indexing texts[-1] would
    # silently return the last stored chunk, so drop those placeholders.
    return [texts[i] for i in neighbor_ids[0] if i >= 0]
# Try the retriever on a sample question and display the matching chunks.
retrive("What is software engineering?")

[{'file_name': 'Introduction to Software Engineering.pdf',
  'page_num': 4,
  'text': 'Fundamental Software Engineering Activities\nSoftware engineering \n• Software engineering is an engineering discipline that is concerned with all aspect of software \nproduction from the early stages of system specification to maintain in the system after it has \ngone into use.\n•  In this definition, there are two key phrases:\n• Engineering discipline:\n• Engineers make things work.\n• They apply theories, methods and tools where these are appropriate, but they use them \nselectively and always try to discover solutions to problems even when there are no applicable \ntheories and methods.\n•  Engineers also recognize that they must work to organizational and financial constraints, so they \nlook for solutions within these constraints.\n'},
 {'file_name': 'Introduction to Software Engineering.pdf',
  'page_num': 5,
  'text': 'Fundamental Software Engineering Activities\n• All aspects of software p

# Calling the LLM with a proper prompt


In [58]:
# Prompt sent to the LLM. `{context}` and `{question}` mark where the retrieved
# chunks (serialized as JSON) and the user's question are inserted. The braces
# of the JSON example near the end are doubled (`{{` / `}}`).
prompt_template = """ 
You are a helpful and intelligent study assistant. You are given a list of JSON objects as context, each representing 
extracted text from a pdf document.
each object contains 
- file_name: The name of the pdf file
- page_num: The page number in the pdf file
- text: The extracted text from the pdf file

your task is to answer the questions primarily based 'text' field of the JSON objects. You may reason and infer 
if exact answer is not present in the text, as long as it is relevant to the context.



#context
{context}

#question
{question}

Instructions:
1. Answer the question based on the provided context.
2. If the answer is not explicitly present, infer it based on the context.
3. Provide a concise and accurate response.
4. You may use information from the 'file_name' and 'page_num' fields to provide additional context if necessary.

return json object with
answer(str): helpful answer to the question
file_name(str): name of the file from which the answer is extracted
page_num(int): page number from which the answer is extracted

if the answer is not present in the context, return:
{{
"answer": "I don't know the answer to this question.",
"file_name": null,
"page_num": null
}}

Respond ONLY with the JSON object, do not include any additional text or explanation

"""

In [46]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

In [50]:
# Load the API key from a .env file. The workshop path is tried first; when that
# file is missing (for example on another machine) fall back to python-dotenv's
# default lookup of a `.env` file, so the notebook is not tied to one absolute path.
dotenv_path = "/home/anudip/Workshop/Day_2/.env"
if not load_dotenv(dotenv_path):
    load_dotenv()

# Chat model used to answer questions; the key is read from the environment
# rather than written into the notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.7,
    api_key=os.getenv("GEMINI_API_KEY"),
)

In [61]:
import json
import re

In [59]:
def json_to_obj(json_str: str):
    """Parse an LLM reply that is expected to contain one JSON value.

    A surrounding Markdown code fence is removed first, whether or not it
    carries a language tag such as ``json``. If the remaining text is still
    not valid JSON, the span from the first ``{`` to the last ``}`` is tried,
    which copes with replies that wrap the object in extra prose.

    Args:
        json_str: Raw text returned by the model.

    Returns:
        The decoded JSON value, or an empty dict when nothing could be parsed
        (the original decode error is printed in that case).
    """
    # Strip an opening fence with any (or no) language tag, and a closing fence.
    cleaned = re.sub(r"^```[a-zA-Z]*\s*|\s*```$", "", json_str.strip())
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        first_error = e

    # Fallback: try the outermost {...} span inside the reply.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= start < end:
        try:
            return json.loads(cleaned[start:end + 1])
        except json.JSONDecodeError:
            # Report the error from the first attempt below.
            pass

    print("Failed to parse JSON: ", first_error)
    return {}
    
def rag_answer(question):
    """Answer ``question`` with the LLM, grounded in retrieved PDF chunks.

    Args:
        question: The user's question as plain text.

    Returns:
        The object parsed from the model's reply by ``json_to_obj`` (an empty
        dict when the reply could not be parsed).
    """
    chunks = retrive(question)
    # ensure_ascii=False keeps bullets and other non-ASCII characters readable
    # in the prompt instead of turning them into \uXXXX escapes.
    context = json.dumps(chunks, ensure_ascii=False)
    # str.format fills only the {context} and {question} placeholders and turns
    # the doubled braces of the JSON example into single ones. Replacing the
    # bare words "context"/"question" would also rewrite them wherever they
    # occur in the instructions or in the retrieved text.
    prompt = prompt_template.format(context=context, question=question)
    response = llm.invoke(prompt)
    return json_to_obj(response.content)

In [60]:
# End-to-end check: retrieve context for a sample question and show the parsed answer.
rag_answer("What is software engineering?")

{'answer': 'Software engineering is an engineering discipline concerned with all aspects of software production, from initial system specification to maintenance after its release.  It involves not only technical processes but also project management and the development of supporting tools, methods, and theories.  While aiming for systematic and organized approaches, it also recognizes the need for creative solutions when facing complex problems or constraints.',
 'file_name': 'Introduction to Software Engineering.pdf',
 'page_num': 4}