In [1]:
import os

In [2]:
import faiss

In [3]:
import numpy as np

In [4]:
import re

In [5]:
from sentence_transformers import SentenceTransformer
# The embedding model is created once, in the configuration cell further down
# (EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L12-v2")). Loading
# all-mpnet-base-v2 here as well only initialised a model that was replaced
# before anything used it.

  from tqdm.autonotebook import tqdm, trange


In [6]:
from langchain.document_loaders import PDFPlumberLoader

In [7]:
from langchain_ollama.llms import OllamaLLM 

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
# Sentence-embedding model shared by index_documents and find_related_documents.
# Alternative that was tried: SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L12-v2")

In [10]:
# LLM used by generate_answer, which drops everything up to a closing
# </think> tag from this model's replies before returning them.
LANGUAGE_MODEL = OllamaLLM(model="deepseek-r1:1.5b")

In [11]:
# Directory that save_uploaded_file writes PDFs into. Set the PDF_STORAGE_PATH
# environment variable to relocate it on another machine; when the variable is
# unset the original location is used.
PDF_STORAGE_PATH = os.environ.get(
    "PDF_STORAGE_PATH", "C:\\Users\\aravi\\Downloads\\pdfwrite"
)

In [12]:
# Size the FAISS L2 index from the embedding model so vector widths always match.
# NOTE: re-running this cell replaces the index with a new empty one while
# doc_store keeps its old entries; re-run the doc_store cell and re-index too.
dimension = EMBEDDING_MODEL.get_sentence_embedding_dimension()
DOCUMENT_VECTOR_DB = faiss.IndexFlatL2(dimension)

In [13]:
# Maps the integer id assigned in index_documents to that chunk's text, so the
# ids returned by a FAISS search can be turned back into passages.
doc_store = {}

In [14]:
def save_uploaded_file(uploaded_file, filename):
    """Write an uploaded PDF into ``PDF_STORAGE_PATH`` and return its path.

    Args:
        uploaded_file: Binary file-like object; whatever ``read()`` returns is
            written out unchanged.
        filename: Name of the file to create inside ``PDF_STORAGE_PATH``.

    Returns:
        The full path of the file that was written.

    Raises:
        OSError: If the storage directory or the file cannot be created.
    """
    # Create the storage directory on first use; without this, open() raises
    # FileNotFoundError on a machine where the directory does not exist yet.
    os.makedirs(PDF_STORAGE_PATH, exist_ok=True)
    file_path = os.path.join(PDF_STORAGE_PATH, filename)
    with open(file_path, "wb") as file:
        file.write(uploaded_file.read())
    return file_path

In [15]:
def load_pdf_documents(file_path):
    """Read the PDF at ``file_path`` with PDFPlumberLoader and return what it loads."""
    return PDFPlumberLoader(file_path).load()

In [16]:
def chunk_documents(raw_documents, chunk_size=1000, chunk_overlap=300):
    """Split documents into smaller, overlapping text chunks.

    Args:
        raw_documents: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value passed to the splitter as ``chunk_size``. The default
            keeps the previously hard-coded 1000.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``. The
            default keeps the previously hard-coded 300.

    Returns:
        The chunks produced by the splitter. ``add_start_index=True`` is passed
        through to the splitter as before.
    """
    text_processor = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return text_processor.split_documents(raw_documents)

In [17]:
def index_documents(document_chunks):
    """Embed chunks, add them to the FAISS index, and record id -> text.

    Args:
        document_chunks: Iterable of objects exposing ``page_content``. It is
            consumed once, so a generator works.

    Returns:
        None. ``DOCUMENT_VECTOR_DB`` and ``doc_store`` are extended in place.
        An empty input is a no-op.
    """
    texts = [doc.page_content for doc in document_chunks]
    if not texts:
        # Nothing to embed; skip the encoder and avoid handing FAISS an
        # array without a proper (n, dimension) shape.
        return

    embeddings = np.array(
        EMBEDDING_MODEL.encode(
            texts,
            convert_to_numpy=True,
            normalize_embeddings=True,
        ),
        dtype=np.float32,
    )

    # Take the first new id from the index itself rather than len(doc_store),
    # so ids keep matching the positions FAISS reports even if the two
    # containers were reset independently.
    start_id = DOCUMENT_VECTOR_DB.ntotal

    # Add to the index first: if this raises, doc_store is left untouched
    # instead of holding texts that have no vector.
    DOCUMENT_VECTOR_DB.add(embeddings)

    for offset, text in enumerate(texts):
        doc_store[start_id + offset] = text

In [18]:
def find_related_documents(query, k=2):
    """Return the stored chunk texts for the ``k`` index entries nearest to ``query``.

    The query is embedded with the same normalisation used at indexing time.
    Ids returned by the search that have no entry in ``doc_store`` are skipped,
    so fewer than ``k`` texts may come back.
    """
    encoded_query = EMBEDDING_MODEL.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    query_vector = np.array(encoded_query, dtype=np.float32)

    # search() returns (distances, ids); only the ids for the single query row are needed.
    neighbour_ids = DOCUMENT_VECTOR_DB.search(query_vector, k)[1][0]

    matches = []
    for chunk_id in neighbour_ids:
        if chunk_id in doc_store:
            matches.append(doc_store[chunk_id])
    return matches

In [19]:
# def expand_query(query):
#     """Expands a short query into a more detailed search phrase."""
#     expansion_prompt = f"""
#     Given this short user query: "{query}"
#     Expand it into a more detailed question for better document retrieval.
#     """
#     return LANGUAGE_MODEL.invoke(expansion_prompt)

def prompt_template(user_query, context_text):
    """Build the answering prompt from the user's question and retrieved context.

    Args:
        user_query: The question, inserted verbatim under "User Query".
        context_text: The retrieved document text, inserted verbatim under "Context".

    Returns:
        The full prompt string, ending with the "Final Answer" heading and a newline.
    """
    # The two backslash-n mentions in the instructions are escaped so the model
    # sees the two-character token it is being told about; unescaped, Python
    # turned them into real line breaks inside the backticks.
    return f"""You are an intelligent assistant tasked with answering user queries strictly based on the provided document.

### **Context (Extracted from Document)**
{context_text}

### **User Query**
{user_query}

### **Instructions for Answering**
- Use only the provided document content to generate a response.
- Do **not** introduce any external information or assumptions.
- Extract relevant details from the document and structure them into a **coherent, well-formed response**.
- Ensure the response is **clear and readable**, maintaining natural formatting without unnecessary line breaks (`\\n`).
- Do **not** insert extra newlines unless they are essential for meaning or readability.
- Avoid excessive spacing, unnatural paragraph breaks, or disjointed formatting.
- If special characters like `\\n` are found in the extracted content, **omit them unless needed for clarity**.
- If no relevant information is available, respond with: "I'm sorry, but I couldn't find an answer in the provided document."
- Do not provide explanations or context beyond what is necessary to answer the query.

### **Final Answer (Ensure clear, natural formatting without unnecessary newlines)**
"""



In [20]:
def _strip_reasoning(response_text):
    """Return ``response_text`` without everything up to its first ``</think>`` tag."""
    return re.split(r"</think>\s*", response_text, maxsplit=1)[-1].strip()


def generate_answer(user_query,related_docs):
    """Answer ``user_query`` from already-retrieved chunks, then refine the answer.

    Args:
        user_query: The user's question.
        related_docs: Chunk texts to use as context (retrieval happens in the
            caller, not here).

    Returns:
        A three-item list ``[answer, refined_answer, context_text]``. When
        ``related_docs`` is empty the fallback message fills both answer slots
        and the context is ``""``, so callers can unpack the same shape on
        every path.
    """
    if not related_docs:
        fallback = "I'm sorry, but I couldn't find an answer in the provided document."
        return [fallback, fallback, ""]

    context_text = "\n\n".join(related_docs)
    prompt = prompt_template(user_query, context_text)

    # Strip the model's reasoning before refining, so the second call polishes
    # the answer itself rather than the reasoning that preceded it.
    ai_response1 = _strip_reasoning(LANGUAGE_MODEL.invoke(prompt))
    ai_response2 = _strip_reasoning(
        LANGUAGE_MODEL.invoke(f"response from an AI model {ai_response1} refine the response to clear,concise and meaninglful by without changing or avoiding any information ")
    )
    return [ai_response1, ai_response2, context_text]


In [21]:
# Copy the source PDF into PDF_STORAGE_PATH under its own base name.
uploaded_pdf = "C:\\Users\\aravi\\Downloads\\mypdf.pdf"
if uploaded_pdf:
    filename = os.path.basename(uploaded_pdf)
    # Open the source in a with-block so the handle is closed as soon as the
    # copy finishes instead of staying open for the life of the kernel.
    with open(uploaded_pdf, "rb") as source_file:
        file_path = save_uploaded_file(source_file, filename)
    print(f"PDF saved at: {file_path}")

PDF saved at: C:\Users\aravi\Downloads\pdfwrite\mypdf.pdf


In [27]:
# Load the saved PDF; file_path is set by the upload cell, which must run first.
raw_docs = load_pdf_documents(file_path)


In [23]:
# Split the loaded documents into overlapping chunks ready for embedding.
processed_chunks = chunk_documents(raw_docs)

In [28]:
# Embed the chunks and append them to the FAISS index and doc_store.
# NOTE: index_documents only appends, so running this cell again indexes the
# same chunks a second time.
index_documents(processed_chunks)

In [25]:
# Example query: fetch the stored chunks closest to the question (k defaults to 2).
user_input = "How to switch from Airtel Prepaid to Postpaid?"
related_docs = find_related_documents(user_input)

In [26]:
# Ask the LLM to answer from the retrieved chunks; the result is a list whose
# first item is the answer text.
ai_response = generate_answer(user_input,related_docs)