In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage


def load_and_chunk_document(file_path, chunk_size=1000, chunk_overlap=200):
    """
    Load and split the document into overlapping chunks.
    """
    print("Loading and chunking document...")
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Document split into {len(chunks)} chunks.")
    return chunks
    

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
file_path = "C://Users//bvcha//Desktop//DocumetQA//uploaded_pdfs//BVCHANDRAHAAS_RESUME.pdf"
print(load_and_chunk_document(file_path))


Loading and chunking document...
Document split into 4 chunks.
[Document(metadata={'source': 'C://Users//bvcha//Desktop//DocumetQA//uploaded_pdfs//BVCHANDRAHAAS_RESUME.pdf', 'page': 0}, page_content='B.V.CHANDRAHAAS\nNellore, Andhra Pradesh\n♂phone+91 7993577106 /envel⌢pebvchandrahaas@gmail.com /linkedinlinkedin.com/in/bvchandrahaas /githubgithub.com/BVChandrahaas\nEducation\nVellore Institute of Technology Sep 2020 – May 2024\nB.Tech[ CSE with Spec. in Data Analytics] — CGPA: 8.52 Amaravati, Andhra Pradesh\nSri Chaitanya Junior Kalasala June 2018 – Mar 2020\nClass 12, MPC, TSBIE — 87.4 % Hyderabad, Telangana\nSri Chaitanya Techno School Mar 2018\nClass 10, TS.SSC — CGPA: 9.5 Hyderabad, Telangana\nRelevant Coursework\n• Data Structures\n• Machine Learning\n• DBMS\n• Neural Networks\n• Artificial Intelligence\n• Deep Learning\n• Software Engineering\n• NLP\nProjects\nDeciphering the growing popularity of laptops Jan 2021\n• Collected Data from 200 users aged between 18 to 51.\n• Perform

In [4]:
def generate_embeddings_and_store(chunks):
    """
    Generate embeddings for chunks and store in FAISS.
    """
    print("Generating embeddings and storing in FAISS...")
    model = SentenceTransformer("all-MiniLM-L6-v2")

    texts = [chunk.page_content for chunk in chunks]
    embeddings = model.encode(texts, show_progress_bar=True)

    embedding_dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(embeddings)

    metadata = [{"text": chunk.page_content, "metadata": chunk.metadata} for chunk in chunks]
    print("Embeddings generated and stored successfully.")
    return index, metadata, model

In [5]:
chunks = load_and_chunk_document(file_path)
print(generate_embeddings_and_store(chunks))

Loading and chunking document...
Document split into 4 chunks.
Generating embeddings and storing in FAISS...


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.91it/s]

Embeddings generated and stored successfully.
(<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001D670B57FC0> >, [{'text': 'B.V.CHANDRAHAAS\nNellore, Andhra Pradesh\n♂phone+91 7993577106 /envel⌢pebvchandrahaas@gmail.com /linkedinlinkedin.com/in/bvchandrahaas /githubgithub.com/BVChandrahaas\nEducation\nVellore Institute of Technology Sep 2020 – May 2024\nB.Tech[ CSE with Spec. in Data Analytics] — CGPA: 8.52 Amaravati, Andhra Pradesh\nSri Chaitanya Junior Kalasala June 2018 – Mar 2020\nClass 12, MPC, TSBIE — 87.4 % Hyderabad, Telangana\nSri Chaitanya Techno School Mar 2018\nClass 10, TS.SSC — CGPA: 9.5 Hyderabad, Telangana\nRelevant Coursework\n• Data Structures\n• Machine Learning\n• DBMS\n• Neural Networks\n• Artificial Intelligence\n• Deep Learning\n• Software Engineering\n• NLP\nProjects\nDeciphering the growing popularity of laptops Jan 2021\n• Collected Data from 200 users aged between 18 to 51.\n• Performed Visual analysis and obtai




In [13]:
index,metadata,model = generate_embeddings_and_store(chunks)
query = "Tell me about India "
def retrieve_relevant_chunks(index, metadata, query, model, k=5):
    """
    Retrieve the most relevant chunks using FAISS and query embeddings.
    """
    print("Retrieving relevant chunks...")
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, k=k)

    # Fetch metadata for the top results
    retrieved_docs = [metadata[idx] for idx in indices[0]]

    # Debug: Print retrieved chunks
    for idx, doc in enumerate(retrieved_docs):
        print(f"DEBUG - Chunk {idx + 1}: {doc['text'][:200]}...")
    return retrieved_docs
print(retrieve_relevant_chunks(index,metadata,query,model))

Generating embeddings and storing in FAISS...


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.20it/s]

Embeddings generated and stored successfully.
Retrieving relevant chunks...
DEBUG - Chunk 1: B.V.CHANDRAHAAS
Nellore, Andhra Pradesh
♂phone+91 7993577106 /envel⌢pebvchandrahaas@gmail.com /linkedinlinkedin.com/in/bvchandrahaas /githubgithub.com/BVChandrahaas
Education
Vellore Institute of Tech...
DEBUG - Chunk 2: • Performed Visual analysis and obtained key insights.
A Study on Eye wear purchases from Lenskart Oct 2022
• Conducted comprehensive marketing research project, leveraging data from Lenskart users.
•...
DEBUG - Chunk 3: ∗ Published in EAI endorsed transactions of IOT.
Certifications
1) Oracle Cloud Infrastructure Certified Generative AI
2) Introduction to Data Analytics
3) SmartBridge AI Externship Program
4) Summer ...
DEBUG - Chunk 4: • A Project focused on summarizing videos using State-of-the-art generative AI models
A System and Method for Multi-Class Paddy Disease Detection using MLOps April 2024
• Proposed a new Ensemble frame...
DEBUG - Chunk 5: ∗ Published in EAI endo




In [14]:
retrieved_docs = retrieve_relevant_chunks(index,metadata,query,model)
def combine_chunks(retrieved_docs):
    """
    Combine chunks into a single string with proper structure.
    """
    combined_context = ""
    for idx, doc in enumerate(retrieved_docs):
        text = doc.get("text", "").strip()
        if text:
            combined_context += f"Chunk {idx + 1}:\n{text}\n\n"
    return combined_context
combined_context = combine_chunks(retrieved_docs)
print(combined_context)

Retrieving relevant chunks...
DEBUG - Chunk 1: B.V.CHANDRAHAAS
Nellore, Andhra Pradesh
♂phone+91 7993577106 /envel⌢pebvchandrahaas@gmail.com /linkedinlinkedin.com/in/bvchandrahaas /githubgithub.com/BVChandrahaas
Education
Vellore Institute of Tech...
DEBUG - Chunk 2: • Performed Visual analysis and obtained key insights.
A Study on Eye wear purchases from Lenskart Oct 2022
• Conducted comprehensive marketing research project, leveraging data from Lenskart users.
•...
DEBUG - Chunk 3: ∗ Published in EAI endorsed transactions of IOT.
Certifications
1) Oracle Cloud Infrastructure Certified Generative AI
2) Introduction to Data Analytics
3) SmartBridge AI Externship Program
4) Summer ...
DEBUG - Chunk 4: • A Project focused on summarizing videos using State-of-the-art generative AI models
A System and Method for Multi-Class Paddy Disease Detection using MLOps April 2024
• Proposed a new Ensemble frame...
DEBUG - Chunk 5: ∗ Published in EAI endorsed transactions of IOT.
Certifications
1) Or

In [16]:
context = combined_context
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

def generate_answer(context, query):
    """
    Generate an answer using OpenAI GPT-4 chat model.
    If the context doesn't have an answer, the model replies accordingly.
    """
    # Define the chat-based prompt template
    prompt = ChatPromptTemplate.from_template(
        """
        You are a document assistant. Use only the information in the provided context to answer the question.
        If the context does not contain the answer, reply: 
        "I'm sorry, I do not know the answer based on the provided context."

        Context: {context}

        Question: {question}

        Answer:
        """
    )

    # Initialize the ChatOpenAI model
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.4, openai_api_key="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")

    # Format the prompt
    formatted_prompt = prompt.format(context=context, question=query)
    
    # Generate response
    response = llm.invoke(formatted_prompt)

    return response.content
print(generate_answer(context,query))


I'm sorry, I do not know the answer based on the provided context.
