In [12]:
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
import io
import re
import uuid
from qdrant_client.models import VectorParams, Distance
from langchain.vectorstores import Qdrant
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.agents import Tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_groq import ChatGroq



In [13]:
# Load environment variables from .env file
load_dotenv()

# Initialize Qdrant client from QDRANT_URL / QDRANT_API_KEY
qdrant_client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
)

# Initialize the LLMs (one Groq chat model, one Ollama model) and the Ollama embeddings
Groq_llm = ChatGroq(model="qwen-qwq-32b")
ollama_llm = OllamaLLM(model="llama3.1")
ollama_embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Global variables
pdf_id = None
collection_name = "pdf_documents"
# Vector size of the collection; it has to match the output dimension of
# `ollama_embeddings`, otherwise uploads to the collection are rejected.
EMBEDDING_DIM = 768

# Create the collection only when it is really missing. Asking the server
# explicitly (instead of treating *any* exception from get_collection as
# "missing") lets connection and authentication errors surface as themselves.
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
    )



In [14]:
def load_pdf(pdf_bytes: bytes):
    """Read a PDF and return one ``Document`` per page.

    Args:
        pdf_bytes: Raw PDF bytes, or anything ``PdfReader`` accepts directly
            (for example a binary stream such as ``io.BytesIO``).

    Returns:
        A list of ``Document`` objects. Each holds the extracted page text in
        ``page_content`` and the metadata ``{"page": <1-based page number>,
        "pdf_id": <uuid string>}``.

    Side effects:
        Rebinds the module-level ``pdf_id`` to a fresh UUID4 string on every
        call; all pages of one call share that ID.
    """
    # A new ID is generated for every load and published through the global.
    global pdf_id
    pdf_id = str(uuid.uuid4())

    # PdfReader expects a stream or a path, so raw bytes are wrapped first;
    # streams and paths are passed through unchanged.
    if isinstance(pdf_bytes, (bytes, bytearray)):
        source = io.BytesIO(pdf_bytes)
    else:
        source = pdf_bytes

    reader = PdfReader(source)
    documents = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Defensive: keep page_content a string even if extraction yields nothing.
        text = page.extract_text() or ""
        documents.append(
            Document(page_content=text, metadata={"page": page_number, "pdf_id": pdf_id})
        )
    return documents

def upload_to_qdrant(documents, collection_name):
    """Split ``documents`` into chunks and add the chunks to a Qdrant collection.

    Args:
        documents: Documents to index (as produced by ``load_pdf``).
        collection_name: Name of the target Qdrant collection.

    Returns:
        None. The chunks are embedded with ``ollama_embeddings`` and written
        through the module-level ``qdrant_client``.
    """
    # Chunking is done here, before the upload (chunk_size=1000, chunk_overlap=200).
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_documents(documents)

    # Vector store bound to the requested collection.
    vector_store = Qdrant(
        client=qdrant_client,
        collection_name=collection_name,
        embeddings=ollama_embeddings,
    )
    vector_store.add_documents(chunks)

  

def extract_questions(documents):
    """Extract numbered questions (and their marks, if stated) from page documents.

    A question starts at a marker such as ``Q1.`` (case-insensitive, anywhere)
    or ``1.`` (at the start of a line) and runs until the next marker or the
    end of the page text, so questions that span several lines are kept whole.

    Args:
        documents: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict with ``"page"`` and ``"pdf_id"``).

    Returns:
        A list of dicts with the keys ``"question"`` (stripped text),
        ``"marks"`` (int, or None when no ``[n]`` / ``[n marks]`` tag is
        present), ``"page"`` and ``"pdf_id"``. Markers with no text after
        them are skipped.
    """
    all_questions = []

    # The body is terminated by the next marker or by \Z (end of the whole
    # text). `$` must not be used here: with re.MULTILINE it matches at every
    # line end, which would cut each question off after its first line.
    question_pattern = re.compile(
        r'(?:Q\d+\.|^\d+\.)\s*(.*?)(?=(?:Q\d+\.|^\d+\.)|\Z)',
        re.IGNORECASE | re.DOTALL | re.MULTILINE
    )
    # Marks in formats like [2], [5 Marks], [1 mark].
    marks_pattern = re.compile(r'\[(\d+)\s*(marks?)?\]', re.IGNORECASE)

    for doc in documents:
        text = doc.page_content or ""

        for match in question_pattern.finditer(text):
            question_text = match.group(1).strip()
            if not question_text:
                # A bare marker with nothing after it is not a question.
                continue

            marks_match = marks_pattern.search(question_text)
            marks = int(marks_match.group(1)) if marks_match else None

            all_questions.append({
                "question": question_text,
                "marks": marks,
                "page": doc.metadata["page"],
                "pdf_id": doc.metadata["pdf_id"]
            })

    return all_questions

In [15]:
# Read the reference PDF (Ans.pdf) into memory as raw bytes; it is passed to
# the main graph further down as the "Referal" state entry.
with open("Ans.pdf", "rb") as f:
    Referal = f.read()


# NOTE(review): stale call kept for reference only — Referal_PDF_to_Qdrant
# takes a single state dict, not (stream, collection_name).
# Referal_PDF_to_Qdrant( io.BytesIO(Referal), collection_name)

In [16]:


# Read the question paper (Java.pdf) into memory as raw bytes; it is passed to
# the main graph further down as the "QuePdf" state entry.
with open("Java.pdf", "rb") as f:
    QuePdf = f.read()

# NOTE(review): stale call kept for reference only — qus_loading takes a
# single state dict, not a stream.
# Ques = qus_loading(io.BytesIO(QuePdf))
# print(Ques)

In [17]:
from langchain.prompts import ChatPromptTemplate

# System template for the final answer-formatting step. It is filled with four
# variables: question, marks, retrieval_answer and tavily_answer.
ANSWER_SYSTEM_TEMPLATE = """
You are a helpful academic assistant. You will receive a question, a number of marks (if provided), and answers from two sources:

1. Retrieval-based answer from PDF/document search (called 'retrieval_answer').
2. Web search answer from Tavily (called 'tavily_answer').

Your task is to produce a single, structured, and complete answer based primarily on the retrieval_answer. Use the tavily_answer only to supplement or fill gaps if the retrieval_answer is insufficient or unclear.

Instructions:
- If marks are not provided, assume 5 marks.
- Adjust the depth of the answer based on marks:
  - 1–2 marks: Short and precise answer (2–4 lines).
  - 3–5 marks: Medium-length answer; include example or code if relevant.
  - More than 5 marks: Detailed explanation with code, diagrams (text), and a conclusion.
- If the question is about programming, include a relevant **code example**.
- Combine the two answers intelligently: prioritize retrieval_answer (~80%) and tavily_answer (~20%), but omit tavily_answer if retrieval_answer is complete.

### Input variables:
- question: The question asked.
- marks: Number of marks for the answer.
- retrieval_answer: Answer from the retrieval tool.
- tavily_answer: Answer from the Tavily web search.

### Format your answer as follows:
---
**Question:** {question}

**Marks:** {marks}

**Retrieval-based Answer:**  
{retrieval_answer}

**Tavily Answer:**  
{tavily_answer}

**Combined Answer:**
- Introduction (if needed)
- Diagram (if needed)
- Key Points (in bullets)
- Explanation
- Code Example (if applicable)
- Conclusion/Summary
---
"""

# The whole template is sent as a single system message.
prompt = ChatPromptTemplate.from_messages([("system", ANSWER_SYSTEM_TEMPLATE)])






# Create the retriever: a Qdrant vector store over the shared collection,
# exposed through its default retriever interface.
qdrant = Qdrant(
    client=qdrant_client,
    collection_name=collection_name,
    embeddings=ollama_embeddings,   
)
retriver = qdrant.as_retriever()



# ------------------------- Tool wrappers (bound to the LLM below) ----------------------------------

# Wrap the retriever call as the "RAG" tool
rag_tool = Tool.from_function(
    func=retriver.invoke,
    name="RAG",
    description="Use this tool to answer academic questions from PDF content.",
    return_direct=False  # Set to True only if you want the tool result to be final output
)

# Create the Tavily web-search client
tavily_search = TavilySearchResults()
# Wrap the Tavily call as the "TavilySearch" tool
tavily_tool = Tool.from_function(
    func=tavily_search.invoke,
    name="TavilySearch",
    description="Use this tool to answer general or web-based questions using Tavily search.",
    return_direct=False
)

# Bind both tools to the Groq chat model
tools = [rag_tool, tavily_tool]
llm_with_tools = Groq_llm.bind_tools(tools)

# Final formatting chain: the answer prompt piped into the Ollama LLM.
# llm_chain_with_tools = ollama_model.bind_tools(tools)
llm_chain = prompt | ollama_llm




In [18]:
# Create graph
import sys
from typing_extensions import TypedDict
from typing import Annotated
from langchain_core.messages import AnyMessage
from langgraph.graph.message import add_messages
from langgraph.graph import StateGraph, START, END
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from io import BytesIO
from langchain.tools import tool
import io
import markdown
from xhtml2pdf import pisa

import os, sys
# Go one directory up to reach project root. This is relative to the current
# working directory, so the notebook is expected to run from its own folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guarded so that re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Set up Django. The settings module must be configured and django.setup()
# must have run before any app module (chat_bot.email_tasks) is imported.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "PDF_QA.settings")
import django
django.setup()
from chat_bot.email_tasks import send_email_task






In [19]:
class State(TypedDict):
    """State carried through the per-question answer graph."""
    # Conversation messages; node updates to this key are combined by add_messages.
    messages: Annotated[list[AnyMessage], add_messages]
    # Marks allotted to the question; read by format_answer via state.get("marks").
    marks:int



def run_llm_with_tools(state: State):
    """Graph node: send the conversation so far to the tool-bound LLM.

    Returns:
        A state update whose ``messages`` is the existing history followed by
        the model's reply.
    """
    history = state["messages"]
    reply = llm_with_tools.invoke(history)
    return {"messages": history + [reply]}


def _tool_output_to_text(output):
    """Flatten a tool result (string, documents, or result dicts) into plain text."""
    if isinstance(output, str):
        return output
    if isinstance(output, (list, tuple)):
        parts = []
        for item in output:
            if isinstance(item, dict):
                if "content" in item:
                    body = str(item["content"]).strip()
                    url = item.get("url")
                    parts.append(f"{body}\nSource: {url}" if url else body)
                else:
                    parts.append(str(item))
            else:
                # Retrieved documents expose their text as `page_content`.
                parts.append(str(getattr(item, "page_content", item)))
        return "\n\n".join(part for part in parts if part)
    return str(output)


def ToolExecutor(state: State):
    """Graph node: query both the RAG tool and the Tavily tool for the question.

    The query is the content of the second-to-last message, i.e. the message
    that precedes the LLM reply appended by ``run_llm_with_tools``.

    Exactly two assistant messages are always appended, in a fixed order:
    the retrieval result first, the Tavily result second. ``format_answer``
    reads them by position (``messages[-2]`` and ``messages[-1]``), so a
    failing tool contributes a placeholder instead of being dropped —
    otherwise the remaining answer would shift into the wrong slot.

    Returns:
        A state update whose ``messages`` is the existing history followed by
        the two tool-result messages.
    """
    query_content = state["messages"][-2].content

    try:
        rag_content = _tool_output_to_text(rag_tool.invoke(query_content))
    except Exception as e:
        # Best effort: report the failure and keep the pipeline running.
        print(f"RAG Tool Error: {e}")
        rag_content = "No retrieval answer is available (the RAG tool failed)."

    try:
        tavily_content = _tool_output_to_text(tavily_tool.invoke(query_content))
    except Exception as e:
        print(f"Tavily Tool Error: {e}")
        tavily_content = "No web search answer is available (the Tavily tool failed)."

    ans = [
        {"role": "assistant", "content": rag_content},
        {"role": "assistant", "content": tavily_content},
    ]
    return {"messages": state["messages"] + ans}


    

def format_answer(state: State):
    """Graph node: merge the retrieval and web answers into the final answer.

    Reads the question from the first message, the retrieval answer from
    ``messages[-2]`` and the Tavily answer from ``messages[-1]``, then runs
    ``llm_chain`` on them.

    Returns:
        A state update whose ``messages`` is the existing history followed by
        the formatted answer as an assistant message.
    """
    question = state["messages"][0].content
    retrieval_answer = state["messages"][-2].content
    tavily_answer = state["messages"][-1].content

    # Marks may be absent, None, or the string "None" (srart_graph stringifies
    # the extracted value). Apply the default the prompt promises instead of
    # sending a literal "None" to the model.
    marks = state.get("marks")
    if marks is None or str(marks).strip().lower() in ("", "none"):
        marks = 5

    llm_input = {
        "question": question,
        "marks": marks,
        "retrieval_answer": retrieval_answer,
        "tavily_answer": tavily_answer,
    }
    result = llm_chain.invoke(llm_input)

    # The chain may return plain text or a message object. A bare string put
    # into `messages` would be stored as a human message, so the answer is
    # tagged explicitly as coming from the assistant.
    answer_text = getattr(result, "content", result)
    return {"messages": state["messages"] + [{"role": "assistant", "content": answer_text}]}



# Per-question graph: START -> LLM_with_tools -> Tools -> format_answer -> END
Builder = StateGraph(State)

# Register the nodes
for _node_name, _node_fn in (
    ("format_answer", format_answer),
    ("LLM_with_tools", run_llm_with_tools),
    ("Tools", ToolExecutor),
):
    Builder.add_node(_node_name, _node_fn)

# Chain the stages into a straight line, each one feeding the next
_answer_pipeline = (START, "LLM_with_tools", "Tools", "format_answer", END)
for _src, _dst in zip(_answer_pipeline, _answer_pipeline[1:]):
    Builder.add_edge(_src, _dst)

# Compile into the runnable graph
graph = Builder.compile()

# Function to run the graph
# def run_graph(question: str, marks: int = 5):
#     initial_state = {
#         "messages": [
#             {"role": "user","content": question}
#         ],
#         "marks": marks 
#     }
#     return graph.invoke(initial_state)
# # Example usage
# if __name__ == "__main__":
#     question = "What is the difference between JDK, JRE, and JVM? Explain with examples."
#     marks = 5
#     result = run_graph(question, marks)
#     print(result["messages"][-1].content)  

In [22]:

from vercel_blob import put
import vercelpy.blob_store as blob_store

class StateGraphExecutor(TypedDict):
    """State of the outer pipeline (load questions, answer, build PDF, send mail)."""
    # Pipeline messages; qus_loading appends the extracted question list here.
    messages: Annotated[list[AnyMessage], add_messages]
    # One answer message per question, appended by srart_graph.
    Ans:Annotated[list[AnyMessage], add_messages]
    # Reference PDF, read by Referal_PDF_to_Qdrant.
    # NOTE(review): declared as bytes, but the invocation below passes io.BytesIO.
    Referal:bytes
    # Question-paper PDF, read by qus_loading (same bytes / BytesIO caveat).
    QuePdf:bytes
    # Name of the Qdrant collection the reference PDF is uploaded to.
    collection_name:str
    # URL of the uploaded report, set by call_pdf_genrater and read by send_mail.
    FinalPdf: str



def srart_graph(state:StateGraphExecutor):
    """Pipeline node: answer every extracted question with the inner graph.

    The question list is the content of the last message (written by
    ``qus_loading``); each entry is a dict with ``"question"`` and ``"marks"``.
    One inner-graph run is made per question, in order.

    Returns:
        A state update whose ``Ans`` is the existing answers followed by one
        assistant message per question, in question order.
    """
    my_que = state["messages"][-1].content
    all_answers = []
    for item in my_que:
        # `State.marks` is an int. Questions without a marks tag get the
        # default of 5 that the answer prompt documents, instead of the
        # literal string "None".
        marks = item.get("marks")
        if marks is None:
            marks = 5

        initial_state = {
            "messages": [
                {"role": "user", "content": item.get("question")}
            ],
            "marks": marks
        }
        result = graph.invoke(initial_state)
        final_msg = result["messages"][-1].content
        all_answers.append({"role": "assistant", "content": final_msg})
    return {"Ans": state["Ans"] + all_answers}

# --------- PDF generation (sync) ---------

def call_pdf_genrater(state):
    """Pipeline node: render the questions and answers to a PDF and upload it.

    Builds a Markdown report (one section per answer, paired by index with the
    question list stored in the first message), converts it to HTML and then
    to PDF in memory, and uploads the bytes to the blob store.

    Returns:
        The incoming state plus ``"FinalPdf"``, the URL returned by the upload.

    Raises:
        RuntimeError: if the HTML-to-PDF conversion reports an error, so that
            a broken report is never uploaded or mailed.
    """
    answers = state["Ans"]
    questions = state["messages"][0].content

    # Create full markdown content (collected in a list, joined once)
    sections = ["# 📝 Question-Answer Report\n\n"]
    for i, item in enumerate(answers):
        q_text = questions[i].get("question")
        a_text = item.content if hasattr(item, 'content') else item
        sections.append(f"### Q{i+1}: {q_text}\n")
        sections.append(f"**A{i+1}:**\n{a_text}\n\n")
    markdown_content = "".join(sections)

    # Convert Markdown to HTML
    html = markdown.markdown(markdown_content)

    # Generate PDF from HTML, entirely in memory
    buffer = io.BytesIO()
    pisa_status = pisa.CreatePDF(io.StringIO(html), dest=buffer)
    if pisa_status.err:
        raise RuntimeError(f"PDF generation failed with {pisa_status.err} error(s)")
    pdf_bytes = buffer.getvalue()

    # Upload the PDF bytes directly; the fixed path is overwritten on each run.
    resp = blob_store.put(
        "PDF_Q&A/report_session.pdf",
        pdf_bytes,  # use the bytes in memory
        {
            "contentType": "application/pdf",
            "access": "public",
            "allowOverwrite": True
        }
    )

    return {**state, "FinalPdf": resp["url"]}




# --------- Async wrapper node for PDF ---------
# def call_pdf_genrater(state):
#     answers = state["Ans"]
#     que = state["messages"][0].content
#     input_que = []

#     for i,item in enumerate(answers):
#         print(f"Que: {que[i].get("question")}")
#         print(f"Ans: {item}")
       

def Referal_PDF_to_Qdrant(state:StateGraphExecutor):
    """Pipeline node: index the reference PDF into the configured Qdrant collection.

    Reads ``state["Referal"]`` and ``state["collection_name"]``, prints the
    collection name, then loads the PDF and uploads its pages. Returns None,
    i.e. the node makes no state update.
    """
    reference_pdf = state["Referal"]
    target_collection = state["collection_name"]
    print(target_collection)

    # Parse the PDF into page documents and push them to the collection.
    upload_to_qdrant(load_pdf(reference_pdf), target_collection)

def qus_loading(state:StateGraphExecutor):
    """Pipeline node: parse the question PDF and append the questions as a message.

    Returns:
        A state update whose ``messages`` is the existing history followed by
        one assistant message whose content is the list of extracted questions.
    """
    # Load the PDF file, then pull the numbered questions out of its pages.
    pages = load_pdf(state["QuePdf"])
    questions_message = {
        "role": "assistant",
        "content": extract_questions(pages),
    }
    return {"messages": state["messages"] + [questions_message]}

def send_mail(state:StateGraphExecutor):
    """Pipeline node: queue the e-mail task with the report URL as its only argument.

    Returns None, i.e. the node makes no state update.
    """
    send_email_task.apply_async(args=[state["FinalPdf"]])

# Outer pipeline: START -> qus_loading -> graph -> pdf -> send_mail -> END
main_builder = StateGraph(StateGraphExecutor)
# Create Node
main_builder.add_node("Referal_PDF_to_Qdrant", Referal_PDF_to_Qdrant)
main_builder.add_node("qus_loading", qus_loading)
main_builder.add_node("graph", srart_graph)
main_builder.add_node("pdf", call_pdf_genrater)
main_builder.add_node("send_mail", send_mail)
# add edges
# The indexing node is registered but currently not wired into the flow:
# main_builder.add_edge(START, "Referal_PDF_to_Qdrant")
main_builder.add_edge(START, "qus_loading")
main_builder.add_edge("qus_loading", "graph")
main_builder.add_edge("graph", "pdf")
main_builder.add_edge("pdf", "send_mail")
main_builder.add_edge("send_mail", END)

main_graph = main_builder.compile()

# `collection_name` is already a module-level name, so it is used directly.
# A module-level `global` declaration does nothing in a cell and becomes a
# SyntaxError once the cells are concatenated into a single script.
input_grapg = {
    "Referal": io.BytesIO(Referal),
    "QuePdf": io.BytesIO(QuePdf),
    "collection_name": collection_name,
    "Ans": []
}

main_graph.invoke(input_grapg)

OperationalError: Connection closed by server.