In [None]:
# Sanity-check cell: prints a fixed greeting when executed.
print("hello world")

In [None]:
from dotenv import load_dotenv
import os

# Read variables from a .env file into the process environment (python-dotenv).
# NOTE(review): later cells presumably depend on these for API/database credentials;
# confirm which keys are required.
load_dotenv()

In [None]:
import os
from langchain_community.document_loaders import PyMuPDFLoader # <-- The key change is this import
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# --- PDF LOADING AND PROCESSING using PyMuPDF ---

# 1. Define the path to your folder of PDFs
pdf_folder_path = "./data_stored/"
print(f"Loading PDFs from: {pdf_folder_path}")

# 2. List all the PDF files in the folder.
#    Sorted because os.listdir returns entries in arbitrary order, which would
#    otherwise make the ingestion order differ between runs/machines.
pdf_files = sorted(f for f in os.listdir(pdf_folder_path) if f.endswith(".pdf"))

# Page window kept from every PDF: start_index <= page < end_index.
# The bounds are compared against each page's 'page' metadata value
# (presumably the 0-based page index set by PyMuPDFLoader -- TODO confirm).
start_index = 50
end_index = 52

# 3. Load the documents from the PDF files, keeping only the page window above
all_documents = []
for pdf_file in pdf_files:
    file_path = os.path.join(pdf_folder_path, pdf_file)
    print(f"  - Loading document: {pdf_file}")
    loader = PyMuPDFLoader(file_path)
    # The loader splits the PDF into pages, each page is a Document
    pages = loader.load()

    # Pages without a 'page' metadata entry are treated as page 0.
    cleaned_pages = [
        page for page in pages
        if start_index <= page.metadata.get('page', 0) < end_index
    ]

    # Ingest the filtered pages; previously the unfiltered `pages` list was
    # added here, which left the page window without any effect.
    all_documents.extend(cleaned_pages)

print(f"Loaded a total of {len(all_documents)} pages from {len(pdf_files)} PDF files.")


In [None]:

# 4. Split the loaded documents into smaller chunks.
# The splitter is built with from_tiktoken_encoder, so chunk_size and
# chunk_overlap are measured with a tiktoken tokenizer rather than in raw characters.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000,
    chunk_overlap=200
)
# doc_splits is the chunk list consumed by the vector-store and graph cells further down.
doc_splits = text_splitter.split_documents(all_documents)

print(f"Split the documents into {len(doc_splits)} chunks.")

In [None]:
# for r in doc_splits:
#     print(r)  # Print each chunk (the whole Document, not only its first 200 characters)
#     print("--------------------------------")

In [None]:
from dotenv import load_dotenv
import os

# Same .env load as the setup cell near the top of the notebook.
# NOTE(review): appears redundant on a top-to-bottom run -- confirm before removing.
load_dotenv()

In [None]:
### LLM
# Model identifiers referenced by later cells.

embed_model_name = "all-MiniLM-L6-v2"  # embedding model name passed to HuggingFaceEmbeddings
graph_llm_name = "gemini-2.5-flash"  # chat model name used to build graph_llm
repo_id = "meta-llama/Llama-3.1-8B-Instruct"  # answer llm

In [None]:
# embedding model

from langchain_huggingface import HuggingFaceEmbeddings

# Candidate embedding model names (the active one comes from embed_model_name):
# model_name="all-MiniLM-L6-v2"
# model_name="sentence-transformers/all-mpnet-base-v2"

# Embedding model handed to the Milvus vector store below.
embedding_model = HuggingFaceEmbeddings(model_name=embed_model_name)

In [None]:
from langchain_milvus import Milvus



# Add to Milvus
# Builds the collection "rag_milvus_test_3" from the chunks in doc_splits using
# embedding_model. The uri is a local file path (presumably Milvus Lite -- confirm).
vectorstore = Milvus.from_documents(
    documents=doc_splits,
    collection_name="rag_milvus_test_3",
    embedding=embedding_model,
    connection_args={"uri": "./milvus_ingest_test_3.db"},
)
# Retriever view of the store; no search arguments are passed, so library defaults apply.
retriever = vectorstore.as_retriever()

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model passed to LLMGraphTransformer in the GraphRAG cell; temperature is fixed at 0.
graph_llm = ChatGoogleGenerativeAI(model=graph_llm_name, temperature=0)



In [None]:
# # GraphRAG Setup
# from langchain_community.graphs import Neo4jGraph
# from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_core.documents import Document
# from langchain_experimental.llms.ollama_functions import OllamaFunctions
# from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
# from langchain_openai import ChatOpenAI
# from langchain_ollama import ChatOllama

# graph = Neo4jGraph()


# graph_transformer = LLMGraphTransformer(
#     llm=graph_llm,
# )
# print("1")

# graph_documents = graph_transformer.convert_to_graph_documents(doc_splits)
# print("2")
# graph.add_graph_documents(graph_documents)
# print("3")
# print(f"Graph documents: {len(graph_documents)}")
# print(f"Nodes from 1st graph doc:{graph_documents[0].nodes}")
# print(f"Relationships from 1st graph doc:{graph_documents[0].relationships}")

In [None]:
# # After converting to graph documents
# for i, doc in enumerate(graph_documents):
#     print(f"Document {i}:")
#     print(f"  Nodes: {doc.nodes}")
#     print(f"  Relationships: {doc.relationships}")
#     print("---")

In [None]:
# GraphRAG Setup
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama

import time

# Connect to Neo4j. No arguments are passed here, so the connection settings are
# not visible in this notebook (presumably read from NEO4J_* environment
# variables loaded from .env -- TODO confirm).
graph = Neo4jGraph()


# Transformer that asks graph_llm to extract nodes/relationships from text chunks.
graph_transformer = LLMGraphTransformer(
    llm=graph_llm,
)
print("1")  # progress marker: transformer created

graph_documents = []
for doc in doc_splits:
    # perf_counter is a monotonic clock meant for measuring elapsed time;
    # time.time() can jump if the system clock is adjusted.
    stime = time.perf_counter()
    # Process one document at a time so per-chunk timing can be reported
    graph_doc = graph_transformer.convert_to_graph_documents([doc])
    graph_documents.extend(graph_doc)
    print(f"end of chunk, time taken: {time.perf_counter() - stime}")

print("2")  # progress marker: all chunks converted
graph.add_graph_documents(graph_documents)
print("3")  # progress marker: graph documents written
print(f"Graph documents: {len(graph_documents)}")
# Guard the preview: indexing [0] would raise IndexError when doc_splits is
# empty or the transformer produced no graph documents.
if graph_documents:
    print(f"Nodes from 1st graph doc:{graph_documents[0].nodes}")
    print(f"Relationships from 1st graph doc:{graph_documents[0].relationships}")
else:
    print("No graph documents were produced; nothing to preview.")

In [None]:
# # After converting to graph documents
# for i, doc in enumerate(graph_documents):
#     print(f"Document {i}:")
#     print(f"  Nodes: {doc.nodes}")
#     print(f"  Relationships: {doc.relationships}")
#     print("---")

In [None]:


# Chat model used by the retrieval grader and the answer-generation chain below.
# NOTE(review): the model name is hard-coded here instead of coming from the config
# cell, where repo_id is labelled "answer llm" but is not referenced in the visible cells.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", temperature=0)



In [None]:
### Retrieval Grader

from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser


# Prompt for the relevance grader: takes a retrieved document and the user question
# and asks for a JSON object with a single 'score' key holding 'yes' or 'no'.
# NOTE(review): the template text contains typos ("premable", "explaination"); it is
# sent to the model as-is, so it is left untouched here.
prompt = PromptTemplate(
    template="""You are a grader assessing relevance 
    of a retrieved document to a user question. If the document contains keywords related to the user question, 
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. 
    
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
    Provide the binary score as a JSON with a single key 'score' and no premable or explaination.
     
    Here is the retrieved document: 
    {document}
    
    Here is the user question: 
    {question}
    """,
    input_variables=["question", "document"],
)

# prompt = PromptTemplate(
#     template="""You are a grader assessing relevance
# of a retrieved document to a user question. If the document contains keywords related to the user question,
# grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals.

# Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
# Provide the binary score as a JSON with a single key 'score' and no premable or explaination.

# **IMPORTANT:** Your response MUST be only the JSON object itself, without any surrounding text or markdown.

# Here is the retrieved document:
# {document}

# Here is the user question:
# {question}
# """,
#     input_variables=["question", "document"],
# )


# Grader chain: fill the prompt, call the LLM, parse the reply as JSON.
retrieval_grader = prompt | llm | JsonOutputParser()
question = "What is a graph?"
docs = retriever.invoke(question)
print("num of docs", len(docs))
if docs:
    # The sample graded here is the second hit (index 1); fall back to the only
    # hit when the retriever returns a single document instead of raising IndexError.
    doc_txt = docs[min(1, len(docs) - 1)].page_content
    print(doc_txt)
    print(
        f'Is our answer relevant to the question asked: {retrieval_grader.invoke({"question": question, "document": doc_txt})}'
    )
else:
    print("No documents retrieved; skipping the relevance check.")

In [None]:
### Generate

from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Prompt for answer generation. It rebinds the name `prompt`; the retrieval_grader
# chain built earlier keeps its own prompt object and is unaffected.
# input_variables must name the placeholders used in the template
# ({question} and {context}); it previously listed "document", which the
# template does not contain.
prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    Also only answer the question based on the context provided. If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise:
    Question: {question} 
    Context: {context} 
    Answer: 
    """,
    input_variables=["question", "context"],
)



def format_docs(docs):
    """Return the page_content of every item in docs, joined by a blank line.

    Args:
        docs: iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string; empty when docs is empty.
    """
    page_texts = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(page_texts)


# Answer chain: fill the generation prompt, call the LLM, return plain text.
rag_chain = prompt | llm | StrOutputParser()

question = "What are graphrag?"
docs = retriever.invoke(question)
# Pass the chunk text only. Handing the raw Document list to the prompt would
# interpolate its repr (metadata included) into the context instead of the
# readable text that format_docs produces.
generation = rag_chain.invoke({"context": format_docs(docs), "question": question})
print(generation)