In [1]:
from dotenv import load_dotenv
load_dotenv()

True

## Exporting Examples in langsmith

In [4]:
import pandas as pd

# QA

inputs = [
    "What problem does MemR3 aim to solve in LLM memory systems?",
    "What are the two core mechanisms of MemR3?",
    "How does MemR3 differ from the standard retrieve-then-answer pipeline?",
    "What is the role of the evidence–gap tracker in MemR3?",
    "What actions can the MemR3 router choose from?",
    "Why is MemR3 considered plug-and-play?",
    "On which benchmark was MemR3 evaluated?",
    "What metrics were used to evaluate MemR3?",
    "How much does MemR3 improve RAG and Zep on GPT-4.1-mini?",
    "Which question types benefit the most from MemR3?",
    "Why does Full-Context sometimes perform worse than MemR3?",
    "What backend memory systems were used with MemR3?",
]

outputs = [
    "MemR3 addresses the lack of closed-loop, explicit control in memory retrieval for LLM agents, reducing noisy, inefficient, and incomplete retrieval.",
    "MemR3 uses a router that selects retrieve, reflect, or answer actions, and a global evidence–gap tracker that tracks known evidence and missing information.",
    "Instead of a single retrieval pass, MemR3 uses a closed-loop process with iterative retrieval, reflection, and early stopping based on evidence completeness.",
    "The evidence–gap tracker explicitly records what has been established as evidence and what information is still missing to answer the query correctly.",
    "The router can choose among retrieve, reflect, and answer actions at each iteration.",
    "MemR3 acts as an external controller and can be integrated with any existing memory backend that returns text snippets, without changing storage architecture.",
    "MemR3 was evaluated on the LoCoMo benchmark for long-term conversational memory.",
    "Answer quality was measured using LLM-as-a-Judge scores, evaluated by GPT-4.1.",
    "With GPT-4.1-mini, MemR3 improves RAG by +7.29% and Zep by +1.94% overall.",
    "Temporal and multi-hop questions benefit the most from MemR3 due to explicit gap tracking and iterative retrieval.",
    "Full-Context often overloads the LLM with irrelevant or noisy memories, which hurts reasoning, especially for temporal and open-domain queries.",
    "MemR3 was tested with chunk-based RAG and graph-based Zep memory backends.",
]

data = [{'question':q,'output':o} for q,o in zip(inputs,outputs)]
df = pd.DataFrame(data)

df.to_csv('/home/ayush/Documents/AI/AyushDataScience/9_GenAI/llmops/end_to_end_rag_project/data/docs.csv')

In [5]:
from langsmith import Client

client = Client()
dataset_name = "AgenticAImemoryqa"

# Store
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Input and expected output pairs for AgenticAIResearch",
)
client.create_examples(
    inputs=[{"question": q} for q in inputs],
    outputs=[{"answer": a} for a in outputs],
    dataset_id=dataset.id,
)

{'example_ids': ['d330d1bb-d6aa-4f45-80a1-e555944e794f',
  '94088886-bf08-44f8-85a7-798e99580545',
  '8f0e89fd-88e9-413a-a298-1eeeaf26d401',
  '27400071-3ad2-4173-912e-a3ff1d8adcfc',
  '58d67563-fb1a-4a14-a322-85856a73a1ea',
  '219de6ce-10d4-4357-aa4b-32157e919549',
  '944bf6fc-cbd6-4ce2-a640-aa65240d439e',
  'a2eba90c-f210-4fed-86a8-50289d8e3edc',
  'bf44a6db-4b96-4537-b507-9f73185741d0',
  'a18e123b-7863-45b7-8ef0-487b58ce2942',
  'ce24e6d2-99e2-4762-b8f3-5f46cac00419',
  '81bcebff-dcfc-4a71-97ca-385fd8ff9804'],
 'count': 12}

# getting the rag


In [3]:
import sys
sys.path.append("/home/ayush/Documents/AI/AyushDataScience/9_GenAI/llmops/end_to_end_rag_project")

from pathlib import Path
from multi_doc_chat.src.DataIngestion import ChatIngestor
from multi_doc_chat.src.retreival import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Adapter for local file paths to work with ChatIngestor."""
    def __init__(self, file_path: str):
        self.path = Path(file_path)
        self.name = self.path.name
    
    def getbuffer(self) -> bytes:
        return self.path.read_bytes()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "/home/ayush/Documents/AI/AyushDataScience/9_GenAI/llmops/end_to_end_rag_project/data/doc.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer questions about the AI Engineering Report using RAG.
    
    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the AI agent memory Report text file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve
    
    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}
    """
    try:
        # Extract question from inputs
        question = inputs.get("question", "")
        if not question:
            return {"answer": "No question provided"}
        
        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}
        
        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)
        
        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base="faiss_index",
            use_session_dirs=True
        )
        
        # Build retriever
        ingestor.built_retriver(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )
        
        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"faiss_index/{session_id}"
        
        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )
        
        # Get answer
        answer = rag.invoke(question, chat_history=[])
        
        return {"answer": answer}
        
    except Exception as e:
        return {"answer": f"Error: {str(e)}"}

In [7]:
# testing the rag
test_input = {'question':'What are the two core mechanisms of MemR3?'}
result = answer_ai_report_question(test_input)
print("Question:", test_input["question"])
print("\nAnswer:", result["answer"])

{"timestamp": "2025-12-27T14:10:26.978215Z", "level": "info", "event": "Config Yaml loaded : ['embedding_model', 'retriever', 'llm']"}
{"session_id": "session_20251227_194026_99e5741d", "temp_dir": "data/session_20251227_194026_99e5741d", "faiss_dir": "faiss_index/session_20251227_194026_99e5741d", "sessionized": true, "timestamp": "2025-12-27T14:10:26.979899Z", "level": "info", "event": "ChatIngestor initialized"}
{"original_filename": "doc.txt", "saved_as": "data/session_20251227_194026_99e5741d/doc_f7f62f14.txt", "timestamp": "2025-12-27T14:10:26.982979Z", "level": "info", "event": "File saved successfully"}
{"count": 1, "timestamp": "2025-12-27T14:10:26.984315Z", "level": "info", "event": "Documents loaded"}
{"chunks": 82, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-12-27T14:10:26.988389Z", "level": "info", "event": "Documents split"}
HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1.1 200 OK"
HTTP Re

Question: What are the two core mechanisms of MemR3?

Answer: The two core mechanisms of MemR3 are the explicit maintenance and updating of evidence (E) and the gap (G), which summarize what the agent knows and still needs to know. Additionally, MemR3 uses a router to choose actions like retrieve, reflect, or answer, transforming the process into a closed-loop controller. This allows it to refine retrieval queries, integrate new evidence, and stop early when the information gap is resolved.


# Evaluating

## Builtin evaluator 

In [4]:
from langsmith import Client
from openevals.llm import create_llm_as_judge
from openevals.prompts import CONCISENESS_PROMPT  # better than conciseness for RAG

client = Client()

qa_evaluator = create_llm_as_judge(
    prompt=CONCISENESS_PROMPT,
    feedback_key="qa_correctness",
    model="gemini:model/gemini-2.5-flash-lite",
)

experiment_results = client.evaluate(
    answer_ai_report_question,   # ← target MUST be positional
    "AgenticAImemoryqa",          # ← data MUST be positional
    evaluators=[qa_evaluator],
    experiment_prefix="test-agenticAIReport-qa-rag",
    metadata={
        "variant": "RAG with FAISS + agent memory",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)


  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'test-agenticAIReport-qa-rag-7194569d' at:
https://smith.langchain.com/o/b6ae6d41-8930-4df1-adfd-5dea23b1b78b/datasets/c2205ae9-db43-4acb-b93f-8b2430046d72/compare?selectedSessions=96118258-6556-400e-898c-da432800fba6




0it [00:00, ?it/s]{"timestamp": "2025-12-27T15:46:31.072492Z", "level": "info", "event": "Config Yaml loaded : ['embedding_model', 'retriever', 'llm']"}
{"session_id": "session_20251227_211631_d380f1f8", "temp_dir": "data/session_20251227_211631_d380f1f8", "faiss_dir": "faiss_index/session_20251227_211631_d380f1f8", "sessionized": true, "timestamp": "2025-12-27T15:46:31.081161Z", "level": "info", "event": "ChatIngestor initialized"}
{"original_filename": "doc.txt", "saved_as": "data/session_20251227_211631_d380f1f8/doc_2d249a07.txt", "timestamp": "2025-12-27T15:46:31.084725Z", "level": "info", "event": "File saved successfully"}
{"count": 1, "timestamp": "2025-12-27T15:46:31.086990Z", "level": "info", "event": "Documents loaded"}
{"chunks": 82, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-12-27T15:46:31.100132Z", "level": "info", "event": "Documents split"}
HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:batchEmbedContents "HTTP/1

KeyboardInterrupt: 

## Custom Evaluator

In [None]:
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.
    
    Correctness means how well the actual model output matches the reference output 
    in terms of factual accuracy, coverage, and meaning.
    
    Args:
        run: The Run object containing the actual outputs
        example: The Example object containing the expected outputs
    
    Returns:
        dict with 'score' (1 for correct, 0 for incorrect) and 'reasoning'
    """
    # Extract actual and expected outputs
    actual_output = run.outputs.get("answer", "")
    expected_output = example.outputs.get("answer", "")
    input_question = example.inputs.get("question", "")
    
    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

        Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

        - If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
        - If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

        Do not penalize for stylistic or formatting differences unless they change meaning."""),
                ("human", """<example>
        <input>
        {input}
        </input>

        <output>
        Expected Output: {expected_output}

        Actual Output: {actual_output}
        </output>
        </example>

        Please grade the following agent run given the input, expected output, and actual output.
        Focus only on correctness (semantic and factual alignment).

        Respond with:
        1. A brief reasoning (1-2 sentences)
        2. A final verdict: either "CORRECT" or "INCORRECT"

        Format your response as:
        Reasoning: [your reasoning]
        Verdict: [CORRECT or INCORRECT]""")
    ])
    
    # Initialize LLM (using Gemini as shown in your config)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0
    )
    
    # Create chain and invoke
    chain = eval_prompt | llm
    
    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })
        
        response_text = response.content
        
        # Parse the response
        reasoning = ""
        verdict = ""
        
        for line in response_text.split('\n'):
            if line.startswith("Reasoning:"):
                reasoning = line.replace("Reasoning:", "").strip()
            elif line.startswith("Verdict:"):
                verdict = line.replace("Verdict:", "").strip()
        
        # Convert verdict to score (1 for correct, 0 for incorrect)
        score = 1 if "CORRECT" in verdict.upper() else 0
        
        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }
        
    except Exception as e:
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }


In [None]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Define evaluators - using custom correctness evaluator
evaluators = [correctness_evaluator]

dataset_name = "AgenticAIReportGoldens"

# Run evaluation
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "evaluator": "custom_correctness_llm_judge",
        "model": "gemini-2.5-pro",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

print("\nEvaluation completed! Check the LangSmith UI for detailed results.")


## Combo of both 

# Example: Combine custom correctness evaluator with LangChain's built-in evaluators
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Combine custom and built-in evaluators
combined_evaluators = [
    correctness_evaluator,  # Custom LLM-as-a-Judge
   qa_evaluator,  # built in evaluator
]

# Run evaluation with multiple evaluators
# Uncomment to run:
# experiment_results_combined = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,
#     evaluators=combined_evaluators,
#     experiment_prefix="agenticAIReport-multi-eval",
#     description="Evaluating RAG system with multiple evaluators",
#     metadata={
#         "variant": "RAG with FAISS",
#         "evaluators": "correctness + cot_qa",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )