In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import pandas as pd

# QA
inputs = [
    "For customer-facing applications, which company's models dominate the top rankings?",
    "What percentage of respondents are using RAG in some form?",
    "How often are most respondents updating their models?",
]
#expected outputs
outputs = [
    "OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.",
    "70% of respondents are using RAG in some form.",
    "More than 50% update their models at least monthly, with 17% doing so weekly.",
]

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv
csv_path = "C:/Azhar/FullStackRAG/data/goldens.csv"
df.to_csv(csv_path, index=False)

In [4]:
from langsmith import Client

client = Client()
dataset_name = "AgenticAIReportGoldens"

# Store
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Input and expected output pairs for AgenticAIReport",
)
client.create_examples(
    inputs=[{"question": q} for q in inputs],
    outputs=[{"answer": a} for a in outputs],
    dataset_id=dataset.id,
)

{'example_ids': ['ba388c80-9466-49f1-b138-9f67063600ee',
  'f73d82fe-b975-4b7d-80ac-5af86ecc2926',
  '7ee75d2e-b5e5-43cd-a1b6-f7855960bfd5'],
 'count': 3}

In [5]:
import sys
sys.path.append("C:/Azhar/FullStackRAG")

from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Adapter for local file paths to work with ChatIngestor."""
    def __init__(self, file_path: str):
        self.path = Path(file_path)
        self.name = self.path.name
    
    def getbuffer(self) -> bytes:
        return self.path.read_bytes()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "C:/Azhar/FullStackRAG/data/the 2025 ai engineering report.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer questions about the AI Engineering Report using RAG.
    
    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the AI Engineering Report text file
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve
    
    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}
    """
    try:
        # Extract question from inputs
        question = inputs.get("question", "")
        if not question:
            return {"answer": "No question provided"}
        
        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}
        
        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)
        
        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base="faiss_index",
            use_session_dirs=True
        )
        
        # Build retriever
        ingestor.built_retriver(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )
        
        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"faiss_index/{session_id}"
        
        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )
        
        # Get answer
        answer = rag.invoke(question, chat_history=[])
        
        return {"answer": answer}
        
    except Exception as e:
        return {"answer": f"Error: {str(e)}"}

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Test the function with a sample question
test_input = {"question": "For customer-facing applications, which company's models dominate the top rankings?"}
result = answer_ai_report_question(test_input)
print("Question:", test_input["question"])
print("\nAnswer:", result["answer"])

{"timestamp": "2025-10-22T18:04:45.585899Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-22T18:04:45.587898Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-22T18:04:45.590899Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_06...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-22T18:04:45.592970Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-22T18:04:45.599513Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251022_233445_520630bf", "temp_dir": "data\\session_20251022_233445_520630bf", "faiss_dir": "faiss_index\\session_20251022_233445_520630bf", "sessionized": true, "timestamp": "2025-10-22T18:04:45.610131Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "the 2025 ai engineering report.txt", "saved_

Question: For customer-facing applications, which company's models dominate the top rankings?

Answer: For customer-facing applications, OpenAI models dominate the field. 3 out of the top 5 and half of the top 10 most popular models are from OpenAI.


In [7]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

In [8]:
# Example: Test with all golden questions
print("Testing all questions from the dataset:\n")
for i, q in enumerate(inputs, 1):
    test_input = {"question": q}
    result = answer_ai_report_question(test_input)
    print(f"Q{i}: {q}")
    print(f"A{i}: {result['answer']}\n")
    print("-" * 80 + "\n")

Testing all questions from the dataset:



{"timestamp": "2025-10-22T18:07:10.510016Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-22T18:07:10.512029Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-22T18:07:10.514050Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_06...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-22T18:07:10.516051Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-22T18:07:10.522871Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251022_233710_9122f8bc", "temp_dir": "data\\session_20251022_233710_9122f8bc", "faiss_dir": "faiss_index\\session_20251022_233710_9122f8bc", "sessionized": true, "timestamp": "2025-10-22T18:07:10.527855Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "the 2025 ai engineering report.txt", "saved_

Q1: For customer-facing applications, which company's models dominate the top rankings?
A1: For customer-facing applications, OpenAI models dominate the field. 3 out of the top 5 and half of the top 10 most popular models are from OpenAI.

--------------------------------------------------------------------------------



{"timestamp": "2025-10-22T18:07:23.356033Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-22T18:07:23.357969Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-22T18:07:23.360488Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_06...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-22T18:07:23.363874Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-22T18:07:23.373283Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251022_233723_fa5d603e", "temp_dir": "data\\session_20251022_233723_fa5d603e", "faiss_dir": "faiss_index\\session_20251022_233723_fa5d603e", "sessionized": true, "timestamp": "2025-10-22T18:07:23.380672Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "the 2025 ai engineering report.txt", "saved_

Q2: What percentage of respondents are using RAG in some form?
A2: 70% of respondents are using RAG in some form.

--------------------------------------------------------------------------------



{"timestamp": "2025-10-22T18:07:29.260701Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-22T18:07:29.268932Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-22T18:07:29.269446Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_06...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-22T18:07:29.269446Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-22T18:07:29.269446Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251022_233729_f5acca0e", "temp_dir": "data\\session_20251022_233729_f5acca0e", "faiss_dir": "faiss_index\\session_20251022_233729_f5acca0e", "sessionized": true, "timestamp": "2025-10-22T18:07:29.269446Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "the 2025 ai engineering report.txt", "saved_

Q3: How often are most respondents updating their models?
A3: More than 50% of respondents are updating their models at least monthly, with 17% doing so weekly.

--------------------------------------------------------------------------------



### Custom Correctness Evaluator
Creating an LLM-as-a-Judge evaluator to assess semantic and factual alignment

In [10]:
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.
    
    Correctness means how well the actual model output matches the reference output 
    in terms of factual accuracy, coverage, and meaning.
    
    Args:
        run: The Run object containing the actual outputs
        example: The Example object containing the expected outputs
    
    Returns:
        dict with 'score' (1 for correct, 0 for incorrect) and 'reasoning'
    """
    # Extract actual and expected outputs
    actual_output = run.outputs.get("answer", "")
    expected_output = example.outputs.get("answer", "")
    input_question = example.inputs.get("question", "")
    
    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),
        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])
    
    # Initialize LLM (using Gemini as shown in your config)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0
    )
    
    # Create chain and invoke
    chain = eval_prompt | llm
    
    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })
        
        response_text = response.content
        
        # Parse the response
        reasoning = ""
        verdict = ""
        
        for line in response_text.split('\n'):
            if line.startswith("Reasoning:"):
                reasoning = line.replace("Reasoning:", "").strip()
            elif line.startswith("Verdict:"):
                verdict = line.replace("Verdict:", "").strip()
        
        # Convert verdict to score (1 for correct, 0 for incorrect)
        score = 1 if "CORRECT" in verdict.upper() else 0
        
        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }
        
    except Exception as e:
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }

### Run Evaluation with Custom Correctness Evaluator

In [11]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Define evaluators - using custom correctness evaluator
evaluators = [correctness_evaluator]

dataset_name = "AgenticAIReportGoldens"

# Run evaluation
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "evaluator": "custom_correctness_llm_judge",
        "model": "gemini-2.5-pro",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

print("\nEvaluation completed! Check the LangSmith UI for detailed results.")

View the evaluation results for experiment: 'agenticAIReport-correctness-eval-f70616fe' at:
https://smith.langchain.com/o/480755c5-3c85-4a32-a13e-d152c5945c38/datasets/876f9558-de26-4c25-bb75-209b26f4a8e9/compare?selectedSessions=9df8cb5f-b8bf-4fd9-92c0-1243f0230ed8




0it [00:00, ?it/s]{"timestamp": "2025-10-22T18:20:14.865835Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-22T18:20:14.866861Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-22T18:20:14.867861Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_06...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-22T18:20:14.868863Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-22T18:20:14.873170Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251022_235014_25af6c96", "temp_dir": "data\\session_20251022_235014_25af6c96", "faiss_dir": "faiss_index\\session_20251022_235014_25af6c96", "sessionized": true, "timestamp": "2025-10-22T18:20:14.875184Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "the 2025 ai engineering re


Evaluation completed! Check the LangSmith UI for detailed results.
