In [43]:
import pandas as pd
from openai import AsyncOpenAI
from ragas import experiment,EvaluationDataset
from datasets import Dataset
from ragas.llms import llm_factory
from ragas.metrics import DiscreteMetric
import os



In [45]:
samples = [
    {
        "user_input": "What is Ragas 0.3?", 
        "reference": "Ragas 0.3 is a library for evaluating LLM applications.",
        "response": "Ragas 0.3 is a tool used to evaluate LLM apps." # Should Pass
    },
    {
        "user_input": "How to install Ragas?", 
        "reference": "install with pip using ragas[examples]",
        "response": "You can install it using npm install ." # Should Fail (wrong package manager)
    },
    {
        "user_input": "What are the main features of Ragas?", 
        "reference": "organised around - experiments - datasets - metrics.",
        "response": "It features experiments, datasets, and evaluation metrics." # Should Pass
    }
]
df = pd.DataFrame(samples)
dataset = EvaluationDataset.from_pandas(df)


In [46]:
client = AsyncOpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama" 
)

llm = llm_factory("llama3.2:3b", provider="openai", client=client)

In [47]:
my_metric_1 = DiscreteMetric(
    name="correctness",
    prompt="""
    Check if the response contains the key points mentioned in the grading notes.
    
    Grading Notes: {grading_notes}
    Response: {response}
    
    You must output your response in JSON format with the following keys:
    1. "reason": A brief explanation of why the response passes or fails.
    2. "value": Either "pass" or "fail".
    
    JSON Output:
    """,
    allowed_values=["pass", "fail"],
)
my_metric_2 = DiscreteMetric(
    name="correctness",
    prompt="""
    Check if the response accurately reflects the information in the grading notes.
    
    Grading Notes: {grading_notes}
    Response: {response}
    
    Instructions:
    - Focus on the SEMANTIC meaning. 
    - Synonyms (e.g., 'dashboard' instead of 'panel') should be accepted as 'pass'.
    - Output JSON with "reason" and "value" ('pass' or 'fail').
    """,
    allowed_values=["pass", "fail"],
)
my_metric_3 = DiscreteMetric(
    name="correctness",
    prompt="""
    Role: You are a strict semantic evaluator.
    Task: Compare the 'Response' against the 'Grading Notes'.
    
    Grading Logic:
    - PASS if the meaning is the same, even if different words are used (e.g., synonyms).
    - FAIL if there is a technical contradiction, incorrect command, or missing core fact.

    Example A (Semantic Match):
    Grading Notes: "The software is compatible with macOS."
    Response: "It runs on Apple computers."
    Result: {{"reason": "Apple computers run macOS, so the meaning is preserved.", "value": "pass"}}

    Example B (Technical Error):
    Grading Notes: "Use the 'git push' command to upload code."
    Response: "Run 'git pull' to send your changes."
    Result: {{"reason": "The response suggests 'pull' (download) instead of 'push' (upload).", "value": "fail"}}

    Current Task:
    Grading Notes: {grading_notes}
    Response: {response}

    Return the result in valid JSON format.
    """,
    allowed_values=["pass", "fail"],
)


In [48]:
# jupyter.debugJustMyCode: make it unselected for enabling going to base files for debug
# It is in "ctrl + , " , "jupyter.debugJustMyCode" 
from  ragas.backends import LocalCSVBackend # to give ragas a way to save the results
@experiment(
    name_prefix="ragas_discrete_metric_prompt_3_test",
    backend= LocalCSVBackend(root_dir=".")
)
async def run_rag_evaluation(row):
    # access the specific 'response' and 'reference' defined above
    # it will directly gives our prompt in my_metric to llm with grading notes and response 
    score = await my_metric_3.ascore(
        llm=llm,
        response=row.response,
        grading_notes=row.reference
    )

    return {
        "user_input": row.user_input,
        "response": row.response,
        "score": score.value,
        "reason": score.reason,
        "reference": row.reference
    }
results = await run_rag_evaluation.arun(dataset=dataset)
for result in results:
    print(result,"\n")

Running experiment: 100%|██████████| 3/3 [00:13<00:00,  4.38s/it]

{'user_input': 'What is Ragas 0.3?', 'response': 'Ragas 0.3 is a tool used to evaluate LLM apps.', 'score': 'pass', 'reason': 'Although both responses mention Ragas 0.3, they use different terminology (library vs. tool), which may lead to confusion. However, the core meaning of evaluating LLM applications remains preserved.', 'reference': 'Ragas 0.3 is a library for evaluating LLM applications.'} 

{'user_input': 'How to install Ragas?', 'response': 'You can install it using npm install .', 'score': 'pass', 'reason': "npm install is not a standard way to install Python packages, but 'install' and 'pip' are synonyms. The meaning of the response is preserved.", 'reference': 'install with pip using ragas[examples]'} 

{'user_input': 'What are the main features of Ragas?', 'response': 'It features experiments, datasets, and evaluation metrics.', 'score': 'fail', 'reason': "The response does not mention 'evaluation' as a core metric, which is an essential part of grading notes. However, it 




In [None]:

# Ragas core imports
from ragas import experiment, EvaluationDataset
from ragas.llms import llm_factory
from ragas.embeddings import LlamaIndexEmbeddingsWrapper # Or Langchain wrapper
from ragas.metrics import AnswerCorrectness, SemanticSimilarity

# Langchain HuggingFace (The actual implementation)
from langchain_huggingface import HuggingFaceEmbeddings


# 1. Initialize LLM (Ollama - Llama 3.2)
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
llm = llm_factory("llama3.2:3b", provider="openai", client=client)

# 2. Initialize Embeddings correctly
# This uses the sentence-transformers library under the hood
lc_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai
  from ragas.metrics import AnswerCorrectness, SemanticSimilarity
  from ragas.metrics import AnswerCorrectness, SemanticSimilarity


In [2]:
# 3. Define Numeric Metrics
# Pass the langchain embeddings directly or wrap them if needed
correctness_metric = AnswerCorrectness(llm=llm, embeddings=lc_embeddings)
similarity_metric = SemanticSimilarity(embeddings=lc_embeddings)

# 4. Create your test dataset
samples = [
    {
        "user_input": "What is Ragas 0.3?", 
        "reference": "Ragas 0.3 is a library for evaluating LLM applications.",
        "response": "Ragas 0.3 is a tool used to evaluate LLM apps."
    },
    {
        "user_input": "How to install Ragas?", 
        "reference": "install from source - install from pip using ragas[examples]",
        "response": "You can install it using npm install ragas."
    }
]
dataset = EvaluationDataset.from_pandas(pd.DataFrame(samples))



In [11]:
# 1. Corrected Imports
import pandas as pd
import asyncio
import nest_asyncio
from openai import AsyncOpenAI
from ragas import experiment, EvaluationDataset
from ragas.llms import llm_factory
from ragas.embeddings import LangchainEmbeddingsWrapper
# Note: SemanticSimilarity and AnswerCorrectness are in collections, 
# but AnswerSimilarity might not be exported there in your version.
from ragas.metrics.collections import AnswerCorrectness, SemanticSimilarity
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.backends import LocalCSVBackend
from ragas.dataset_schema import SingleTurnSample

nest_asyncio.apply()

# 2. Initialize Embeddings with Wrapper
lc_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
ragas_embeddings = LangchainEmbeddingsWrapper(lc_embeddings)

# 3. Define Metrics
# Pass the wrapped embeddings directly to AnswerCorrectness. 
# It will internally handle the similarity calculation.
correctness_metric = AnswerCorrectness(
    llm=llm, 
    embeddings=ragas_embeddings
)
similarity_metric = SemanticSimilarity(embeddings=ragas_embeddings)

# 5. Define the Experiment
@experiment()
async def run_numeric_evaluation(row):
    sample = SingleTurnSample(
        user_input=row.user_input,
        response=row.response,
        reference=row.reference
    )
    
    # Initialize metrics for the custom experiment loop
    await correctness_metric.init()
    await similarity_metric.init()
    
    score_correctness = await correctness_metric.single_turn_ascore(sample)
    score_similarity = await similarity_metric.single_turn_ascore(sample)

    return {
        "user_input": row.user_input,
        "response": row.response,
        "correctness_score": score_correctness,
        "similarity_score": score_similarity,
        "reference": row.reference
    }

# 6. Run the evaluation
async def main():
    results = await run_numeric_evaluation.arun(
        dataset=dataset, 
        name="numeric_test_run",
        backend=LocalCSVBackend(root_dir=".")
    )
    
    print("\n--- Evaluation Results ---")
    df_results = results.to_pandas()
    print(df_results.head())

await main()

  ragas_embeddings = LangchainEmbeddingsWrapper(lc_embeddings)


ValueError: Collections metrics only support modern embeddings. Found: LangchainEmbeddingsWrapper. Use: embedding_factory('openai', model='text-embedding-ada-002', client=openai_client, interface='modern')