In [1]:
!pip install ragas

Collecting ragas
  Downloading ragas-0.2.15-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-community (from ragas)
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain_openai (from ragas)
  Downloading langchain_openai-0.3.16-py3-none-any.whl.metadata (2.3 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community->ragas)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community->ragas)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community->ragas)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasse

In [3]:
!pip install openai faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


create RAG system

In [6]:
import os

import faiss
import openai
import numpy as np
#from dotenv import load_dotenv

#load_dotenv()

In [20]:
from google.colab import userdata


In [8]:
docs = [
    "Paris is the capital and most populous city of France. The city is famed for the Eiffel Tower.",
    "Jane Austen was an English novelist best known for 'Pride and Prejudice' and 'Sense and Sensibility'.",
    "The Great Wall of China is a series of fortifications built to protect the ancient Chinese states.",
    "Mount Everest, part of the Himalayas, is Earth’s highest mountain above sea level.",
    "Mike loves the color pink more than any other color."
]

In [10]:
import os
import openai

# Get your API key from the userdata
api_key = userdata.get('OPENAI_API_KEY')
# and set the environment variable
os.environ['OPENAI_API_KEY'] = api_key

# Now initialize the client
client = openai.OpenAI()

In [11]:
def get_embedding(text):
    response = client.embeddings.create(model="text-embedding-3-small", input=text)
    return response.data[0].embedding

In [12]:
embeddings = np.array([get_embedding(d) for d in docs]).astype('float32')
index = faiss.IndexFlatIP(embeddings.shape[1])
faiss.normalize_L2(embeddings)
index.add(embeddings)

In [13]:
def retrieve(query, k):
    query_embedding = np.array([get_embedding(query)]).astype("float32")

    faiss.normalize_L2(query_embedding)
    _, idx = index.search(query_embedding, k)

    return [docs[i] for i in idx[0]]


def generate_answer(question, contexts):
    prompt = (
        "Answer the user question **only** with facts found in the context.\n\n"
        "Context:\n"
        + "\n".join(f"- {c}" for c in contexts)
        + f"\n\nQuestion: {question}\nAnswer:"
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )

    return response.choices[0].message.content.strip()

##EVALUATE RAG system with Ragas

In [14]:
from datasets import Dataset

questions = [
    "What is the capital of France?",
    "Who wrote Pride and Prejudice?",
    "Where is Mount Everest located?",
    "What is Mike's favorite color?"
]

ground_truths = [
    "Paris",
    "Jane Austen",
    "the Himalayas",
    "Pink"
]

rows = []

for question, ground_truth in zip(questions, ground_truths):
    context = retrieve(question, k=2)
    answer = generate_answer(question, context)
    rows.append(
        {
            "question": question,
            "contexts": context,
            "answer": answer,
            "reference": ground_truth,
        }
    )

evaluation_dataset = Dataset.from_list(rows)


In [15]:
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    faithfulness,
    context_precision,
    context_recall,
)

scores = evaluate(
    evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)

print(rows)
print(scores)

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

[{'question': 'What is the capital of France?', 'contexts': ['Paris is the capital and most populous city of France. The city is famed for the Eiffel Tower.', 'Mike loves the color pink more than any other color.'], 'answer': 'Paris is the capital of France.', 'reference': 'Paris'}, {'question': 'Who wrote Pride and Prejudice?', 'contexts': ["Jane Austen was an English novelist best known for 'Pride and Prejudice' and 'Sense and Sensibility'.", 'Mike loves the color pink more than any other color.'], 'answer': "Jane Austen wrote 'Pride and Prejudice'.", 'reference': 'Jane Austen'}, {'question': 'Where is Mount Everest located?', 'contexts': ['Mount Everest, part of the Himalayas, is Earth’s highest mountain above sea level.', 'Paris is the capital and most populous city of France. The city is famed for the Eiffel Tower.'], 'answer': 'Mount Everest is located in the Himalayas.', 'reference': 'the Himalayas'}, {'question': "What is Mike's favorite color?", 'contexts': ['Mike loves the co

High Score

In [16]:
rows = []

context = docs[-1:]
question = questions[-1]
answer = generate_answer(question, context)

rows.append(
    {
        "user_input": question,
        "retrieved_contexts": context,
        "response": answer,
        "reference": ground_truths[-1]
    }
)

evaluation_dataset = Dataset.from_list(rows)

scores = evaluate(
    evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)

print(rows)
print(scores)

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

[{'user_input': "What is Mike's favorite color?", 'retrieved_contexts': ['Mike loves the color pink more than any other color.'], 'response': "Mike's favorite color is pink.", 'reference': 'Pink'}]
{'answer_correctness': 0.9645, 'answer_relevancy': 1.0000, 'faithfulness': 1.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}


Wrong Context

In [17]:
rows = []

context = ['Vienna is the capital of Austria']
question = questions[-1]
answer = generate_answer(question, context)

rows.append(
    {
        "user_input": question,
        "retrieved_contexts": context,
        "response": answer,
        "reference": ground_truths[-1]
    }
)

evaluation_dataset = Dataset.from_list(rows)

scores = evaluate(
    evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)

print(rows)
print(scores)

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

[{'user_input': "What is Mike's favorite color?", 'retrieved_contexts': ['Vienna is the capital of Austria'], 'response': "The context does not provide information about Mike's favorite color.", 'reference': 'Pink'}]
{'answer_correctness': 0.1968, 'answer_relevancy': 0.0000, 'faithfulness': 1.0000, 'context_precision': 0.0000, 'context_recall': 0.0000}


Correct Answer with wrong context

In [18]:
rows = []

context = ['Vienna is the capital of Austria']
question = questions[-1]
answer = generate_answer(question, context)

rows.append(
    {
        "user_input": question,
        "retrieved_contexts": context,
        "response": "Mike's favorite color is pink!",
        "reference": ground_truths[-1]
    }
)

evaluation_dataset = Dataset.from_list(rows)

scores = evaluate(
    evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)

print(rows)
print(scores)

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

[{'user_input': "What is Mike's favorite color?", 'retrieved_contexts': ['Vienna is the capital of Austria'], 'response': "Mike's favorite color is pink!", 'reference': 'Pink'}]
{'answer_correctness': 0.9644, 'answer_relevancy': 1.0000, 'faithfulness': 0.0000, 'context_precision': 0.0000, 'context_recall': 0.0000}


Ollama integration

In [None]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

from langchain_ollama.chat_models import ChatOllama
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [None]:
llm = ChatOllama(model="qwen3:4b", temperature=0)
ragas_llm = LangchainLLMWrapper(llm)

emb = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5")
ragas_emb = LangchainEmbeddingsWrapper(emb)

scores = evaluate(
    evaluation_dataset,
    metrics=[answer_correctness, answer_relevancy, faithfulness,
             context_precision, context_recall],
    llm=ragas_llm,
    embeddings=ragas_emb,
)

In [None]:
print(scores)