# Ragas evaluation of Conversations with Literal Doc Chat

In [34]:
%%capture

# Install the local Python client.
!pip install .

## Setup connection to Literal

Specify whether to connect to the production environment or locally.

In [1]:
import os

from literalai import LiteralClient

# Connection settings for each Literal environment this notebook can target.
#
# The production key is deliberately not hardcoded: it is read from the
# LITERAL_PROD_API_KEY environment variable (a name local to this notebook)
# and falls back to an empty string when that variable is unset.
# A dedicated variable is used rather than LITERAL_API_KEY because
# `set_key_url` overwrites LITERAL_API_KEY, so re-running this cell would
# otherwise pick up whichever key was selected last.
env_configs = {
    "prod": {
        "key": os.environ.get("LITERAL_PROD_API_KEY", ""),
        "url": "https://cloud.getliteral.ai"
    },
    "local": {
        "key": "my-initial-api-key",
        "url": "http://localhost:3000"
    }
}

def set_key_url(key: str, url: str):
    """Export the Literal API key and URL as process environment variables.

    Args:
        key: Value stored in ``LITERAL_API_KEY``.
        url: Value stored in ``LITERAL_API_URL``.
    """
    os.environ.update(LITERAL_API_KEY=key, LITERAL_API_URL=url)

# Select the environment to talk to; use env_configs["prod"] to target the
# cloud deployment instead of the local instance.
active_config = env_configs["local"]
set_key_url(key=active_config["key"], url=active_config["url"])

# The client is built without explicit arguments, after the key/URL have been
# exported above — presumably it reads them from the environment (confirm
# against the literalai SDK).
literal_client = LiteralClient()

## Create a RAG prompt template

In [3]:
# Chat prompt template: a system message setting the assistant's behaviour,
# followed by a user message with placeholders for the retrieved context
# entries (`{{#context}}...{{/context}}` section) and the `{{question}}`.
template_messages = [
    dict(
        role="system",
        content=(
            "You are a helpful assistant that always answers questions. "
            "Keep it short, and if available prefer responding with code."
        ),
    ),
    dict(
        role="user",
        content=(
            "Answer the question based on the context below.\n"
            "Context:\n"
            "{{#context}}\n"
            "{{.}}\n"
            "{{/context}}\n"
            "Question:\n"
            "{{question}}\n"
            "Answer:"
        ),
    ),
]

# Must match the prompt name used by the RAG application itself.
PROMPT_NAME = "RAG prompt"

# Register the template with Literal under that name; the returned prompt's
# id is used later when creating experiments.
prompt = literal_client.api.create_prompt(
    name=PROMPT_NAME,
    template_messages=template_messages,
)

## Launch RAG Literal Doc application

Ask a few questions, like: 
- What is a dataset in Literal?
- How can I add a Step to an existing Dataset in Python?
- How would I go about removing an item from a dataset in TS?

Create a new chat to trigger new threads.

Then visualize the threads and steps from the UI.

## Create an empty Dataset

In [7]:
# Create the dataset that the "Query" steps will be added to.
DATASET_NAME = "Literal documentation RAG"
dataset = literal_client.api.create_dataset(name=DATASET_NAME)

## Add "Query" Steps to Dataset

In [8]:
# Fetch a single thread (`first=1`) from Literal.
threads = literal_client.api.get_threads(first=1).data

# Flatten every step named "Query" across the fetched threads, keeping
# thread order and, within a thread, step order.
query_steps = [
    thread_step
    for thread in threads
    for thread_step in thread.steps
    if thread_step.name == "Query"
]

# Add each "Query" step to the dataset by id.
for query_step in query_steps:
    dataset.add_step(query_step.id)

## Prepare Ragas data samples

In [10]:
import json

from literalai import DatasetItem
from typing import List

# For each query in our dataset, build contexts as Ragas expects them:
# one list of context strings per dataset item, in the same order as `items`.
contexts: List[List[str]] = []

items = [DatasetItem.from_dict(item_dict) for item_dict in dataset.items]
for item in items:
    # Find the first "Retrieve" step in the intermediary steps.
    # A default of None (instead of a bare `next`) lets us fail with a message
    # that names the offending item rather than an opaque StopIteration;
    # `or []` covers items that carry no intermediary steps at all.
    retrieve_step = next(
        (step for step in (item.intermediary_steps or []) if step['name'] == "Retrieve"),
        None,
    )
    if retrieve_step is None:
        raise ValueError(
            f'Dataset item {item.id} has no "Retrieve" intermediary step; '
            "cannot build its Ragas contexts."
        )

    # The step's expected output content is a JSON document; the first entry
    # of its "metadatas" list holds the matches, each carrying a "text" field.
    matches = json.loads(retrieve_step['expectedOutput']['content'])["metadatas"][0]
    contexts.append([match["text"] for match in matches])

# Data samples as expected by Ragas.
# The question is the first positional argument recorded in the item's input.
# No need for ground truths for: faithfulness (claims ratio), answer relevance
# (potential questions), context relevancy, aspect critique — hence the empty
# strings.
data_samples = {
    'question': [json.loads(item.input["content"])["args"][0] for item in items],
    'answer': [item.expected_output["content"] for item in items],
    'contexts': contexts,
    'ground_truth': [""] * len(items)
}

## Evaluate with Ragas

We will evaluate context relevancy, which checks how relevant the retrieved contexts are for answering the user's question.

The more unneeded details the contexts contain, the lower the score (which ranges between 0 and 1).

In [23]:
from datasets import Dataset

from ragas.metrics import context_relevancy
from ragas import evaluate

# Metrics to compute; the same list is reused when logging scores.
metrics = [context_relevancy]

# Wrap the samples in a `Dataset`, run the Ragas evaluation, and convert the
# result to a pandas DataFrame.
ragas_dataset = Dataset.from_dict(data_samples)
evaluation = evaluate(ragas_dataset, metrics=metrics)
results = evaluation.to_pandas()

Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.12it/s]


In [24]:
# Preview the first rows of the evaluation results.
results.head(n=5)

Unnamed: 0,question,answer,contexts,ground_truth,context_relevancy
0,How can I add a Step to an existing Dataset in...,To add a Step to an existing Dataset in Python...,[title:Create and Populate a Dataset_descripti...,,0.001984
1,What is a dataset in Literal?,A dataset in Literal is a collection of items ...,[title:Overview_description:None_content: Lit...,,0.064103
2,How would I go about removing an item from a d...,To remove an item from a dataset in TypeScript...,[title:Create and Populate a Dataset_descripti...,,0.0


## Create Literal experiment and log results

In [25]:
from literalai import ScoreDict

# Create an experiment on the dataset, linked to the prompt created earlier
# in this notebook.
experiment = dataset.create_experiment(
    name="Ragas - Context relevancy In Prod (#chunks = 5)",
    prompt_id=prompt.id,
    assertions={"type": "context relevancy"},
)

# Log one experiment item per evaluated row. The row label from `results` is
# used to index `items`, so both are expected to share the same order.
for row_label, result_row in results.iterrows():
    metric_scores = []
    for metric in metrics:
        metric_scores.append(
            {
                "name": metric.name,
                "type": "AI",
                "value": result_row[metric.name],
            }
        )

    experiment.log(
        {
            "datasetItemId": items[row_label].id,
            "scores": metric_scores,
            "input": {"content": result_row["question"]},
            "output": {"content": result_row["contexts"]},
        }
    )


## Change number of retrieved contexts and evaluate context relevancy

### Call to Chroma DB to get the necessary contexts

Make parameter changes to your vector database and retrieve new contexts for the questions.

Here we'll simply get the top 2 contexts instead of top 5.

In [26]:
# Same questions and answers as before, but keep only the first two retrieved
# contexts for each question.
top_two_contexts = [item_contexts[:2] for item_contexts in contexts]

data_samples = {
    'question': [json.loads(item.input["content"])["args"][0] for item in items],
    'answer': [item.expected_output["content"] for item in items],
    'contexts': top_two_contexts,
    'ground_truth': [""] * len(items),
}

# Re-run the evaluation with the same metrics on the reduced contexts.
reduced_dataset = Dataset.from_dict(data_samples)
results = evaluate(reduced_dataset, metrics=metrics).to_pandas()

Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.34s/it]


### Log experiment results

In [27]:
# Create a second experiment on the dataset for the two-context run, linked
# to the same prompt.
experiment = dataset.create_experiment(
    name="Ragas - Context relevancy (#chunks = 2)",
    prompt_id=prompt.id,
    assertions={"type": "context relevancy"},
)

# Log one experiment item per evaluated row. The row label from `results` is
# used to index `items`, so both are expected to share the same order.
for row_label, result_row in results.iterrows():
    metric_scores = [
        {"name": metric.name, "type": "AI", "value": result_row[metric.name]}
        for metric in metrics
    ]

    logged_item = {
        "datasetItemId": items[row_label].id,
        "scores": metric_scores,
        "input": {"content": result_row["question"]},
        "output": {"content": result_row["contexts"]},
    }
    experiment.log(logged_item)