# Ragas evaluation of Conversations with Literal Doc Chat

In [1]:
%%capture

# Install the local Python client, for dev purposes only. Ultimately remove from notebook.
!pip install ../../


In [1]:
data_samples = {'question': ["What's my name?"],
 'answer': ['Your name is John.'],
 'contexts': [['My name is John.', 'My job is coding.']],
 'ground_truth': ['']}

# Do our Dataset integrate easily with HF's Dataset?
from datasets import Dataset

from ragas.metrics import context_relevancy
from ragas import evaluate

metrics = [context_relevancy]
ds = Dataset.from_dict(data_samples)

results = evaluate(ds, metrics=metrics)

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

## Run a RAG application

In [1]:
import os
import chromadb

from openai import OpenAI
from dotenv import load_dotenv
from literalai import LiteralClient

load_dotenv()

openai_client = OpenAI()

literal_client = LiteralClient()

# literal_client.instrument_openai()

chroma_client = chromadb.PersistentClient()


collection = chroma_client.get_or_create_collection("Biography")
collection.add(
    documents=["My name is John.", "My job is coding."],
    ids=["id1", "id2"]
)

template_messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that always answers questions. Keep it short, and if available prefer responding with code."
    },
    {
        "role": "user",
        "content": "Answer the question based on the context below.\nContext:\n{{#context}}\n{{.}}\n{{/context}}\nQuestion:\n{{question}}\nAnswer:"
    }
]

PROMPT_NAME = "RAG prompt"

prompt = literal_client.api.create_prompt(name=PROMPT_NAME, template_messages=template_messages)

@literal_client.step(type="run", name="Retrieval Augmented Generation")
def rag(user_query: str):

    with literal_client.step(type="run", name="Retrieve") as step:
        step.input = { "content": user_query }
        res = collection.query(query_texts=[user_query], n_results=2)
        step.output = { "content": res }

    with literal_client.step(type="run", name="LLM") as step:
        step.input = { "content": res['documents'][0] }
        messages = prompt.format({"context": res['documents'][0], "question": user_query})
        
        completion = openai_client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=messages,
                )
        step.output = { "content": completion.choices[0].message.content }

    return completion.choices[0].message.content
        
def main():
    question = "What's my name?"
    with literal_client.thread(name="Example") as thread:
        literal_client.message(content=question, type="user_message", name="User")
        answer = rag(question)
        literal_client.message(content=answer, type="assistant_message", name="My Assistant")

main()
# Network requests by the SDK are performed asynchronously.
# Invoke flush_and_stop() to guarantee the completion of all requests prior to the process termination.
# WARNING: If you run a continuous server, you should not use this method.
literal_client.flush()


Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id1
Add of existin

In [4]:
import os
import chromadb

from openai import OpenAI
from dotenv import load_dotenv
from literalai import LiteralClient

load_dotenv()

openai_client = OpenAI()

literal_client = LiteralClient()

literal_client.instrument_openai()


def main():
    question = "What's my name?"
    with literal_client.thread(name="Example") as thread:
        literal_client.message(content="coudou", type="assistant_message", name="My Assistant")

main()
# Network requests by the SDK are performed asynchronously.
# Invoke flush_and_stop() to guarantee the completion of all requests prior to the process termination.
# WARNING: If you run a continuous server, you should not use this method.
# literal_client.flush_and_stop()


In [6]:
data_samples = {'question': ["What's my name?"],
 'answer': ['Your name is John.'],
 'contexts': [['My name is John.', 'My job is coding.']],
 'ground_truth': ['']}

# Do our Dataset integrate easily with HF's Dataset?
from datasets import Dataset

from ragas.metrics import context_relevancy, faithfulness
from ragas import evaluate

metrics = [context_relevancy]
ds = Dataset.from_dict(data_samples)

results = evaluate(ds, metrics=metrics, is_async=False)

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Create an empty Dataset

In [8]:
DATASET_NAME = f"Literal documentation RAG 523"

dataset = literal_client.api.get_dataset(name=DATASET_NAME)
if not dataset:
    dataset = literal_client.api.create_dataset(name=DATASET_NAME)

## Add "Query" Steps to Dataset

In [9]:
threads = literal_client.api.get_threads(first=1).data

query_steps = []
for thread in threads:
    thread_query_steps = [step for step in thread.steps if step.name == "Retrieval Augmented Generation"]
    query_steps.extend(thread_query_steps)

for step in query_steps:
    dataset.add_step(step.id)

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,faithfulness
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967",[The First AFL–NFL World Championship Game was...,1.0
1,Who won the most super bowls?,The most super bowls have been won by The New ...,"[The Green Bay Packers...Green Bay, Wisconsin....",0.0


# Prepare Ragas data samples

In [19]:
import json

from literalai import DatasetItem
from typing import List

# For each query in our dataset, build contexts as Ragas expects them
contexts: List[List[str]] = []

items = dataset.items

for item in items:
    retrieve_step = next(step for step in item.intermediary_steps if step['name'] == "Retrieve")
    # Remove unnecessary layers (content), ideally only "expectedOutput"
    contexts.append(retrieve_step["expectedOutput"]['content']["documents"][0])

# Data samples as expected by Ragas
data_samples = {
    'question': [item.input["args"][0] for item in items],
    'answer': [ item.expected_output["content"] for item in items ],
    'contexts': contexts,
    'ground_truth': [""]* len(items) # No need for ground truths for: faithfulness (claims ratio), answer relevance (potential questions), context relevancy, aspect critique
}

In [20]:
data_samples

{'question': ["What's my name?"],
 'answer': ['Your name is John.'],
 'contexts': [['My name is John.', 'My job is coding.']],
 'ground_truth': ['']}

In [16]:
from datasets import Dataset 
from ragas.metrics import faithfulness
from ragas import evaluate

ds = Dataset.from_dict(data_samples)
score = evaluate(ds,metrics=[faithfulness])
score.to_pandas()


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,faithfulness
0,When was the first super bowl?,"The first superbowl was held on Jan 15, 1967",[The First AFL–NFL World Championship Game was...,1.0
1,Who won the most super bowls?,The most super bowls have been won by The New ...,"[The Green Bay Packers...Green Bay, Wisconsin....",0.0


## Evaluate with Ragas

We will evaluate context relevancy which checks how relevant the retrieved contexts are to answer the user's question. 

The more unneeded details in the contexts, the less relevant (between 0 and 1).

In [7]:
os.environ["OPENAI_API_KEY"]

'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'

In [23]:
data_samples = {'question': ["What's my name?"],
 'answer': ['Your name is John.'],
 'contexts': [['My name is John.', 'My job is coding.']],
 'ground_truth': ['']}

# Do our Dataset integrate easily with HF's Dataset?
from datasets import Dataset

from ragas.metrics import context_relevancy
from ragas import evaluate

metrics = [context_relevancy]
ds = Dataset.from_dict(data_samples)

results = evaluate(ds, metrics=metrics)

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
results.head()

Unnamed: 0,question,answer,contexts,ground_truth,context_relevancy
0,Hello,Hello! How can I assist you today?,"[title:""Evaluation""_description:None_content: ...",,0.003185


## Create Literal experiment and log results

In [11]:
from literalai import ScoreDict

experiment = dataset.create_experiment(name="Ragas - Context relevancy In Prod (#chunks = 5)", prompt_id=prompt.id, assertions={"type": "context relevancy"})

# Log each experiment result.
for index, row in results.iterrows():
    scores = [  
        { 
            "name": metric.name,
            "type": "AI",
            "value": row[metric.name]
        } 
        for metric in metrics
    ]

    experiment_item = {
        "datasetItemId": items[index].id,
        "scores": scores,
        "input": { "content": row["question"] },
        "output": { "content": row["answer"] }
    }
    
    experiment.log(experiment_item)


## Change number of retrieved contexts and evaluate context relevancy

### Call to Chroma DB to get the necessary contexts

Make parameter changes to your vector database and retrieved new contexts from the questions. 

Here we'll simply get the top 2 contexts instead of top 5.

In [12]:
data_samples = {
    'question': [json.loads(item.input["content"])["args"][0] for item in items],
    'answer': [ item.expected_output["content"] for item in items ],
    'contexts': [x[:2] for x in contexts], # Select the top 2 contexts
    'ground_truth': [""]* len(items)
}

results = evaluate(Dataset.from_dict(data_samples), metrics=metrics).to_pandas()

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

### Log experiment results

In [13]:
experiment = dataset.create_experiment(name="Ragas - Context relevancy (#chunks = 2)", prompt_id=prompt.id, assertions={"type": "context relevancy"})

# Log each experiment result.
for index, row in results.iterrows():
    scores = [  
        { 
            "name": metric.name,
            "type": "AI",
            "value": row[metric.name]
        } 
        for metric in metrics
    ]

    experiment_item = {
        "datasetItemId": items[index].id,
        "scores": scores,
        "input": { "content": row["question"] },
        "output": { "content": row["answer"] }
    }
    
    experiment.log(experiment_item)