In [1]:
import chromadb

# Create the Chroma client and fetch the "Biography" collection
# (get_or_create: an existing collection of that name is reused).
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection("Biography")

# Seed the collection with three short biography facts, one id per document.
collection.add(
    ids=["id1", "id2", "id3"],
    documents=[
        "My name is John.",
        "My job is coding.",
        "My dog's name is Fido. Fido is an expert fetcher.",
    ],
)

In [2]:
import os

from openai import OpenAI
from dotenv import load_dotenv
from literalai import LiteralClient

# Runs before either client is constructed.
# NOTE(review): presumably loads a local .env file so the clients below can
# find their API keys/URLs in the environment — confirm which variables are required.
load_dotenv()

# OpenAI client built with no explicit arguments (configuration is expected
# to come from the environment — TODO confirm).
openai_client = OpenAI()

# Literal AI client, also built with no explicit arguments.
literal_client = LiteralClient()
# NOTE(review): presumably makes OpenAI calls get logged to Literal AI — confirm
# against the SDK docs. No OpenAI call is made in the cells visible here.
literal_client.instrument_openai()

In [3]:
PROMPT_NAME = "RAG prompt"

# Chat template for the RAG prompt. The user message uses Mustache-style
# placeholders: the {{#context}}...{{/context}} section is rendered once per
# context entry ({{.}}), and {{question}} receives the user's question.
template_messages = [
    dict(
        role="system",
        content=(
            "You are a helpful assistant that always answers questions. "
            "Keep it short, and if available prefer responding with code."
        ),
    ),
    dict(
        role="user",
        content=(
            "Answer the question based on the context below.\n"
            "Context:\n"
            "{{#context}}\n"
            "{{.}}\n"
            "{{/context}}\n"
            "Question:\n"
            "{{question}}\n"
            "Answer:"
        ),
    ),
]

# Register the template with Literal under PROMPT_NAME and keep the returned
# prompt object for later formatting.
prompt = literal_client.api.create_prompt(
    template_messages=template_messages,
    name=PROMPT_NAME,
)

In [4]:
@literal_client.step(type="run", name="RAG")
def rag(user_query: str):
    """Retrieve context for ``user_query`` and format the RAG prompt.

    The whole call is recorded as a Literal step of type "run" named "RAG";
    the vector-store lookup is recorded as a nested "retrieval" step named
    "Retrieve" whose input is the question and whose output is the raw
    query result.

    Args:
        user_query: The question to look up in the ``collection``.

    Returns:
        The fixed string ``"Something"``.

    NOTE(review): the formatted prompt is built but never sent to a model,
    and the return value is a constant — this looks like a stub; confirm.
    """
    with literal_client.step(type="retrieval", name="Retrieve") as retrieval_step:
        retrieval_step.input = dict(question=user_query)
        # Ask the collection for the two nearest documents to the question.
        hits = collection.query(n_results=2, query_texts=[user_query])
        retrieval_step.output = hits

    # Only one query text was submitted, so use the first documents entry.
    top_documents = hits["documents"][0]
    # The formatted result is intentionally unused here (see note above).
    prompt.format(dict(context=top_documents, question=user_query))

    return "Something"
        
def main(cnt):
    """Replay the demo conversation once inside a Literal thread.

    Only the first entry of the question list is used (``questions[:1]``).
    For it, a thread named ``"Question {cnt} - {n}"`` is opened, the question
    is logged as a user message, ``rag`` is called, the code sleeps for one
    second, and the answer is logged as an assistant message.

    Args:
        cnt: Run number, used only to label the thread name.
    """
    questions = ["What's my name?", "What's my job?"]

    # Positions are 1-based so thread names read "Question <cnt> - 1", ...
    for position, question in enumerate(questions[:1], start=1):
        thread_name = f"Question {cnt} - {position}"
        with literal_client.thread(name=thread_name):
            literal_client.message(content=question, type="user_message", name="User")
            answer = rag(question)
            # Fixed one-second pause between the question and the answer message.
            time.sleep(1)
            literal_client.message(content=answer, type="assistant_message", name="My Assistant")

import time

# Number of times the demo conversation is replayed.
RUN_COUNT = 10

# Measure the elapsed interval with perf_counter(): unlike time.time(), it is
# monotonic, so the result cannot be skewed by system clock adjustments.
start = time.perf_counter()
for cnt in range(RUN_COUNT):
    main(cnt)
end = time.perf_counter()

print(f"Time elapsed {end-start}")
# Network requests by the SDK are performed asynchronously.
# Invoke flush() to guarantee the completion of all requests prior to the process termination.
# WARNING: If you run a continuous server, you should not use this method.
literal_client.flush()

Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to upsert thread: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to upsert thread: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connection refused
Failed to send steps: [Errno 111] Connec

10.299527883529663


#### Prepare Ragas data samples

In [7]:
import json

from literalai import DatasetItem
from typing import List

items = dataset.items

# For every dataset item, locate its "Retrieve" intermediary step and take the
# first entry of the documents stored in that step's expected output.
contexts = [
    next(
        step for step in item.intermediary_steps if step["name"] == "Retrieve"
    )["expectedOutput"]["documents"][0]
    for item in items
]

# Column-oriented samples in the format expected by Ragas. Ground truth is
# left as empty strings because only context relevancy is evaluated.
data_samples = dict(
    question=[item.input["args"][0] for item in items],
    answer=[item.expected_output["content"] for item in items],
    contexts=contexts,
    ground_truth=[""] * len(items),
)

#### Run the evaluation

We will evaluate context relevancy, which measures how relevant the retrieved contexts are for answering the user's question.

The more unneeded detail the contexts contain, the lower the score. Scores range from 0 to 1, with 0 being the least relevant.

In [8]:
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import context_relevancy

# Wrap the samples in a Dataset, score them with the context relevancy metric
# only, and convert the evaluation result to a pandas DataFrame.
results = evaluate(
    Dataset.from_dict(data_samples),
    metrics=[context_relevancy],
).to_pandas()

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

### Persist experiment to Literal

In [9]:
# Name of the metric, reused for the experiment params, the score name and
# the results column that holds the score.
metric_name = context_relevancy.name

# Create an experiment on the dataset, linked to the prompt created earlier.
experiment = dataset.create_experiment(
    name="Biography - Experiment A",
    prompt_id=prompt.id,
    params=[dict(type=metric_name, top_k=2)],
)

# Log one experiment item per evaluated row. The row index is used to look up
# the matching dataset item, so `results` must keep the same order as `items`.
for index, row in results.iterrows():
    scores = [dict(name=metric_name, type="AI", value=row[metric_name])]

    experiment.log(
        dict(
            datasetItemId=items[index].id,
            scores=scores,
            input=dict(question=row["question"]),
            # .tolist() assumes the contexts cell is array-like — TODO confirm.
            output=dict(contexts=row["contexts"].tolist()),
        )
    )