In [18]:
!pip install weave openai



In [20]:
import weave

# Start a Weave session for the "rag-baselining" project; the recorded cell
# output shows the project URL where the logged data can be viewed.
weave.init("rag-baselining")

Logged in as Weights & Biases user: aianytime07.
View Weave data at https://wandb.ai/aianytime07/rag-baselining/weave


<weave.trace.weave_client.WeaveClient at 0x782b5b81e380>

In [21]:
import os
from google.colab import userdata

# Pull the API key from Colab's userdata store rather than hardcoding it in the notebook.
# The OpenAI() clients created in later cells are given no explicit key, so they
# presumably pick it up from this environment variable — TODO confirm.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [22]:
from openai import OpenAI

# Plain chat-completion call with no Weave decorator: a fixed system prompt
# plus one user sentence, sent to gpt-4o-mini at temperature 0.
chat_messages = [
    {"role": "system", "content": "Please translate English into Hindi."},
    {"role": "user", "content": "AI is a old technology."},
]

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini", messages=chat_messages, temperature=0
)

# The reply text lives on the first choice's message.
generation = response.choices[0].message.content
print(generation)

🍩 https://wandb.ai/aianytime07/rag-baselining/r/call/019223a3-59c5-7ba1-a23f-31f8f3123e78
एआई एक पुरानी तकनीक है।


Recording functions - @weave.op()

Decorate a function with @weave.op() to record the function's inputs and outputs.


(1) Add @weave.op() to the function. In this example, we add @weave.op() to the translation() function.

In [23]:
@weave.op()
def translation(user_input):
    """Translate English text into Hindi with gpt-4o-mini.

    Args:
        user_input: English text sent as the user message.

    Returns:
        The content of the first choice's message in the completion.
    """
    chat_messages = [
        {"role": "system", "content": "Please translate English into Hindi."},
        {"role": "user", "content": user_input},
    ]
    completion = OpenAI().chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


result = translation("AI is a old technology")
print(result)

🍩 https://wandb.ai/aianytime07/rag-baselining/r/call/019223a4-e344-7762-85af-dce613c1ec0a
एआई एक पुरानी तकनीक है।


Recording the model - weave.Model

(1) Make your class inherit from weave.Model.

In this example, the TranslationModel class inherits from weave.Model.

In [24]:
from openai import OpenAI

class TranslationModel(weave.Model):
    """Weave model that answers under a configurable system prompt."""

    # System prompt sent ahead of every user message.
    system_instruction: str

    @weave.op()
    def translation(self, user_input):
        """Send `user_input` to gpt-4o-mini under `system_instruction`.

        Args:
            user_input: Text sent as the user message.

        Returns:
            The content of the first choice's message in the completion.
        """
        chat_messages = [
            {"role": "system", "content": self.system_instruction},
            {"role": "user", "content": user_input},
        ]
        completion = OpenAI().chat.completions.create(
            model="gpt-4o-mini",
            messages=chat_messages,
            temperature=0,
        )
        return completion.choices[0].message.content


model = TranslationModel(system_instruction="Please translate English into Hindi.")
result = model.translation("Generative AI has been hyped up a lot.")
print(result)

🍩 https://wandb.ai/aianytime07/rag-baselining/r/call/019223a6-2ea1-79d3-9806-f99f120d2287
जनरेटिव एआई को बहुत अधिक प्रचारित किया गया है।


Accuracy verification procedure

In [27]:
# Retrieval corpus: six short company descriptions. Later cells embed these
# strings and return one of them verbatim as the "context" for a question.
documents = [
    "SpaceX is a private aerospace manufacturer and space transportation company founded by Elon Musk. It has revolutionized the space industry with reusable rockets and aims to enable human life on Mars.",
    "NASA is the United States government agency responsible for space exploration and research. It has led historic missions like the Apollo moon landings and is currently involved in developing technologies for future space missions.",
    "Tesla is a leading electric vehicle and clean energy company that focuses on accelerating the transition to sustainable energy. Its innovations include electric cars, solar panels, and energy storage solutions.",
    "Blue Origin is an aerospace company founded by Jeff Bezos, aiming to make space travel more affordable and accessible. It is known for its suborbital space tourism program and development of reusable rockets.",
    "Boeing is a major American corporation in the aerospace industry, manufacturing commercial airplanes, defense systems, and satellites. Boeing has a long history of contributions to aviation and space exploration.",
    "Virgin Galactic is a space tourism company founded by Richard Branson. It focuses on providing commercial suborbital spaceflights for passengers, aiming to make space travel accessible to more people."
]

In [28]:
from openai import OpenAI


def docs_to_embeddings(docs: list) -> list:
    """Embed each document with text-embedding-3-small.

    Issues one embeddings request per document, in input order.

    Args:
        docs: Texts to embed.

    Returns:
        A list with one embedding per entry of `docs`, in the same order.
    """
    embedding_client = OpenAI()
    return [
        embedding_client.embeddings.create(
            input=text,
            model="text-embedding-3-small",
        ).data[0].embedding
        for text in docs
    ]


# Embed the corpus once, up front; the retrieval step reads this module-level list.
docs_embeddings = docs_to_embeddings(documents)

In [29]:
import numpy as np


@weave.op()
def get_most_relevant_document(query):
    """Return the document whose embedding is most similar to `query`.

    Embeds `query` with text-embedding-3-small, scores it by cosine
    similarity against every vector in the module-level `docs_embeddings`,
    and returns the entry of the module-level `documents` list at the
    best-scoring position.

    Args:
        query: Text to search for.

    Returns:
        The element of `documents` with the highest cosine similarity.
    """
    client = OpenAI()
    # Convert to an array once so each np.dot below does not re-convert the list.
    query_embedding = np.asarray(
        client.embeddings.create(
            input=query,
            model="text-embedding-3-small",
        )
        .data[0]
        .embedding
    )
    # The query norm is identical for every document: compute it once
    # instead of once per similarity.
    query_norm = np.linalg.norm(query_embedding)

    similarities = [
        np.dot(query_embedding, doc_emb) / (query_norm * np.linalg.norm(doc_emb))
        for doc_emb in docs_embeddings
    ]

    # Assumes docs_embeddings is index-aligned with documents (it is built
    # from documents in an earlier cell) — re-embed if documents changes.
    most_relevant_doc_index = int(np.argmax(similarities))
    return documents[most_relevant_doc_index]

We define a model in “Weave” to collect the information necessary for evaluating the RAG system.

- Attribute: model_name

- Function: predict()

  It takes "question" as input and returns "answer" and "context" as output.

In [30]:
class RAGModel(weave.Model):
    """Weave model for the RAG pipeline: retrieve one document, then answer from it."""

    # Chat model used to generate the answer.
    model_name: str = "gpt-4o"

    @weave.op()
    def predict(self, question: str) -> dict:
        """Answer `question` using only the most relevant document.

        Args:
            question: The user's question.

        Returns:
            A dict with "answer" (the model's reply text) and "context"
            (the document returned by get_most_relevant_document).
        """
        context = get_most_relevant_document(question)

        client = OpenAI()
        # Continuation lines are flush-left because any indentation here
        # would become part of the prompt text.
        query = f"""Please answer the questions using only the following context.
If you don't know the answer, answer 'I don't know.'


context:
```
{context}
```

question:
{question}"""
        response = client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "user", "content": query},
            ],
            temperature=0.0,
            response_format={"type": "text"},
        )
        answer = response.choices[0].message.content
        return {"answer": answer, "context": context}

In [31]:
# Try the pipeline on a single question before running the full evaluation.
model = RAGModel(
    model_name="gpt-4o"
)
model.predict("What is NASA?")

🍩 https://wandb.ai/aianytime07/rag-baselining/r/call/019223a9-68b6-70c0-a581-85488f287370


{'answer': 'NASA is the United States government agency responsible for space exploration and research.',
 'context': 'NASA is the United States government agency responsible for space exploration and research. It has led historic missions like the Apollo moon landings and is currently involved in developing technologies for future space missions.'}

In [32]:
# Evaluation dataset: one row per question. The "question" key has the same
# name as RAGModel.predict's parameter; presumably weave.Evaluation matches
# row keys to parameters by name — TODO confirm.
questions = [
    {"question": "Who founded SpaceX?"},
    {"question": "Which U.S. government agency is responsible for space exploration?"},
    {"question": "What company is focused on accelerating the transition to sustainable energy?"},
    {"question": "Which company aims to make space travel more affordable and accessible?"},
    {"question": "Which major American corporation is involved in both aviation and space exploration?"},
    {"question": "What company is focused on commercial space tourism?"}
]


Preparing the evaluation function.
This time, we will ask an LLM to judge whether the retrieved context was helpful in arriving at the given answer, following the example of "Context Precision," one of the evaluation metrics in the RAGAS framework for RAG evaluation.

In [33]:
import json

@weave.op()
async def context_precision_score(question, model_output):
    """LLM-as-judge scorer: did the retrieved context help produce the answer?

    Args:
        question: The question that was put to the model.
        model_output: Dict with "answer" and "context" keys.

    Returns:
        {"verdict": bool}, True when the judge replies with verification == 1.

    Raises:
        json.JSONDecodeError: If the judge's reply is not valid JSON.
        KeyError: If `model_output` lacks "context"/"answer" or the judge's
            JSON has no "verification" key.
    """
    # Doubled braces are literal braces for str.format. The example output
    # quotes its key so that the judge is shown valid JSON, and the question
    # is included so the judge sees everything the first line refers to.
    context_precision_prompt = """Given the question, answer, and context, verify whether the context helped in reaching the provided answer.
In the JSON output, if the context was useful, output {{"verification": 1}}.
If the context was not useful, output {{"verification": 0}}.
Output in valid JSON format only.

Question:
{question}

Context:
```
{context}
```

Answer:
{answer}
"""

    client = OpenAI()
    prompt = context_precision_prompt.format(
        question=question,
        context=model_output["context"],
        answer=model_output["answer"],
    )
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        # Judge at temperature 0, matching the model under test, to keep
        # verdicts as repeatable as possible.
        temperature=0,
        response_format={"type": "json_object"},
    )
    judgement = json.loads(completion.choices[0].message.content)
    return {
        "verdict": int(judgement["verification"]) == 1,
    }

In [34]:

import nest_asyncio
# Patch asyncio to allow nested event loops — presumably so the evaluation
# coroutine can run inside the notebook's already-running loop; TODO confirm
# this is still required.
nest_asyncio.apply()

In [35]:
# weave.Evaluation: pair the question dataset with the LLM-judge scorer.
# The same evaluation object is reused for every model compared below.
evaluation = weave.Evaluation(
    dataset=questions,
    scorers=[context_precision_score]
)

In [39]:

# Evaluation run with gpt-4o as the answering model.
model = RAGModel(
    model_name="gpt-4o"
)
# Top-level await: relies on the notebook's running event loop.
await evaluation.evaluate(model)

🍩 https://wandb.ai/aianytime07/rag-baselining/r/call/019223af-f8b0-71c0-9678-a8a1e7cd056f


{'context_precision_score': {'verdict': {'true_count': 6,
   'true_fraction': 1.0}},
 'model_latency': {'mean': 2.4371865590413413}}

In [38]:
# Evaluation run with gpt-4o-mini as the answering model, for comparison.
# NOTE(review): this cell's execution count (38) is lower than the preceding
# cell's (39), so the two were run out of order — re-run top to bottom to
# confirm the recorded outputs.
model = RAGModel(
    model_name="gpt-4o-mini"
)
await evaluation.evaluate(model)

🍩 https://wandb.ai/aianytime07/rag-baselining/r/call/019223af-078b-7b02-92aa-4c3a9b885d3b


{'context_precision_score': {'verdict': {'true_count': 6,
   'true_fraction': 1.0}},
 'model_latency': {'mean': 3.0771077076594033}}