In [1]:
import asyncio
import os

import nest_asyncio
import weave
from dotenv import load_dotenv
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.llms.bedrock_converse import BedrockConverse
from llama_index.llms.openai import OpenAI
from weave import Model
import pandas as pd

In [2]:
nest_asyncio.apply()

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# initiating weave project
weave.init("broker-bot-eval-dev")

Logged in as Weights & Biases user: bo-brandt.
View Weave data at https://wandb.ai/forgeglobal/broker-bot-eval-dev/weave


<weave.trace.weave_client.WeaveClient at 0x172949e50>

### Loading FAQ data for chat context

In [3]:
# Load FAQ data
raw_data = weave.ref(
    "weave:///forgeglobal/broker-bot-eval-dev/object/raw_faq_data:z5vEhANfgeDJJHS6HZecNDYhHSNDebPmsxr7lmgafZc"
).get()

# store raw data as Document object
documents = [
    Document(text=dict(data)["content"], metadata=dict(data)["metadata"])
    for data in raw_data.rows[:]
]

# create the pipeline with transformations
CHUNK_SIZE = 300
CHUNK_OVERLAP = 0
chunk_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP),
        OpenAIEmbedding(),
    ]
)

# run the pipeline
nodes = chunk_pipeline.run(documents=documents)

# create index
index = VectorStoreIndex(nodes)

system_prompt = "You are a chat bot. Only provide answers based on FAQ's related to Forge Global. Do not use your general knowledge.",
memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

### Evaluation functions

In [4]:
# Evaluation functions
@weave.op()
def response_to_evaluate(question: str, model_name: str, model_provider: str) -> dict:

    if model_provider == "openai":
        llm = OpenAI(model=model_name, temperature=0)
    else:
        llm = BedrockConverse(model=model_name, temperature=0)

    chat_engine = index.as_chat_engine(
        llm=llm,
        chat_mode="context",
        memory=memory,
        system_prompt=system_prompt,
        verbose=False,
    )

    response = chat_engine.chat(question)

    contexts = [node.get_content() for node in response.source_nodes]

    return {
        "question": question,
        "answer": response.response,
        "contexts": contexts,
        "response": response,
    }


@weave.op()
def relevancy_scorer(
    question: str, model_output: dict, eval_model_name: str, eval_model_provider: str
) -> dict:

    if eval_model_provider == "openai":
        llm = OpenAI(model=eval_model_name, temperature=0)
    else:
        llm = BedrockConverse(model=eval_model_name, temperature=0)

    eval_result = RelevancyEvaluator(llm=llm).evaluate_response(
        query=question, response=model_output["response"]
    )
    return {
        "question": question,
        "feedback": eval_result.feedback,
        "passing": eval_result.passing,
    }


@weave.op()
async def faithfulness_scorer(
    question: str, model_output: dict, eval_model_name: str, eval_model_provider: str
) -> dict:
    if eval_model_provider == "openai":
        llm = OpenAI(model=eval_model_name, temperature=0)
    else:
        llm = BedrockConverse(model=eval_model_name, temperature=0)
    eval_result = FaithfulnessEvaluator(llm=llm).evaluate_response(
        response=model_output["response"]
    )
    return {
        "question": question,
        "feedback": eval_result.feedback,
        "passing": eval_result.passing,
    }

### Load questions to evaluate - 100 random sample

In [5]:
# Load Extract Questions
dfq = pd.read_csv("data/questions-full-extracted-2024.csv")

# Define the fraction to sample (e.g., 10%)
sample_fraction = 0.02

# Sample 10% of the rows randomly
sampled_df = dfq.sample(frac=sample_fraction, random_state=42).reset_index(drop=True)

print(sampled_df.shape)
sampled_df.groupby("SIDE")["question"].count()

(112, 6)


SIDE
Buyer               3
Buyer or Seller    92
Seller             17
Name: question, dtype: int64

### Generate input dataset for evaluation

In [9]:


# Generate Question Format
question_input_openai = []
question_input_claude = []


for idx, row in sampled_df.iterrows():
    # Building question evaluation dictionary using OpenAI as LLM
    dict_openai = {
        "question": row["question"],
        "model_name": "gpt-4o-mini",
        "model_provider": "openai",
        "eval_model_name": "anthropic.claude-3-haiku-20240307-v1:0",
        "eval_model_provider": "bedrock",
    }
    question_input_openai.append(dict_openai)

    # Building question evaluation dictionary using Claude as LLM
    dict_claude = {
        "question": row["question"],
        "model_name": "anthropic.claude-3-haiku-20240307-v1:0",
        "model_provider": "bedrock",
        "eval_model_name": "gpt-4o-mini",
        "eval_model_provider": "openai",
    }
    question_input_claude.append(dict_claude)
    
    # if idx ==1:
    #     break

### Run evaluation

In [10]:
# Run evaluation
openai_evaluation = weave.Evaluation(
    name="openai-eval",
    description="Evaluating OpenAI (gpt-4o-mini) repsonses with Claude (claude-haiku)",
    dataset=question_input_openai,
    scorers=[relevancy_scorer, faithfulness_scorer],
)

asyncio.run(openai_evaluation.evaluate(response_to_evaluate))

{'relevancy_scorer': {'passing': {'true_count': 104,
   'true_fraction': 0.9285714285714286}},
 'faithfulness_scorer': {'passing': {'true_count': 104,
   'true_fraction': 0.9285714285714286}},
 'model_latency': {'mean': 17.483537895338877}}

In [12]:
# Run evaluation
claude_evaluation = weave.Evaluation(
    name="claude-eval",
    description="Evaluating Claude (claude-haiku) responses with OpenAI (gpt-4o-mini)",
    dataset=question_input_claude,
    scorers=[relevancy_scorer, faithfulness_scorer],
)

asyncio.run(claude_evaluation.evaluate(response_to_evaluate))

{'relevancy_scorer': {'passing': {'true_count': 94,
   'true_fraction': 0.8392857142857143}},
 'faithfulness_scorer': {'passing': {'true_count': 35,
   'true_fraction': 0.3125}},
 'model_latency': {'mean': 24.369585269263812}}