In [1]:
import sys

# Print the path of the Python interpreter backing this kernel, so the
# reader can see which environment the rest of the notebook runs in.
interpreter_path = sys.executable
print(interpreter_path)

/Users/cam/Projects/.venv/bin/python


In [1]:
import logging
from dotenv import load_dotenv
import os
from datetime import datetime
import editdistance
from arize.experimental.datasets.core.client import ArizeDatasetsClient
from arize.experimental.datasets.experiments.evaluators.base import (
    EvaluationResult,
    Evaluator,
)

# Logging: INFO-level records, each stamped with time, logger name and level
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Pull settings from a .env file (if present) into the process environment
load_dotenv()

# Credentials and identifiers. Each environment lookup yields None when the
# variable is unset; nothing here validates that they are present.
arize_api_key = os.environ.get("ARIZE_API_KEY")
arize_developer_key = os.environ.get("ARIZE_DEVELOPER_KEY")
space_id = os.environ.get("ARIZE_SPACE_ID")
dataset_id = "RGF0YXNldDoxMzk2OmR5S3A="  # fixed ID of the dataset used below

# Arize datasets client shared by the following cells
client = ArizeDatasetsClient(
    api_key=arize_api_key,
    developer_key=arize_developer_key,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Fetch the dataset identified in the setup cell, report its size, and
# show the first rows with the notebook's rich renderer.
dataset = client.get_dataset(dataset_id=dataset_id, space_id=space_id)
row_total = len(dataset)
print(f"Dataset fetched with {row_total} rows")
display(dataset.head())


# Define evaluator class
class NotebookArizeEvaluator(Evaluator):
    """Grade a task result by its edit-distance similarity to the expected text.

    The comparison uses only the task's ``output`` dict: the generated text
    under ``"output"`` and the reference text under ``"expected_response"``.
    """

    def evaluate(self, *, output: dict, dataset_row: dict, **_) -> EvaluationResult:
        """Turn one task output into an ``EvaluationResult``.

        Args:
            output: Dict produced by the task. Reads the keys ``"error"``,
                ``"output"`` and ``"expected_response"``.
            dataset_row: Accepted for interface compatibility; not read here.
            **_: Any further keyword arguments are ignored.

        Returns:
            An ``EvaluationResult`` with:
            * label ``"ERROR"`` and score 0.0 when the task reported an error,
              when either text is missing/empty, or when scoring itself raises;
            * otherwise score ``1 - distance / max(len)`` and a label of
              ``"EXCELLENT"`` (> 0.9), ``"GOOD"`` (> 0.7), ``"FAIR"`` (> 0.5)
              or ``"POOR"``.
        """
        try:
            # A failure reported by the task short-circuits scoring.
            if output.get("error"):
                return EvaluationResult(
                    label="ERROR",
                    score=0.0,
                    explanation=f"Error during processing: {output['error']}",
                )

            candidate = output.get("output", "")
            reference = output.get("expected_response", "")

            # Nothing to compare unless both sides are non-empty.
            if not (candidate and reference):
                return EvaluationResult(
                    label="ERROR",
                    score=0.0,
                    explanation="Missing model output or expected response",
                )

            candidate_text = str(candidate)
            reference_text = str(reference)

            # Normalise the raw distance by the longer string's length so
            # the score does not depend on absolute text size.
            distance = editdistance.eval(candidate_text, reference_text)
            longest = max(len(candidate_text), len(reference_text))
            penalty = distance / longest if longest > 0 else 0
            similarity = 1 - penalty

            # First threshold strictly exceeded wins; otherwise "POOR".
            for cutoff, name in ((0.9, "EXCELLENT"), (0.7, "GOOD"), (0.5, "FAIR")):
                if similarity > cutoff:
                    label = name
                    break
            else:
                label = "POOR"

            return EvaluationResult(
                label=label,
                score=similarity,
                explanation=f"Edit distance: {distance}. Normalized score: {similarity:.2f}",
            )

        except Exception as exc:
            # Any unexpected failure is reported as an ERROR result rather
            # than propagated to the experiment runner.
            return EvaluationResult(
                label="ERROR",
                score=0.0,
                explanation=f"Evaluation error: {str(exc)}",
            )


# Define task function (model ID kept for reference: anthropic.claude-3-5-sonnet-20241022-v2:0)
def notebook_task(dataset_row: dict) -> dict:
    """Answer one dataset row's query with the classifier pipeline.

    Reads the query from ``"attributes.input.value"`` and the reference
    answer from ``"attributes.output.value"``, builds a Bedrock-backed
    ``IndexManager`` / ``QueryClassifier`` pair, classifies the query and
    asks the classifier for a response.

    Args:
        dataset_row: One dataset row, expected to be a dict-like object.

    Returns:
        On success, a dict with keys ``output``, ``error`` (None), ``query``,
        ``expected_response``, ``model_used``, ``category`` and ``confidence``.
        On any failure, a dict with ``output`` set to None, ``error`` holding
        the message, plus ``query``, ``expected_response`` and ``model_used``
        filled with whatever had been determined before the failure (None /
        empty string otherwise). The function never raises, even when
        ``dataset_row`` is not dict-like.
    """
    # Bound up front so the error path can report progress without
    # introspecting locals() or touching dataset_row a second time.
    query = None
    different_model = None
    expected_response = ""
    try:
        expected_response = dataset_row.get("attributes.output.value", "")
        query = dataset_row.get("attributes.input.value", "")
        print(f"\nQuery found: {query}")

        if not query or not isinstance(query, str):
            print(f"Invalid query format detected: {type(query)}")
            raise ValueError(f"Invalid query format: {query}")

        # Imported inside the try block so an import failure is reported
        # through the error dict like any other per-row failure.
        import boto3
        from tasks import IndexManager, QueryClassifier

        # Initialize Bedrock client
        bedrock_client = boto3.client("bedrock-runtime")

        # Create new index manager and query engine
        index_manager = IndexManager(bedrock_client=bedrock_client)
        query_engine = index_manager.get_query_engine()

        # Model used by the classifier for this experiment
        # (alternative: anthropic.claude-3-haiku-20240307-v1:0)
        different_model = "anthropic.claude-3-sonnet-20240229-v1:0"
        classifier = QueryClassifier(
            query_engine=query_engine,
            bedrock_client=bedrock_client,
            model=different_model,
        )

        # Process the query using the classifier
        category, confidence = classifier.classify_query(query)
        response = classifier.get_response(query, category)

        return {
            "output": response.response,
            "error": None,
            "query": query,
            "expected_response": expected_response,
            "model_used": different_model,
            "category": category.value,
            "confidence": confidence,
        }
    except Exception as e:
        error_msg = f"Error processing query: {str(e)}"
        print(error_msg)
        return {
            "output": None,
            "error": error_msg,
            "query": query,
            "expected_response": expected_response,
            "model_used": different_model,
        }

Dataset fetched with 4 rows


Unnamed: 0,attributes.session_id,attributes.llm.prompt_template.version,attributes.classification.confidence,attributes.reranker.top_k,attributes.llm.token_count.prompt,attributes.tool.input,attributes.llm.output_messages,attributes.tool.selection.matched_type,eval.contains_any_keyword.explanation,attributes.output.value,...,id,created_at,updated_at,source_record_datasource,source_record_model_environment,source_record_timestamp,source_record_span_id,source_record_trace_id,events,attributes.llm.prompt_template.variables.query
0,,,0.9,,,,,,,"According to OSHA regulations, personal protec...",...,1ccb51c8-fdc3-4da3-b8f5-c976fd0a9ba9,1733188698905,1733188698905,17a0b3c2-2673-4a41-bf74-5212970b65bf,5,1732929553417,431c692148f7651d,98d9f6ef831db5a7eddefc3230126b7a,,
1,,,0.8,,,,,,,There are no specific OSHA regulations that re...,...,975bfe47-8c8e-4a86-a0d0-e7ae6a6b7306,1733258753664,1733258753664,17a0b3c2-2673-4a41-bf74-5212970b65bf,5,1733249056262,6947713226d5e74c,a0be0107e62a339dc3c526cc7f3689f1,,
2,,,0.9,,,,,,,PPE stands for Personal Protective Equipment. ...,...,1a88f276-7825-436b-9716-1ccfb611d160,1733258753664,1733258753664,17a0b3c2-2673-4a41-bf74-5212970b65bf,5,1733251141157,164b9146b00a9d2d,d284f4b9ee149ba6405bf6cd17e926d8,,
3,,,0.9,,,,,,,PPE stands for Personal Protective Equipment. ...,...,a9b86cba-a866-4e40-9a4e-c53880376afd,1733258753664,1733258753664,17a0b3c2-2673-4a41-bf74-5212970b65bf,5,1733251163954,53f7aa35f84ce647,90122c26a53590cf5c2a202ad33b407d,,


In [20]:
# Give every run a unique, sortable name derived from the current local time
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
experiment_name = "Verisk_Experiment_" + timestamp

# Execute the task over the dataset rows, then score each result with the
# edit-distance evaluator.
experiment = client.run_experiment(
    experiment_name=experiment_name,
    dataset_id=dataset_id,
    space_id=space_id,
    task=notebook_task,
    evaluators=[NotebookArizeEvaluator()],
)

[38;21m  arize.utils.logging | INFO | 🧪 Experiment started.[0m


running tasks |          | 0/4 (0.0%) | ⏳ 00:00<? | ?it/s2024-12-03 17:49:17,797 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5



Query found: when should I wear ppe?


2024-12-03 17:49:21,472 - sentence_transformers.SentenceTransformer - INFO - 2 prompts are loaded, with the keys: ['query', 'text']
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.42it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.95it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.95it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.50it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.52it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,


Query found: should I have a fire extinguisher in my miata?


2024-12-03 17:49:46,954 - sentence_transformers.SentenceTransformer - INFO - 2 prompts are loaded, with the keys: ['query', 'text']
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.40it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.81it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.73it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.95it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,


Query found: what is ppe?


2024-12-03 17:50:12,576 - sentence_transformers.SentenceTransformer - INFO - 2 prompts are loaded, with the keys: ['query', 'text']
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.15it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.75it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,


Query found: what is ppe?


2024-12-03 17:50:39,646 - sentence_transformers.SentenceTransformer - INFO - 2 prompts are loaded, with the keys: ['query', 'text']
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,

[38;21m  arize.utils.logging | INFO | ✅ Task runs completed.
Tasks Summary (12/03/24 05:50 PM -0800)
---------------------------------------
   n_examples  n_runs  n_errors
0           4       4         0[0m



running experiment evaluations |██████████| 4/4 (100.0%) | ⏳ 00:00<00:00 | 14.48it/s

[38;21m  arize.utils.logging | INFO | ✅ All evaluators completed.[0m



