In [1]:
# Import necessary libraries
import pandas as pd
from arize.experimental.datasets import ArizeDatasetsClient
from arize.experimental.datasets.experiments.evaluators.base import (
    EvaluationResult,
    Evaluator,
)
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential
from pathlib import Path
import logging
import json
import os

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up environment variables for Azure OpenAI and Arize
os.environ["AZURE_OPENAI_ENDPOINT"] = "your_azure_endpoint"
os.environ["AZURE_OPENAI_API_VERSION"] = "2023-12-01-preview"
os.environ["ARIZE_SPACE_ID"] = "your_arize_space_id"
os.environ["ARIZE_API_KEY"] = "your_arize_api_key"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Import necessary libraries
import pandas as pd
from arize.experimental.datasets import ArizeDatasetsClient
from arize.experimental.datasets.experiments.evaluators.base import (
    EvaluationResult,
    Evaluator,
)
from arize.experimental.datasets.utils.constants import GENERATIVE
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential
from pathlib import Path
import logging
import json
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Verify that required environment variables are present
required_vars = [
    "AZURE_OPENAI_ENDPOINT", 
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_DEPLOYMENT",
    "ARIZE_SPACE_ID",
    "ARIZE_API_KEY",
    "ARIZE_MODEL_ID"
]

missing_vars = [var for var in required_vars if not os.environ.get(var)]
if missing_vars:
    logger.warning(f"Missing required environment variables: {', '.join(missing_vars)}")
else:
    logger.info("All required environment variables are set")

# Print available deployments for reference
if "AZURE_OPENAI_DEPLOYMENT" in os.environ:
    logger.info(f"Current Azure OpenAI deployment: {os.environ['AZURE_OPENAI_DEPLOYMENT']}")

INFO:__main__:All required environment variables are set
INFO:__main__:Current Azure OpenAI deployment: gpt-4o


In [3]:
def create_azure_openai_client(deployment_name):
    """
    Creates an Azure OpenAI client for a specific deployment
    
    Args:
        deployment_name (str): The Azure OpenAI deployment name
        
    Returns:
        AzureOpenAI: Configured Azure OpenAI client
    """
    try:
        # Try to use API key if available
        api_key = os.environ.get("AZURE_OPENAI_API_KEY")
        if api_key:
            logger.info(f"Using API key authentication for {deployment_name}")
            client = AzureOpenAI(
                api_key=api_key,
                azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
                api_version=os.environ["AZURE_OPENAI_API_VERSION"]
            )
        else:
            # Use DefaultAzureCredential
            logger.info(f"Using DefaultAzureCredential for {deployment_name}")
            default_credential = DefaultAzureCredential()
            client = AzureOpenAI(
                azure_ad_token_provider=default_credential.get_token("https://cognitiveservices.azure.com/.default").token,
                azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
                api_version=os.environ["AZURE_OPENAI_API_VERSION"]
            )
        return client
    except Exception as e:
        logger.error(f"Failed to create Azure OpenAI client: {str(e)}")
        raise

In [4]:
def generate_response(query, model_client, deployment_name):
    """
    Generate a response using your existing classification and query logic
    
    Args:
        query (str): The user query
        model_client (AzureOpenAI): The Azure OpenAI client
        deployment_name (str): The deployment name
        
    Returns:
        dict: Response with metadata
    """
    try:
        # Classification prompt from your config
        classification_prompt = """You are a query classifier for Assurant's 10-K reports and risk assessment application. 
        Analyze the following query and respond with a JSON object containing two fields:
        1. 'category': Must be exactly one of: "assurant_10k", "risk_assessment", or "out_of_scope"
        2. 'confidence': A number between 0 and 1 indicating your confidence in the classification
        
        Query: {query}
        
        Respond with ONLY a valid JSON object in this exact format:
        {{"category": "<category>", "confidence": <confidence>}}"""
        
        # Get classification
        classification_response = model_client.chat.completions.create(
            model=deployment_name,
            messages=[
                {"role": "system", "content": classification_prompt.format(query=query)},
                {"role": "user", "content": query}
            ],
            temperature=0
        )
        
        classification_text = classification_response.choices[0].message.content
        classification = json.loads(classification_text.strip())
        
        # Define RAG prompt for responses
        rag_prompt = """You are a financial analyst specializing in insurance companies with expert knowledge of Assurant's recent 10-K reports. Provide a clear, accurate answer based on the context.
        
        Question: {query}
        
        When applicable, cite specific sections, page numbers, or fiscal years from the 10-K reports."""
        
        # Generate response
        response = model_client.chat.completions.create(
            model=deployment_name,
            messages=[
                {"role": "system", "content": rag_prompt.format(query=query)},
                {"role": "user", "content": query}
            ],
            temperature=0.2
        )
        
        return {
            "query": query,
            "response": response.choices[0].message.content,
            "category": classification["category"],
            "confidence": classification["confidence"],
            "model": deployment_name
        }
    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        return {
            "query": query,
            "response": f"Error: {str(e)}",
            "category": "error",
            "confidence": 0.0,
            "model": deployment_name
        }

In [5]:
def create_experiment_task(deployment_name):
    """
    Create an Arize experiment task for a specific model deployment
    
    Args:
        deployment_name (str): The Azure OpenAI deployment name
        
    Returns:
        ExperimentTask: Configured experiment task
    """
    client = create_azure_openai_client(deployment_name)
    
    def task_function(df):
        """Process each row in the dataset"""
        results = []
        for _, row in df.iterrows():
            query = row["query"]
            response_data = generate_response(query, client, deployment_name)
            results.append(response_data)
        return pd.DataFrame(results)
    
    return ExperimentTask(
        name=f"azure-openai-{deployment_name}",
        function=task_function
    )

In [10]:
# Import necessary libraries for Phoenix evaluations
import asyncio
import nest_asyncio
import pandas as pd
from phoenix.evals import llm_classify
from phoenix.evals.models import OpenAIModel
from arize.experimental.datasets.experiments.evaluators.base import (
    EvaluationResult,
    Evaluator,
)

# Apply nest_asyncio to allow running asyncio in Jupyter notebook
nest_asyncio.apply()

# Define custom hallucination evaluation prompt
HALLUCINATION_PROMPT = """In this task, you will be presented with a query, a reference text and an answer. The answer is
generated to the question based on the reference text. The answer may contain false information. You
must use the reference text to determine if the answer to the question contains false information,
if the answer is a hallucination of facts. Your objective is to determine whether the answer text
contains factual information and is not a hallucination. A 'hallucination' refers to
an answer that is not based on the reference text or assumes information that is not available in
the reference text. Your response should be a single word: either "factual" or "hallucinated", and
it should not include any other text or characters. "hallucinated" indicates that the answer
provides factually inaccurate information to the query based on the reference text. "factual"
indicates that the answer to the question is correct relative to the reference text, and does not
contain made up information. Please read the query and reference text carefully before determining
your response.

    # Query: {query}
    # Reference text: {reference}
    # Answer: {response}
    Is the answer above factual or hallucinated based on the query and reference text?"""

class HallucinationEvaluator(Evaluator):
    def __init__(self, azure_client, deployment_name):
        self.client = azure_client
        self.deployment_name = deployment_name
        self.rails = ["hallucinated", "factual"]
        
    async def async_llm_classify(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        # Create an OpenAIModel using the Azure client's credentials
        azure_api_key = self.client._api_key if hasattr(self.client, '_api_key') else None
        azure_endpoint = self.client._azure_endpoint if hasattr(self.client, '_azure_endpoint') else None
        
        # Create a model with the appropriate configuration
        model = OpenAIModel(
            model=self.deployment_name,
            api_key=azure_api_key,
            api_base=azure_endpoint,
            api_type="azure",
            api_version=self.client._api_version if hasattr(self.client, '_api_version') else None
        )
        
        result = await asyncio.to_thread(
            llm_classify,
            dataframe=dataframe,
            model=model,
            template=HALLUCINATION_PROMPT,
            rails=self.rails,
            run_sync=False,
            concurrency=5,
            provide_explanation=True
        )
        return result

    def evaluate(self, output: str, dataset_row: dict, **kwargs) -> EvaluationResult:
        # Build the dataframe
        df = pd.DataFrame([{
            "query": dataset_row.get("query"),
            "response": output
        }])

        # Run our async classification in a blocking manner
        results = asyncio.run(self.async_llm_classify(df))
        label = results['label'].iloc[0]
        score = int(label)  # Convert string to integer

        return EvaluationResult(
            score=score,
            label=label,
            metadata={},
            explanation=results['explanation'].iloc[0] if 'explanation' in results.columns else ""
        )

def create_evaluators(azure_client, evaluation_deployment):
    """
    Create Phoenix-based hallucination evaluator for assessing model responses
    
    Args:
        azure_client: The Azure OpenAI client to use for evaluations
        evaluation_deployment: The deployment to use for evaluations
        
    Returns:
        Evaluators: Configured evaluators
    """
    hallucination = HallucinationEvaluator(azure_client, evaluation_deployment)
    
    return Evaluators([
        ("hallucination", hallucination)
    ])

In [11]:
# Main execution section
if __name__ == "__main__":
    # Define which models to evaluate
    deployments = [
        "gpt-4",
        "gpt-4-turbo",
        "gpt-35-turbo"
    ]
    
    # Set up experiment parameters
    space_id = os.environ["ARIZE_SPACE_ID"]
    experiment_name = "azure-openai-model-comparison"
    dataset_name = "assurant-10k-queries"
    
    # Define which model to use for evaluation
    evaluation_deployment = "gpt-4"  # Using GPT-4 as the judge
    
    # Create Azure client for evaluations
    eval_client = create_azure_openai_client(evaluation_deployment)
    
    # Create hallucination evaluator
    evaluators = create_evaluators(eval_client, evaluation_deployment)
    
    experiment_ids = {}
    
    # Run experiment for each model
    for deployment in deployments:
        # Create the experiment task for this deployment
        task = create_experiment_task(deployment)
        
        # Run the experiment directly
        experiment_id, _ = run_experiment(
            space_id=space_id,
            experiment_name=f"{experiment_name}-{deployment}",
            task=task,
            dataset_name=dataset_name,
            evaluators=evaluators,
            dry_run=False,
            concurrency=3
        )
        
        experiment_ids[deployment] = experiment_id
        logger.info(f"Completed experiment for {deployment}: {experiment_id}")
    
    # Print URLs for viewing results in Arize
    print(f"\nView experiment results in Arize:")
    for deployment, exp_id in experiment_ids.items():
        print(f"  {deployment}: https://app.arize.com/spaces/{space_id}/experiments/{exp_id}")

INFO:__main__:Using API key authentication for gpt-4


NameError: name 'Evaluators' is not defined