## LLM Validation walkthrough

This walkthrough demonstrates a sample LLM validation run, end to end. The steps are as follows:
1. Give task specifications
2. Download dataset + prompt + model
3. Data ETL to align input/output formats
4. Define relevant components (customize if necessary)
5. Execute the orchestration process
6. Analyze the results

## Step 1: Task Specification

In [1]:
import os
from typing import List

import openai

from llm_validation.components.clients import Client
from llm_validation.app.configs import ClientConfig


class OpenAiClient(Client):
    """Client that streams chat completions from the OpenAI API.

    Note: Anyscale endpoints are used in a way similar to OpenAI's, so for
    the time being they are treated as OpenAI as well.
    """

    def __init__(self, config: ClientConfig):
        """Store the API key and model settings taken from ``config``.

        Args:
            config: Client configuration; ``model_name`` and ``model_options``
                are read from it. The API key is read from the
                ``OPENAI_API_KEY`` environment variable (``None`` if unset).
        """
        super().__init__(config)
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.model_name = config.model_name
        self.model_options = config.model_options

    async def predict_stream(self, messages: List):
        """Stream a chat completion and yield one dict per content chunk.

        Args:
            messages: Chat messages forwarded unchanged to the
                chat-completions endpoint.

        Yields:
            dict with ``text`` (the chunk's ``delta.content``, which may be
            ``None``) and ``raw_response`` (the chunk object itself).
        """
        # NOTE(review): this is the synchronous OpenAI client used inside an
        # async generator, so iterating the stream blocks while waiting on
        # the network — consider openai.AsyncOpenAI if that matters.
        client = openai.OpenAI(api_key=self.api_key)
        stream = client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            **self.model_options,
            stream=True,
            stream_options={"include_usage": True},
        )
        for chunk in stream:
            # Chunks without choices (e.g. the trailing usage-only chunk
            # requested via include_usage) carry no text, so skip them.
            if not chunk.choices:
                continue
            yield dict(
                text=chunk.choices[0].delta.content,
                raw_response=chunk,
            )


class CodeGenAccuracy(AccuracyMetric):
    """Accuracy metric that has an LLM judge grade generated code.

    The judge is an ``OpenAiClient`` prompted with ``prompts/judge.yml``.
    Each graded sample produces a reason, a code-quality label and a
    response-quality label.
    """

    def __init__(self, config: MetricConfig):
        """Set up the judge client and judge prompt with fixed settings."""
        super().__init__(config)
        # NOTE(review): "GPT4" is forwarded verbatim as the model name —
        # confirm it is an identifier the endpoint accepts.
        judge_client_config = ClientConfig(
            name="openai",
            type="research",
            model_name="GPT4",
            base_url="",
            model_options={"temperature": 0, "top_p": 1, "max_tokens": 1024},
        )
        judge_prompt_config = PromptConfig(
            name="codegen-judge",
            path="prompts/judge.yml",
            version=1,
        )
        self.client = OpenAiClient(judge_client_config)
        self.prompt = Prompt(judge_prompt_config)

    def grade(self, input, output: str, label: str):
        """Ask the judge to compare ``output`` against ``label``.

        Args:
            input: Not used by this method.
            output: Generated code answer to be judged.
            label: Expected code answer.

        Returns:
            dict with ``reason``, ``code_quality`` and ``response_quality``.
            If the judge call or the parsing of its reply fails, the error
            is printed and a fixed fallback verdict is returned instead.
        """
        judge_messages = self.prompt.transform(
            generated_code_answer=output, expected_code_answer=label
        )
        # Fallback verdict, replaced only when the whole reply parses cleanly.
        verdict = {
            "reason": "error",
            "code_quality": "wrong",
            "response_quality": "bad",
        }
        try:
            reply = self.client.sync_predict(judge_messages)
            parsed = json.loads(reply["text"])
            verdict = {
                field: parsed[field]
                for field in ("reason", "code_quality", "response_quality")
            }
        except Exception as e:
            # Best-effort grading: report the failure and keep the fallback.
            print(e)
        return verdict

    def aggregate(self):
        """Add per-label counts of both quality scores to ``self.stats``."""
        code_labels = self.scores["code_quality"]
        response_labels = self.scores["response_quality"]
        for labels in (code_labels, response_labels):
            self.stats.update(dict(Counter(labels)))

```
def init_research_client(config: ClientConfig) -> Client:
    """Build the research client selected by ``config.name``.

    Args:
        config: Client configuration; its ``name`` picks the client class and
            the whole config is passed to that class's constructor.

    Returns:
        The client instance for "anthropic", "bedrock", "openai", "together"
        or "vertex".

    Raises:
        ValueError: If ``config.name`` is none of the supported names.
    """
    provider = config.name
    if provider == "anthropic":
        return AnthropicClient(config)
    if provider == "bedrock":
        return BedrockClient(config)
    if provider == "openai":
        return OpenAiClient(config)
    if provider == "together":
        return TogetherClient(config)
    if provider == "vertex":
        return VertexAiClient(config)
    # No supported provider matched.
    raise ValueError(f"Research client type not supported: {provider}")
```

In [None]:
``