In [1]:
import os

# Keep any LangSmith settings already present in the environment; otherwise
# fall back to tracing enabled ('true') and an empty API key.
os.environ.setdefault('LANGSMITH_TRACING', 'true')
os.environ.setdefault('LANGSMITH_API_KEY', '')

In [2]:
from langsmith import Client, wrappers
from openevals.llm import create_llm_as_judge
from openevals.prompts import CORRECTNESS_PROMPT
from langchain_aws import ChatBedrockConverse

# Bedrock chat model handle for Claude 3.7 Sonnet, pinned to us-east-1 with
# temperature 0.
sonnet = ChatBedrockConverse(
    temperature=0,
    region_name="us-east-1",
    model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
)

# LangSmith client built with default arguments.
# NOTE(review): presumably configured through the LANGSMITH_* environment
# variables set in the first cell; confirm.
client = Client()

In [11]:
from assets.evaluation_prompt import (
    completeness,
    correctness,
    helpfulness,
    following_instructions,
    logical_coherence,
    professional_tone,
    relevance,
    stereotyping,
    harmfulness,
    refusal,
    faithfulness,
)


# LLM-as-judge evaluators, all sharing the
# (inputs, outputs, reference_outputs) signature.

# Model identifier handed to every judge below; change it here to swap the grader.
JUDGE_MODEL = "bedrock_converse:apac.anthropic.claude-3-7-sonnet-20250219-v1:0"


def _run_llm_judge(prompt, feedback_key, inputs, outputs, reference_outputs, **scoring):
    """Build a judge for one metric and grade a single example with it.

    Args:
        prompt: Judge prompt for the metric.
        feedback_key: Key under which the judge reports its score.
        inputs: Example inputs, forwarded unchanged to the judge.
        outputs: Target outputs, forwarded unchanged to the judge.
        reference_outputs: Reference outputs, forwarded unchanged to the judge.
        **scoring: Score-shape options passed through to ``create_llm_as_judge``
            (this notebook uses ``continuous=True`` or ``choices=[0, 1]``).

    Returns:
        Whatever the judge callable returns for this example.
    """
    # The judge is created per call, exactly as the original evaluators did.
    judge = create_llm_as_judge(
        prompt=prompt,
        model=JUDGE_MODEL,
        feedback_key=feedback_key,
        **scoring,
    )
    return judge(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs,
    )


def logical_coherence_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade logical coherence with ``logical_coherence.sonnet_3_7`` (continuous score)."""
    return _run_llm_judge(
        logical_coherence.sonnet_3_7, "logical_coherence",
        inputs, outputs, reference_outputs,
        continuous=True,
    )


def faithfulness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade faithfulness with ``faithfulness.sonnet_3_7`` (continuous score)."""
    return _run_llm_judge(
        faithfulness.sonnet_3_7, "faithfulness",
        inputs, outputs, reference_outputs,
        continuous=True,
    )


def following_instructions_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade instruction following with ``following_instructions.sonnet_3_7`` (score in [0, 1] choices)."""
    return _run_llm_judge(
        following_instructions.sonnet_3_7, "following_instructions",
        inputs, outputs, reference_outputs,
        choices=[0, 1],
    )


def completeness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade completeness with ``completeness.sonnet_3_7`` (continuous score)."""
    return _run_llm_judge(
        completeness.sonnet_3_7, "completeness",
        inputs, outputs, reference_outputs,
        continuous=True,
    )


def correctness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade correctness with ``correctness.sonnet_3_7`` (continuous score)."""
    return _run_llm_judge(
        correctness.sonnet_3_7, "correctness",
        inputs, outputs, reference_outputs,
        continuous=True,
    )


def helpfulness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade helpfulness with ``helpfulness.sonnet_3_7`` (continuous score)."""
    return _run_llm_judge(
        helpfulness.sonnet_3_7, "helpfulness",
        inputs, outputs, reference_outputs,
        continuous=True,
    )


def professional_tone_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade professional tone with ``professional_tone.sonnet_3_7`` (continuous score)."""
    return _run_llm_judge(
        professional_tone.sonnet_3_7, "professional_tone",
        inputs, outputs, reference_outputs,
        continuous=True,
    )


def relevance_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade relevance with ``relevance.sonnet_3_7`` (continuous score)."""
    return _run_llm_judge(
        relevance.sonnet_3_7, "relevance",
        inputs, outputs, reference_outputs,
        continuous=True,
    )


def stereotyping_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade stereotyping with ``stereotyping.sonnet_3_7`` (score in [0, 1] choices)."""
    return _run_llm_judge(
        stereotyping.sonnet_3_7, "stereotyping",
        inputs, outputs, reference_outputs,
        choices=[0, 1],
    )


def harmfulness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade harmfulness with ``harmfulness.sonnet_3_7`` (score in [0, 1] choices)."""
    return _run_llm_judge(
        harmfulness.sonnet_3_7, "harmfulness",
        inputs, outputs, reference_outputs,
        choices=[0, 1],
    )


def refusal_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Grade refusal with ``refusal.sonnet_3_7`` (score in [0, 1] choices)."""
    return _run_llm_judge(
        refusal.sonnet_3_7, "refusal",
        inputs, outputs, reference_outputs,
        choices=[0, 1],
    )

def tps_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Report the ``tps`` (tokens per second) value found in ``outputs``.

    The score is 0 when ``outputs`` has no ``tps`` entry; ``inputs`` and
    ``reference_outputs`` are not consulted.
    """
    score = outputs.get("tps", 0)
    return dict(key="tps", score=score)

def ttft_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Report the ``ttft_ms`` (time to first token) value found in ``outputs``.

    The score is 0 when ``outputs`` has no ``ttft_ms`` entry; ``inputs`` and
    ``reference_outputs`` are not consulted.
    """
    score = outputs.get("ttft_ms", 0)
    return dict(key="ttft_ms", score=score)

def output_tokens_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Report the ``output_tokens`` count found in ``outputs``.

    The score is 0 when ``outputs`` has no ``output_tokens`` entry; ``inputs``
    and ``reference_outputs`` are not consulted.
    """
    score = outputs.get("output_tokens", 0)
    return dict(key="output_tokens", score=score)

def input_tokens_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Report the ``input_tokens`` count found in ``outputs``.

    The score is 0 when ``outputs`` has no ``input_tokens`` entry; ``inputs``
    and ``reference_outputs`` are not consulted.
    """
    score = outputs.get("input_tokens", 0)
    return dict(key="input_tokens", score=score)

def generation_time_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Report the ``generation_time_s`` value found in ``outputs``.

    The score is 0 when ``outputs`` has no ``generation_time_s`` entry;
    ``inputs`` and ``reference_outputs`` are not consulted.
    """
    score = outputs.get("generation_time_s", 0)
    return dict(key="generation_time_s", score=score)

def cost_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Report the ``total_cost`` value found in ``outputs``.

    The score is 0 when ``outputs`` has no ``total_cost`` entry; ``inputs``
    and ``reference_outputs`` are not consulted.
    """
    score = outputs.get("total_cost", 0)
    return dict(key="total_cost", score=score)

In [12]:
# Display the completeness judge prompt (Sonnet 3.7 variant) to eyeball its wording.
completeness.sonnet_3_7

'\nYou are an expert data labeler evaluating model outputs for completeness. Your task is to check if the candidate response contains the necessary amount of information and details for answering the question.\n\nPlease evaluate the completeness of the output based on the following criteria:\n\n1. Does the output address all parts of the input\'s request?\n2. Is any required information missing?\n3. For multi-part requests, are all parts fulfilled?\n4. Is the level of detail appropriate for the task?\n5. For specific requests (e.g., "list 10 items"), does the output meet the exact requirements?\n6. For summarization or rewriting tasks, are all main points covered?\n7. For step-by-step instructions, are all necessary steps included?\n8. Has any important information been omitted in editing or rewriting tasks?\n\nSpecial consideration for evasive or "I don\'t know" type responses:\n- If the output evades responding or claims lack of knowledge, assess whether this response is justified ba

In [13]:
from langchain_core.runnables import RunnableParallel, RunnableLambda
import time

def clean_streaming_response(raw_response: str) -> str:
    """Recover plain text from stringified streaming content blocks.

    ``raw_response`` is expected to contain one or more dict reprs such as
    ``{'type': 'text', 'text': '...', 'index': 0}`` concatenated together.
    The value of every ``'text'`` key is decoded and the pieces are joined in
    order of appearance.

    Unlike a plain ``'([^']*?)'`` match, this handles values whose repr is
    double-quoted (text containing an apostrophe) and values containing
    backslash escapes, and it decodes escapes the way Python itself does
    instead of by chained ``str.replace`` calls.

    Args:
        raw_response: Concatenated ``str()`` output of content-block dicts.

    Returns:
        The joined text, or ``""`` when no ``'text'`` value is found.
    """
    import ast
    import re

    # Capture the whole string literal after each 'text' key, in either quote
    # style, allowing backslash-escaped characters inside it.
    literal_pattern = r"""'text': ('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")"""

    pieces = []
    for literal in re.findall(literal_pattern, raw_response, flags=re.DOTALL):
        try:
            # Let Python decode the literal so every escape is handled correctly.
            pieces.append(ast.literal_eval(literal))
        except (ValueError, SyntaxError):
            # Not a valid literal (e.g. a raw newline inside): keep the inner text as-is.
            pieces.append(literal[1:-1])
    return ''.join(pieces)


def _extract_chunk_text(content) -> str:
    """Return the text carried by one streamed chunk's ``content``.

    Handles a plain string, or a list whose items are strings, dict content
    blocks (only ``type == "text"`` blocks contribute), or objects exposing a
    ``text`` attribute. Anything else contributes nothing.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""

    parts = []
    for block in content:
        if isinstance(block, str):
            parts.append(block)
        elif isinstance(block, dict):
            # Read the text field directly instead of stringifying the dict and
            # re-parsing its repr afterwards.
            if block.get("type", "text") == "text":
                parts.append(block.get("text") or "")
        elif hasattr(block, "text"):
            parts.append(block.text or "")
    return "".join(parts)


def create_target_with_model(model, pricing_config):
    """Create an evaluation target function bound to a specific model.

    Args:
        model: Object exposing ``stream(messages)`` that yields chunks with
            ``content`` and, optionally, ``usage_metadata`` attributes.
        pricing_config: Mapping with ``"input_price_per_ktok"`` and
            ``"output_price_per_ktok"`` (price per 1,000 tokens).

    Returns:
        A function ``target(inputs) -> dict`` that streams an answer to
        ``inputs["question"]`` and returns the answer together with latency,
        token-usage and cost metrics.
    """
    def model_specific_target(inputs: dict) -> dict:
        """Stream one answer and measure it.

        ``ttft_ms`` is reported in milliseconds, or ``None`` when the stream
        produced no content at all.
        """
        # perf_counter is monotonic, so intervals cannot go negative if the
        # wall clock is adjusted mid-run.
        start_time = time.perf_counter()
        first_token_time = None

        stream = model.stream([
            {"role": "system", "content": "Answer the following question accurately"},
            {"role": "user", "content": inputs["question"]}
        ])

        answer_parts = []
        usage_metadata = None

        for chunk in stream:
            content = getattr(chunk, "content", None)
            if content:
                if first_token_time is None:
                    first_token_time = time.perf_counter()
                answer_parts.append(_extract_chunk_text(content))

            # Keep the most recent non-empty usage report seen on the stream.
            chunk_usage = getattr(chunk, "usage_metadata", None)
            if chunk_usage:
                usage_metadata = chunk_usage

        end_time = time.perf_counter()

        usage = usage_metadata or {}
        input_tokens = usage.get('input_tokens', 0)
        output_tokens = usage.get('output_tokens', 0)
        total_tokens = usage.get('total_tokens', 0)

        total_time_s = end_time - start_time
        if first_token_time is None:
            # No content arrived: there is no time-to-first-token to report.
            ttft_ms = None
            generation_time_s = total_time_s
        else:
            ttft_ms = (first_token_time - start_time) * 1000
            generation_time_s = end_time - first_token_time

        if generation_time_s > 0 and output_tokens > 0:
            tps = output_tokens / generation_time_s
        else:
            tps = 0

        input_cost = (input_tokens / 1000) * pricing_config["input_price_per_ktok"]
        output_cost = (output_tokens / 1000) * pricing_config["output_price_per_ktok"]
        total_cost = input_cost + output_cost

        return {
            "answer": "".join(answer_parts).strip(),
            # Genuinely milliseconds now (previously divided by 1000 again,
            # which also raised TypeError when the value was None).
            "ttft_ms": ttft_ms,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": total_tokens,
            "generation_time_s": generation_time_s,
            "total_time_s": total_time_s,
            "tps": tps,
            "input_cost": input_cost,
            "output_cost": output_cost,
            "total_cost": total_cost,
        }

    return model_specific_target

# Models to evaluate, one entry per model. Each entry carries the chat model,
# its per-1,000-token prices (read by the target's cost calculation) and the
# LangSmith dataset the model is evaluated against.
model_configs = [
    {
        "name": "claude-3-7-sonnet",
        "model": ChatBedrockConverse(
            model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
            region_name="us-east-1",
            temperature=0,
        ),
        "input_price_per_ktok": 0.003,
        "output_price_per_ktok": 0.015,
        "dataset_name": "blynx-dataset-sonnet-3-7"
        # Alternative dataset: "dataset_name": "dummy-dataset-haiku-3-5"
    },
    {
        "name": "claude-3-haiku",
        "model": ChatBedrockConverse(
            model="us.anthropic.claude-3-haiku-20240307-v1:0",
            region_name="us-east-1",
            temperature=0,
        ),
        # Prices match the model actually invoked above (Claude 3 Haiku). The
        # previous 0.0008 / 0.004 are the Claude 3.5 Haiku rates and overstated
        # total_cost for this model.
        # NOTE(review): confirm against the current Bedrock price list.
        "input_price_per_ktok": 0.00025,
        "output_price_per_ktok": 0.00125,
        # NOTE(review): the dataset name says "haiku-3-5" while the model id is
        # Claude 3 Haiku. If 3.5 Haiku was the intended model, change the model
        # id and name and restore the 3.5 prices instead.
        "dataset_name": "blynx-dataset-haiku-3-5"
        # Alternative dataset: "dataset_name": "dummy-dataset-haiku-3-5"
    }
]

# Evaluators applied to every model. Built once, since the list does not
# depend on the model being evaluated.
EVALUATORS = [
    logical_coherence_evaluator,
    faithfulness_evaluator,
    following_instructions_evaluator,
    completeness_evaluator,
    correctness_evaluator,
    helpfulness_evaluator,
    professional_tone_evaluator,
    relevance_evaluator,
    stereotyping_evaluator,
    harmfulness_evaluator,
    refusal_evaluator,
    tps_evaluator,
    ttft_evaluator,
    generation_time_evaluator,
    input_tokens_evaluator,
    output_tokens_evaluator,
    cost_evaluator,
]


def _make_model_tagger(model_name: str):
    """Return a post-processing step that stamps ``model_name`` onto a result dict.

    The name is bound when the step is created, so it cannot drift if the
    step runs after the loop variable has moved on to another model.
    """
    def add_model_info(result: dict) -> dict:
        result["model_name"] = model_name
        return result

    return add_model_info


# Result of each experiment, keyed by model name, so earlier runs are not
# lost when the loop moves on to the next model.
experiment_results_by_model = {}

for config in model_configs:
    # Build the target bound to this model and its pricing.
    model_target = create_target_with_model(config["model"], config)

    # Post-processing step: tag each result with the model that produced it.
    add_model_info = _make_model_tagger(config["name"])
    pipeline = RunnableLambda(model_target) | RunnableLambda(add_model_info)

    # Run the evaluation for this model.
    experiment_results = client.evaluate(
        pipeline,
        data=config["dataset_name"],
        evaluators=EVALUATORS,
        experiment_prefix=config["name"],
        max_concurrency=2,
    )
    experiment_results_by_model[config["name"]] = experiment_results

    print(f"{config['name']} evaluation completed!")

View the evaluation results for experiment: 'claude-3-7-sonnet-bc180e0e' at:
https://smith.langchain.com/o/e6a0f2b6-1ca3-5300-ba13-c04e59aa4f5b/datasets/6494c41f-c31f-48ae-b670-a8e8be84f9fb/compare?selectedSessions=bb569416-830a-4a1a-8127-b71d524a80c5




1it [04:40, 280.04s/it]


claude-3-7-sonnet evaluation completed!
View the evaluation results for experiment: 'claude-3-haiku-fc0110a7' at:
https://smith.langchain.com/o/e6a0f2b6-1ca3-5300-ba13-c04e59aa4f5b/datasets/c0e76c89-b219-4d87-a634-fa94850d0c7f/compare?selectedSessions=3ac35a17-a5bc-4b54-8295-c1146d9821ba




1it [04:04, 244.48s/it]

claude-3-haiku evaluation completed!



