# 4. Experiment

## Load Environment Variables

In [1]:
from dotenv import load_dotenv

# Load environment variables from the local ".env" file.
# override=True is passed so that values from the file take precedence over
# variables already present in the process environment.
load_dotenv(dotenv_path=".env", override=True)

True

## Setup

### Langfuse

In [2]:
from langfuse import get_client
from langfuse.langchain import CallbackHandler

# Shared Langfuse client and the LangChain callback handler used for tracing.
langfuse = get_client()
langfuse_handler = CallbackHandler()

# Check the connection up front so credential problems show up in this cell.
print(
    "Langfuse client is authenticated and ready!"
    if langfuse.auth_check()
    else "Authentication failed. Please check your credentials and host."
)

Langfuse client is authenticated and ready!


### LLM

Set up the LLM chat model.

In [3]:
import os
from langchain_openai import ChatOpenAI

# Read the model settings from environment variables.
# os.environ[...] raises KeyError when a variable is missing, so a
# misconfigured environment fails in this cell rather than later.
model_name = os.environ["MODEL_NAME"]
openai_api_key = os.environ["OPENAI_API_KEY"]
openai_api_base = os.environ["OPENAI_API_BASE"]

# Initialize the chat model; the `explain` node calls it to generate answers.
llm = ChatOpenAI(
    model_name=model_name,
    openai_api_key=openai_api_key,
    openai_api_base=openai_api_base,
)

### Tavily
Let's set up a tool called Tavily to allow our assistant to search the web when answering.  
Go to the [Tavily website](https://app.tavily.com/) and get an API key.

In [4]:
from langchain_tavily import TavilySearch

# Configure the Tavily search tool (at most 1 result per query).
web_search_tool = TavilySearch(max_results=1)

## Prompt

Let's design a prompt for RAG that we'll use throughout the notebook.

In [5]:
# RAG prompt template. The {question} and {context} placeholders are filled in
# with str.format() by the `explain` node.
prompt = """You are a professor and expert in explaining complex topics in a way that is easy to understand. 
Your job is to answer the provided question so that even a 5 year old can understand it. 
You have provided with relevant background context to answer the question.

Question: {question} 

Context: {context}

Answer:"""
# Show the raw template (placeholders still unfilled).
print("Prompt Template: ", prompt)

Prompt Template:  You are a professor and expert in explaining complex topics in a way that is easy to understand. 
Your job is to answer the provided question so that even a 5 year old can understand it. 
You have provided with relevant background context to answer the question.

Question: {question} 

Context: {context}

Answer:


## Application Using LangGraph
Let's define the State for our Graph. We'll track the user's question, the messages generated by our application, and the list of relevant documents.

In [6]:
from langchain.schema import Document
from typing import List
from typing_extensions import TypedDict

class GraphState(TypedDict):
    """
    State shared by the nodes of the graph.
    """
    # The user's question; read by both the `search` and `explain` nodes.
    question: str
    # Retrieved context for the question.
    # NOTE(review): annotated as List[str], but the `search` node appends
    # Document objects and `explain` reads `.page_content` — confirm and tighten.
    documents: List[str]
    # Output of the `explain` node.
    # NOTE(review): annotated as List[str], but `explain` stores the object
    # returned by `llm.invoke` — confirm and tighten.
    messages: List[str]

Great, now let's define the nodes of our graph.

In [7]:
from langchain_core.messages import HumanMessage

def search(state):
    """
    Run a web search for the question and add the result to the documents.

    Args:
        state (dict): Current graph state; must contain the "question" key.

    Returns:
        dict: State update with "question" and a "documents" list holding the
        previously collected documents plus one Document built from the
        search results.
    """
    question = state["question"]

    # Run the web search.
    web_docs = web_search_tool.invoke({"query": question})

    # Merge the content of every hit into a single Document.
    web_results = Document(
        page_content="\n".join(d["content"] for d in web_docs["results"])
    )

    # Build a new list instead of appending in place: the list obtained from
    # the incoming state must not be mutated, otherwise earlier state (or a
    # caller-supplied list) changes behind the graph's back.
    documents = [*state.get("documents", []), web_results]

    return {"documents": documents, "question": question}


def explain(state: GraphState):
    """
    Generate an answer to the question, using the documents as context.

    Args:
        state (dict): Current graph state.

    Returns:
        dict: State update with the question and a "messages" list that
        contains the LLM response.
    """
    question = state["question"]

    # Concatenate the text of all collected documents into one context string.
    context_text = "\n".join(
        doc.page_content for doc in state.get("documents", [])
    )

    user_message = HumanMessage(
        content=prompt.format(question=question, context=context_text)
    )
    llm_response = llm.invoke([user_message])

    return {"question": question, "messages": [llm_response]}

In [8]:
from langgraph.graph import StateGraph, START, END

# Create the state graph.
graph = StateGraph(GraphState)

# Register the nodes.
for node_name, node_fn in (("explain", explain), ("search", search)):
    graph.add_node(node_name, node_fn)

# Wire the linear flow: START -> search -> explain -> END.
for source, target in ((START, "search"), ("search", "explain"), ("explain", END)):
    graph.add_edge(source, target)

# Compile the graph into a runnable application.
app = graph.compile()

### Define Evaluators

#### Custom Code Evaluator

We'll first define a custom code evaluator. Custom code evaluators are useful for measuring deterministic or close-ended metrics.

In [10]:
def conciseness(outputs, max_words: int = 200) -> bool:
    """Check whether a generated answer stays within a word budget.

    Args:
        outputs (str): The generated answer text.
        max_words (int): Maximum number of words allowed. Defaults to 200.

    Returns:
        bool: True if the answer has at most `max_words` words.
    """
    # split() with no argument splits on any run of whitespace, so newlines,
    # tabs, repeated spaces and leading/trailing spaces do not distort the
    # count (split(" ") would treat "a\nb" as one word and "a  b" as three).
    return len(outputs.split()) <= max_words

This particular custom code evaluator is a simple Python function that checks if our application produces outputs that are less than or equal to 200 words long.

#### LLM-as-a-Judge Evaluator

For open-ended metrics, it can be powerful to use an LLM to score the outputs.

Let's use an LLM to check whether our application produces correct outputs. First, let's define a scoring schema for our LLM to adhere to in its response.

In [25]:
from pydantic import BaseModel, Field


# Define a scoring schema that our LLM must adhere to
class CorrectnessScore(BaseModel):
    """Correctness score of the answer when compared to the reference answer."""

    # The field is an int and the caller treats only exactly 1 as "correct",
    # so the description spells out the binary scale instead of a range that
    # could invite fractional values.
    score: int = Field(
        description=(
            "The correctness of the answer as a binary integer: "
            "1 if the answer is correct, 0 if it is not"
        )
    )

We'll define a function to give an LLM our application's outputs, alongside the reference outputs stored in our dataset. 

The LLM will then be able to reference the "right" output to judge if our application's answer meets our accuracy standards.

In [26]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage


def correctness(question, output, reference_output) -> bool:
    """Ask an LLM judge whether `output` is a correct answer to `question`.

    Args:
        question: The input that was given to the application.
        output: The answer produced by the application.
        reference_output: The reference answer the judge compares against.

    Returns:
        bool: True only when the judge's structured score equals 1.
    """
    judge_template = """
    You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:

    <Rubric>
        A correct answer:
        - Provides accurate information
        - Uses suitable analogies and examples
        - Contains no factual errors
        - Is logically consistent

        When scoring, you should penalize:
        - Factual errors
        - Incoherent analogies and examples
        - Logical inconsistencies
    </Rubric>

    <Instructions>
        - Carefully read the input and output
        - Use the reference output to determine if the model output contains errors
        - Focus whether the model output uses accurate analogies and is logically consistent
    </Instructions>

    <Reminder>
        The analogies in the output do not need to match the reference output exactly. Focus on logical consistency.
    </Reminder>

    <input>
        {question}
    </input>

    <output>
        {output}
    </output>

    Use the reference outputs below to help you evaluate the correctness of the response:
    <reference_outputs>
        {reference_output}
    </reference_outputs>
    """
    judge_prompt = judge_template.format(
        question=question,
        output=output,
        reference_output=reference_output,
    )

    # temperature=0 is passed to the judge model; its reply is parsed into
    # the CorrectnessScore schema.
    judge_llm = ChatOpenAI(
        model_name=model_name,
        openai_api_key=openai_api_key,
        openai_api_base=openai_api_base,
        temperature=0,
    )
    verdict = judge_llm.with_structured_output(CorrectnessScore).invoke(
        [HumanMessage(content=judge_prompt)]
    )
    return verdict.score == 1

## Run Experiment

We have all the necessary components, so let's run our experiment! 

In [27]:
# Step 1: Define evaluation dataset
# Name of the Langfuse dataset to evaluate against.
# NOTE(review): "exmaple" looks like a typo for "example"; the string must
# match the dataset name stored in Langfuse, so confirm before renaming.
dataset_name = "dataset-exmaple"
# Fetch the dataset through the Langfuse client.
dataset = langfuse.get_dataset(name=dataset_name)

In [28]:
# Step 2: Define run
# Assume 'run' is your instrumented application function
def run(question):
    """Run the graph application for one question inside a Langfuse observation.

    Args:
        question: The question to send through the graph.

    Returns:
        The text content of the first message produced by the graph.
    """
    with langfuse.start_as_current_generation(name="qna-llm-call") as generation:
        # Attach the Langfuse callback handler so the graph's node and LLM
        # calls are recorded in the trace; without it only this outer
        # observation is captured.
        response = app.invoke(
            {"question": question},
            config={"callbacks": [langfuse_handler]},
        )
        output = response["messages"][0].content
        # Update the trace with the input and output.
        generation.update_trace(
            input=question,
            output=output,
        )
        return output

In [29]:
# Step 3: Run evaluation
from uuid import uuid4

# A short random suffix keeps run names unique across repeated executions.
suffix = str(uuid4())[:8]
current_run_name = f"demo-run-{suffix}"  # Identifies this specific evaluation run


for item in dataset.items:
    # item.run() is used as a context manager; root_span is the object scores
    # are attached to for this item and run.
    with item.run(
        run_name=current_run_name, run_description="Evaluation run for tutorial"
    ) as root_span:
        # Call the application logic.
        question = item.input["question"]
        generated_answer = run(question=question)

        # Both evaluators return a bool; convert explicitly to 1.0 / 0.0 so
        # the value sent matches the declared NUMERIC data type.
        conciseness_score = conciseness(generated_answer)
        root_span.score(
            name="conciseness",
            value=float(conciseness_score),
            data_type="NUMERIC",
        )

        correctness_score = correctness(
            question=question,
            output=generated_answer,
            reference_output=item.expected_output["text"],
        )
        root_span.score(
            name="correctness",
            value=float(correctness_score),
            data_type="NUMERIC",
        )

# Send any queued traces and scores now instead of leaving them buffered.
langfuse.flush()