### Pairwise Evaluation
- 두 개 이상의 LLM 생성물을 서로 비교한다.

In [None]:
from dotenv import load_dotenv

load_dotenv()

True

In [6]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

def evaluate_pairwise(runs: list, example) -> dict:

    # 점수 저장
    scores = {}
    for i, run in enumerate(runs):
        scores[run.id] = i
    
    # 각 예제에 대한 실행 쌍
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    grade_prompt = PromptTemplate.from_template(
        """
        You are an LLM judge. Compare the following two answers to a question and determine which one is better.
        Better answer is the one that is more detailed and informative.
        If the answer is not related to the question, it is not a good answer.

        
        # Question:
        {question}
        
        #Answer A: 
        {answer_a}
        
        #Answer B: 
        {answer_b}
        
        Output should be either `A` or `B`. Pick the answer that is better.
        
        #Preference:
        """
    )
    answer_grader = grade_prompt | llm | StrOutputParser()

    score = answer_grader.invoke(
        {
            "question" : question,
            "answer_a" : answer_a,
            "answer_b" : answer_b
        }
    )

    if score == "A": # A가 더 답변을 잘했다.
        scores[runs[0].id] = 1
        scores[runs[1].id] = 0
    elif score == "B": # B가 더 답변을 잘했다.
        scores[runs[0].id] = 0
        scores[runs[1].id] = 1
    else:
        scores[runs[0].id] = 0
        scores[runs[1].id] = 0

    return {"key" : "ranked preference", "scores": scores}
        

In [8]:
!pip install langchain-community

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp312-cp312-win_amd64.whl.metadata (9.9 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain-community)
  Downloading aiohttp-3.10.10-cp312-cp312-win_amd64.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.6 (from langchain-community)
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting numpy<2.0.0,>=1.26.0 (from langchain-community)
  Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Collecting aiohappyeye


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [9]:
from rag import PDFRAG
from langchain_openai import ChatOpenAI

def ask_question_with_llm(llm):

    rag = PDFRAG(
        "data/snow-white.pdf",
        llm
    )

    retriever = rag.create_retriever()

    rag_chain= rag.create_chain(retriever)

    def _ask_question(inputs:dict):
        context = retriever.invoke(inputs["question"])
        context = "\n".join([doc.page_content for doc in context])
        return {
            "question" : inputs["question"],
            "context" : context,
            "answer" : rag_chain.invoke(inputs["question"])
        }
    
    return _ask_question

In [10]:
from langchain_openai import ChatOpenAI

gpt3 = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

gpt3.invoke("안녕하세요?")

AIMessage(content='안녕하세요! 무엇을 도와드릴까요?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-a8d774bc-e137-45b6-b866-913df0cd60ea-0', usage_metadata={'input_tokens': 13, 'output_tokens': 21, 'total_tokens': 34, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

In [None]:
! ollama run gemma2:9b

- Ollama 사용 시 참고

In [12]:
from langchain_ollama import ChatOllama

# Ollama 모델을 불러옵니다.
ollama = ChatOllama(model="gemma2")

# Ollama 모델 호출
ollama.invoke("안녕하세요?")

ResponseError: model "gemma2" not found, try pulling it first

In [None]:
gpt4o_chain = ask_question_with_llm(ChatOpenAI(model="gpt-4o-mini", temperature=0))
gpt3_chain = ask_question_with_llm(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))

# Ollama 사용시
# Ollama_chain = ask_question_with_llm(ChatOllama(model=""))

In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

cot_qa_evaluator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": ChatOpenAI(model="gpt-4o-mini", temperature=0)}, # 평가자
    prepare_data=lambda run, example: {
        "prediction" : run.outputs["answer"],
        "reference" : run.outputs["context"],
        "input" : run.outputs["question"]
    }
)

dataset_name = "RAG_EVALUATION_DATASET"

experiment_result1 = evaluate(
    gpt3_chain,
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    experiment_prefix="MODEL_COMPARE_EVALUATION",
    metadata={
        "variant": "GPT-3.5-turbo 평가 (cot_qa)"
    }
)

experiment_result2 = evaluate(
    gpt4o_chain,
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    experiment_prefix="MODEL_COMPARE_EVALUATION",
    metadata={
        "variant": "GPT-4o-mini 평가 (cot_qa)"
    }
)

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-dc95194a' at:
https://smith.langchain.com/o/3d77296a-ff2a-4050-92a5-2134a9d7c218/datasets/e8076362-122d-4fee-bcdf-ee5d5f1567dd/compare?selectedSessions=239a1386-30a7-4fee-b466-7c0b37040259




0it [00:00, ?it/s]

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-1cb18cb8' at:
https://smith.langchain.com/o/3d77296a-ff2a-4050-92a5-2134a9d7c218/datasets/e8076362-122d-4fee-bcdf-ee5d5f1567dd/compare?selectedSessions=4318af91-f476-4de0-bae4-35c03228c69c




0it [00:00, ?it/s]

In [None]:
from langsmith.evaluation import evaluate_comparative

evaluate_comparative(
    ["MODEL_COMPARE_EVALUATION-beec8c68","MODEL_COMPARE_EVALUATION-4131ebe2"],

    # 평가자
    evaluators=[evaluate_pairwise]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/3d77296a-ff2a-4050-92a5-2134a9d7c218/datasets/e8076362-122d-4fee-bcdf-ee5d5f1567dd/compare?selectedSessions=74c2ab60-ec24-4320-b0c7-1866e1b91966%2Cc8c48bae-8f92-47d7-99d5-341dd2e8a293&comparativeExperiment=ac6d139d-4def-4365-88d1-6863978a767d




  0%|          | 0/5 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x1c2baeb1110>