In [1]:
from dotenv import load_dotenv

# Load environment variables from a .env file (python-dotenv) before anything
# below reads os.environ.
load_dotenv()

True

In [2]:
import os


def mask_secret(value, visible=4):
    """Return a display-safe form of a secret value.

    Args:
        value: The secret string, or ``None``/empty when it is not configured.
        visible: Number of trailing characters that may be shown.

    Returns:
        ``"<not set>"`` for a missing value, ``"***"`` when the value is too
        short to reveal anything safely, otherwise ``"***"`` followed by the
        last ``visible`` characters.
    """
    if not value:
        return "<not set>"
    # value[-0:] would be the whole string, so a non-positive width hides everything.
    if visible <= 0 or len(value) <= visible * 2:
        return "***"
    return "***" + value[-visible:]


# Never print the raw key: cell outputs are stored inside the notebook file and
# travel with it whenever the notebook is committed or shared.
print(f"[API KEY]\n{mask_secret(os.environ.get('OPENAI_API_KEY'))}")
print(os.environ.get("LANGCHAIN_TRACING_V2", "<not set>"))

[API KEY]
<redacted: an OPENAI_API_KEY value was exposed in this saved output; revoke and rotate that key>
true


In [3]:
import torch
import os
import uuid

from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts.chat import PromptTemplate, ChatPromptTemplate

import numpy as np
import matplotlib.pyplot as plt

from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline
import transformers

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from langchain_community.chat_models import ChatOllama
from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.callbacks.manager import CallbackManager

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from bert_score import score
# import geval
# import semscore
from transformers import BartForConditionalGeneration, BartTokenizer
import json

# Load saved results from a JSON file.
def load_results(filepath):
    """Parse the UTF-8 encoded JSON file at ``filepath`` and return its content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Compute BERTScore F1 values.
def calculate_bertscore(candidates, references):
    """Return the BERTScore F1 values of ``candidates`` scored against ``references``.

    ``score`` is called with ``lang="ko"`` and ``rescale_with_baseline=True``;
    the precision and recall values it also returns are discarded.
    """
    _precision, _recall, f1 = score(
        candidates,
        references,
        lang="ko",
        rescale_with_baseline=True,
    )
    return f1

# Compute G-Eval scores.
def calculate_geval(references, candidates):
    """Return the ``'scores'`` entry of ``geval.evaluate(references, candidates)``.

    NOTE(review): ``import geval`` is commented out in this cell, so calling
    this function raises ``NameError`` unless ``geval`` is imported elsewhere.
    """
    return geval.evaluate(references, candidates)['scores']

# Compute SEMScore values.
def calculate_semscore(references, candidates):
    """Return whatever ``semscore.evaluate(references, candidates)`` produces.

    NOTE(review): ``import semscore`` is commented out in this cell, so calling
    this function raises ``NameError`` unless ``semscore`` is imported elsewhere.
    """
    return semscore.evaluate(references, candidates)

# Compute a BARTScore-style value for every candidate/reference pair.
def calculate_bartscore(references, candidates, model, tokenizer):
    """Score each pair by the likelihood of the reference given the candidate.

    The earlier version averaged the raw logits of a forward pass over the
    candidate alone and never read the reference, so its value did not measure
    agreement between the two texts. Here the candidate is the encoder input
    and the reference is the target sequence; the score is the negative mean
    token-level cross-entropy reported by the model, so it is <= 0 and higher
    means the reference is more likely under the model.

    Args:
        references: Iterable of reference (correct) answers.
        candidates: Iterable of generated answers, aligned with ``references``.
        model: Sequence-to-sequence model that accepts ``labels`` and returns an
            object with a ``loss`` attribute (e.g. ``BartForConditionalGeneration``).
        tokenizer: Callable returning a mapping that contains ``input_ids``.

    Returns:
        list[float]: One score per pair, in input order.
    """
    import torch  # local import so the function does not rely on another cell

    bart_scores = []
    for candidate, reference in zip(candidates, references):
        source = tokenizer(candidate, return_tensors="pt", max_length=512, truncation=True)
        target = tokenizer(reference, return_tensors="pt", max_length=512, truncation=True)
        # Inference only: no gradients are needed for scoring.
        with torch.no_grad():
            outputs = model(**source, labels=target["input_ids"])
        # The loss is the mean negative log-likelihood of the reference tokens.
        bart_scores.append(-outputs.loss.item())
    return bart_scores

# Compute metrics for every record and write the augmented records to JSON.
def calculate_metrics_and_update(filepath, output_filepath):
    """Add BERTScore and BARTScore values to saved results and print the means.

    Reads the records at ``filepath`` (each must provide ``'correct_answer'``
    and ``'generated_answer'``), stores the per-record values under
    ``'bertscore'`` and ``'bartscore'``, writes all records to
    ``output_filepath`` and prints the average of each metric.

    NOTE(review): a later cell of this notebook defines another function with
    this same name (the GPT-based evaluation). Once that cell has run, this
    definition is shadowed.

    Args:
        filepath: Path of the JSON file holding the result records.
        output_filepath: Path the augmented records are written to.
    """
    # Load the saved results.
    results = load_results(filepath)

    # Collect reference answers and generated answers in record order.
    references = [record['correct_answer'] for record in results]
    candidates = [record['generated_answer'] for record in results]

    # calculate_bertscore is declared as (candidates, references); pass the
    # arguments in that order. Converting to plain floats once keeps the JSON
    # serialisation and the average free of tensor objects.
    bert_scores = [float(value) for value in calculate_bertscore(candidates, references)]
    # G-Eval and SEMScore are disabled: their modules are not imported in this cell.

    # Prepare the BART model used for BARTScore.
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
    bart_scores = calculate_bartscore(references, candidates, model, tokenizer)

    # Attach the metric values to each record.
    for idx, record in enumerate(results):
        record['bertscore'] = bert_scores[idx]
        record['bartscore'] = bart_scores[idx]

    # Save the augmented records.
    with open(output_filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Compute and print the averages.
    avg_bertscore = sum(bert_scores) / len(bert_scores)
    avg_bartscore = sum(bart_scores) / len(bart_scores)

    print(f"BERTScore 평균 점수: {avg_bertscore:.4f}")
    print(f"BARTScore 평균 점수: {avg_bartscore:.4f}")

In [6]:
# Load the LLM (GPT-4o) used by the evaluation chain below; temperature is set to 0.0.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.0)

# Output parser used at the end of the evaluation chain.
class AnswerParser(StrOutputParser):
    """String output parser that trims surrounding whitespace from the reply."""

    def parse(self, output: str) -> str:
        """Return ``output`` without leading or trailing whitespace."""
        trimmed = output.strip()
        return trimmed

# Build the evaluation prompt template. It takes {question}, {correct_answer}
# and {generated_answer} and instructs the model to reply only 'Correct' or 'Fail'.
answer_prompt = ChatPromptTemplate.from_template("""
You will be given a question, the correct answer, and a generated answer.
Evaluate how well the generated answer matches the correct answer and provide a result.
The answer should be either 'Correct' or 'Fail'.

If the generated answer is identical to the correct answer, including any numbers, respond with 'Correct'.
If the generated answer contains any differences, even in numbers, respond with 'Fail'. 
Do not provide any other response.

Question: {question}
Correct Answer: {correct_answer}
Generated Answer: {generated_answer}

Evaluation: 
""")

# Build the chain: prompt -> LLM -> whitespace-stripping parser.
A_chain = answer_prompt | llm | AnswerParser()

# Run the GPT evaluation over every (question, correct answer, generated answer) triple.
def evaluate_answers_with_gpt(questions, correct_answers, generated_answers):
    """Grade each generated answer with ``A_chain`` and compute the accuracy.

    Args:
        questions: Sequence of question texts.
        correct_answers: Sequence of reference answers, aligned with ``questions``.
        generated_answers: Sequence of generated answers, aligned with ``questions``.

    Returns:
        tuple[list[str], float]: The verdicts returned by the chain, in input
        order, and the percentage of verdicts equal to ``'Correct'``. The
        percentage is ``0.0`` when nothing was evaluated.
    """
    evaluation_results = []
    correct_count = 0  # number of 'Correct' verdicts

    triples = zip(questions, correct_answers, generated_answers)
    for idx, (question, correct_answer, generated_answer) in enumerate(triples):
        inputs = {
            "question": question,
            "correct_answer": correct_answer,
            "generated_answer": generated_answer
        }

        # Progress: which item is being evaluated.
        print(f"Evaluating Question {idx + 1}/{len(questions)}...")

        # Ask the evaluation chain for a verdict.
        evaluation_result = A_chain.invoke(inputs)
        evaluation_results.append(evaluation_result)

        print(f"Result for Question {idx + 1}: {evaluation_result}")

        if evaluation_result == 'Correct':
            correct_count += 1

    # Divide by the number of verdicts actually produced: zip() stops at the
    # shortest input, and an empty input must not raise ZeroDivisionError.
    total_count = len(evaluation_results)
    accuracy = correct_count / total_count * 100 if total_count else 0.0

    return evaluation_results, accuracy

# Run the GPT evaluation for one results file and save the annotated records.
def calculate_metrics_and_update(filepath, output_filepath):
    """Evaluate saved answers with GPT, store the verdicts and print the accuracy.

    Each record read from ``filepath`` must provide ``'question'``,
    ``'correct_answer'`` and ``'generated_answer'``. The verdict for a record is
    stored under ``'gpt_evaluation'`` and all records are written to
    ``output_filepath``.
    """
    print("Loading results from JSON file...")
    results = load_results(filepath)

    # Column-wise views of the records, in record order.
    questions = [record['question'] for record in results]
    correct_answers = [record['correct_answer'] for record in results]
    generated_answers = [record['generated_answer'] for record in results]

    print("Starting evaluation with GPT...")
    evaluation_results, accuracy = evaluate_answers_with_gpt(
        questions, correct_answers, generated_answers
    )

    # Attach each verdict ('Correct' or 'Fail') to its record.
    for position in range(len(results)):
        results[position]['gpt_evaluation'] = evaluation_results[position]

    print(f"Saving results to {output_filepath}...")
    with open(output_filepath, 'w', encoding='utf-8') as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=4)

    print(f"정답률: {accuracy:.2f}%")

In [14]:
import json
# Result sets to evaluate; commented-out entries are skipped.
model_names = [
    # '768_First',
    'EM_MNRL_MRL',
    'EM_klue_nli',
    # 'EM_klue_vanilla',
    # 'EM_open_3',
    # 'bge_m3',
    # 'e5_large',
]
for mo_name in model_names:
    # Evaluate the saved results of one model.
    model_name = mo_name

    # NOTE(review): absolute, machine-specific paths; they only resolve on the
    # machine where this notebook was originally run.
    filepath = f'/home/choi/Git/RAG_con_doc/langchain/{model_name}_results.json'

    output_filepath = f'/home/choi/Git/RAG_con_doc/langchain/{model_name}_final_results_with_G-eval.json'

    # Compute the metrics and update the JSON file. Two functions in this
    # notebook share this name; the recorded output below ("Starting evaluation
    # with GPT...") shows the GPT-based one was the active definition.
    calculate_metrics_and_update(filepath, output_filepath)

Loading results from JSON file...
Starting evaluation with GPT...
Evaluating Question 1/1255...
Result for Question 1: Correct
Evaluating Question 2/1255...
Result for Question 2: Fail
Evaluating Question 3/1255...
Result for Question 3: Correct
Evaluating Question 4/1255...
Result for Question 4: Correct
Evaluating Question 5/1255...
Result for Question 5: Correct
Evaluating Question 6/1255...
Result for Question 6: Fail
Evaluating Question 7/1255...
Result for Question 7: Fail
Evaluating Question 8/1255...
Result for Question 8: Fail
Evaluating Question 9/1255...
Result for Question 9: Correct
Evaluating Question 10/1255...
Result for Question 10: Correct
Evaluating Question 11/1255...
Result for Question 11: Correct
Evaluating Question 12/1255...
Result for Question 12: Fail
Evaluating Question 13/1255...
Result for Question 13: Correct
Evaluating Question 14/1255...
Result for Question 14: Fail
Evaluating Question 15/1255...
Result for Question 15: Correct
Evaluating Question 16/12

In [17]:
# Load saved results from a JSON file (same loader as defined earlier in this notebook).
def load_results(filepath):
    """Open ``filepath`` as UTF-8 text and return the decoded JSON content."""
    with open(filepath, mode='r', encoding='utf-8') as source:
        return json.load(source)

# Compute the accuracy from saved GPT verdicts.
def calculate_accuracy_from_json(filepath):
    """Print and return the share of records judged ``'Correct'``.

    Args:
        filepath: Path of a JSON file holding a list of result records. A
            record counts as correct when its ``'gpt_evaluation'`` value equals
            ``'Correct'``; records without that key count as incorrect.

    Returns:
        float: Accuracy in percent, or ``0.0`` when the file has no records.
    """
    # Load the saved results.
    results = load_results(filepath)

    # Count the verdicts.
    total_count = len(results)
    correct_count = sum(1 for result in results if result.get('gpt_evaluation') == 'Correct')

    # An empty file has nothing to average; report 0.0 instead of raising
    # ZeroDivisionError.
    accuracy = (correct_count / total_count) * 100 if total_count else 0.0

    # Report the figures.
    print(f"총 질문 개수: {total_count}")
    print(f"정답 개수: {correct_count}")
    print(f"정답률: {accuracy:.2f}%")

    return accuracy

# Compute the accuracy from a saved evaluation file.
# NOTE(review): absolute, machine-specific path.
calculate_accuracy_from_json('/home/choi/Git/RAG_con_doc/langchain/EM_klue_nli_final_results_with_G-eval.json')

총 질문 개수: 1255
정답 개수: 638
정답률: 50.84%


50.83665338645418