In [1]:
import openai
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import os

# .env 파일에서 OpenAI API 키 로드
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
# CSV 파일 읽기
csv_file_path = 'qa_results.csv'
df = pd.read_csv(csv_file_path)

# 평가 함수 정의
def evaluate_conversation(question, gpt4o_answer, gemma2_fine_answer):
    messages = [
        {"role": "system", "content": "당신은 대화 평가 전문가입니다."},
        {"role": "user", "content": f"""
        주어진 질문과 두 대답(GPT-4.0 및 Gemma2.0)을 평가해주세요. 평가 기준은 정확성, 관련성, 완전성, 명확성 네 가지입니다.

        질문: "{question}"

        GPT-4.0의 답변: "{gpt4o_answer}"
        Gemma2.0의 답변: "{gemma2_fine_answer}"

        각 평가 기준에 대해 점수를 매겨주세요 (1-10):
        1. 정확성 (Accuracy): 
        2. 관련성 (Relevance): 
        3. 완전성 (Completeness): 
        4. 명확성 (Clarity): 
        """}
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=1024
    )

    return response.choices[0].message['content'].strip()

# 대화 평가
results = []
accuracy_scores = []
relevance_scores = []
completeness_scores = []
clarity_scores = []

for index, row in tqdm(df.iterrows(), total=len(df)):
    question = row['question']
    gpt4o_answer = row['gpt4oAnswer']
    gemma2_fine_answer = row['gemma2FineAnswer']
    
    evaluation = evaluate_conversation(question, gpt4o_answer, gemma2_fine_answer)
    results.append(evaluation)

    # 점수 추출
    lines = evaluation.split('\n')
    if len(lines) >= 5:
        try:
            accuracy = int(lines[1].split(': ')[1])
            relevance = int(lines[2].split(': ')[1])
            completeness = int(lines[3].split(': ')[1])
            clarity = int(lines[4].split(': ')[1])
        except (IndexError, ValueError) as e:
            print(f"Error parsing evaluation for index {index}: {e}")
            accuracy = relevance = completeness = clarity = 0
    else:
        print(f"Incomplete evaluation response for index {index}")
        accuracy = relevance = completeness = clarity = 0
    
    accuracy_scores.append(accuracy)
    relevance_scores.append(relevance)
    completeness_scores.append(completeness)
    clarity_scores.append(clarity)

# 평가 결과 저장
df['evaluation'] = results
df['accuracy'] = accuracy_scores
df['relevance'] = relevance_scores
df['completeness'] = completeness_scores
df['clarity'] = clarity_scores
df.to_csv('qa_evaluation_results.csv', index=False)

# 화면에 점수 출력
print("\n평가 완료. 평가 결과는 'qa_evaluation_results.csv' 파일에 저장되었습니다.")
print(f"정확성 (평균): {sum(accuracy_scores) / len(accuracy_scores):.2f}")
print(f"관련성 (평균): {sum(relevance_scores) / len(relevance_scores):.2f}")
print(f"완전성 (평균): {sum(completeness_scores) / len(completeness_scores):.2f}")
print(f"명확성 (평균): {sum(clarity_scores) / len(clarity_scores):.2f}")


  1%|          | 1/100 [00:07<12:55,  7.83s/it]

Error parsing evaluation for index 0: list index out of range


  2%|▏         | 2/100 [00:12<10:03,  6.16s/it]

Error parsing evaluation for index 1: list index out of range


  3%|▎         | 3/100 [00:17<09:04,  5.61s/it]

Error parsing evaluation for index 2: list index out of range


  4%|▍         | 4/100 [00:23<09:11,  5.75s/it]

Error parsing evaluation for index 3: list index out of range


  5%|▌         | 5/100 [00:30<09:26,  5.96s/it]

Error parsing evaluation for index 4: invalid literal for int() with base 10: '9 - 다양한 사이즈 선택에 대해 다양한 방법을 제안하고 있으며, 실제 매장 방문이 가장 좋은 방법이라는 점도 강조하고 있습니다.'


  6%|▌         | 6/100 [00:34<08:46,  5.60s/it]

Error parsing evaluation for index 5: list index out of range


  7%|▋         | 7/100 [00:42<09:45,  6.30s/it]

Error parsing evaluation for index 6: list index out of range


  8%|▊         | 8/100 [00:45<08:10,  5.34s/it]

Error parsing evaluation for index 7: list index out of range


 10%|█         | 10/100 [00:56<08:22,  5.58s/it]

Error parsing evaluation for index 9: list index out of range


 11%|█         | 11/100 [01:00<07:09,  4.83s/it]

Error parsing evaluation for index 10: list index out of range


 12%|█▏        | 12/100 [01:04<06:54,  4.71s/it]

Error parsing evaluation for index 11: list index out of range


 13%|█▎        | 13/100 [01:13<08:35,  5.93s/it]

Error parsing evaluation for index 12: list index out of range


 14%|█▍        | 14/100 [01:15<07:03,  4.92s/it]

Error parsing evaluation for index 13: list index out of range


 15%|█▌        | 15/100 [01:19<06:22,  4.50s/it]

Error parsing evaluation for index 14: list index out of range


 16%|█▌        | 16/100 [01:22<05:52,  4.20s/it]

Error parsing evaluation for index 15: list index out of range


 17%|█▋        | 17/100 [01:24<04:57,  3.58s/it]

Error parsing evaluation for index 16: list index out of range


 18%|█▊        | 18/100 [01:27<04:37,  3.39s/it]

Error parsing evaluation for index 17: list index out of range


 19%|█▉        | 19/100 [01:30<04:18,  3.19s/it]

Error parsing evaluation for index 18: list index out of range


 20%|██        | 20/100 [01:34<04:26,  3.33s/it]

Error parsing evaluation for index 19: list index out of range


 21%|██        | 21/100 [01:37<04:17,  3.25s/it]

Error parsing evaluation for index 20: list index out of range


 22%|██▏       | 22/100 [01:45<06:06,  4.70s/it]

Error parsing evaluation for index 21: list index out of range


 23%|██▎       | 23/100 [01:48<05:21,  4.18s/it]

Error parsing evaluation for index 22: list index out of range


 24%|██▍       | 24/100 [01:51<04:55,  3.89s/it]

Error parsing evaluation for index 23: list index out of range


 25%|██▌       | 25/100 [01:56<05:11,  4.16s/it]

Error parsing evaluation for index 24: list index out of range


 26%|██▌       | 26/100 [02:00<04:57,  4.03s/it]

Error parsing evaluation for index 25: list index out of range


 27%|██▋       | 27/100 [02:06<05:34,  4.58s/it]

Error parsing evaluation for index 26: list index out of range


 28%|██▊       | 28/100 [02:09<05:16,  4.39s/it]

Error parsing evaluation for index 27: list index out of range


 29%|██▉       | 29/100 [02:14<05:11,  4.39s/it]

Error parsing evaluation for index 28: list index out of range


 30%|███       | 30/100 [02:18<04:53,  4.20s/it]

Error parsing evaluation for index 29: list index out of range


 31%|███       | 31/100 [02:21<04:26,  3.86s/it]

Error parsing evaluation for index 30: list index out of range


 32%|███▏      | 32/100 [02:28<05:26,  4.80s/it]

Error parsing evaluation for index 31: list index out of range


 33%|███▎      | 33/100 [02:30<04:31,  4.05s/it]

Error parsing evaluation for index 32: list index out of range


 34%|███▍      | 34/100 [02:34<04:17,  3.91s/it]

Error parsing evaluation for index 33: list index out of range


 35%|███▌      | 35/100 [02:37<03:56,  3.64s/it]

Error parsing evaluation for index 34: list index out of range


 36%|███▌      | 36/100 [02:39<03:39,  3.44s/it]

Error parsing evaluation for index 35: list index out of range


 37%|███▋      | 37/100 [02:43<03:36,  3.44s/it]

Error parsing evaluation for index 36: list index out of range


 38%|███▊      | 38/100 [02:50<04:41,  4.54s/it]

Error parsing evaluation for index 37: list index out of range


 39%|███▉      | 39/100 [02:54<04:21,  4.28s/it]

Error parsing evaluation for index 38: list index out of range


 40%|████      | 40/100 [03:00<04:59,  4.99s/it]

Error parsing evaluation for index 39: list index out of range


 41%|████      | 41/100 [03:04<04:33,  4.64s/it]

Error parsing evaluation for index 40: list index out of range


 42%|████▏     | 42/100 [03:07<03:54,  4.04s/it]

Error parsing evaluation for index 41: list index out of range


 43%|████▎     | 43/100 [03:10<03:35,  3.78s/it]

Error parsing evaluation for index 42: list index out of range


 44%|████▍     | 44/100 [03:14<03:39,  3.92s/it]

Error parsing evaluation for index 43: list index out of range


 45%|████▌     | 45/100 [03:20<04:08,  4.52s/it]

Error parsing evaluation for index 44: list index out of range


 46%|████▌     | 46/100 [03:24<03:54,  4.35s/it]

Error parsing evaluation for index 45: list index out of range


 47%|████▋     | 47/100 [03:28<03:37,  4.10s/it]

Error parsing evaluation for index 46: list index out of range


 48%|████▊     | 48/100 [03:30<03:08,  3.62s/it]

Error parsing evaluation for index 47: list index out of range


 49%|████▉     | 49/100 [03:35<03:24,  4.00s/it]

Error parsing evaluation for index 48: list index out of range


 50%|█████     | 50/100 [03:38<02:58,  3.58s/it]

Error parsing evaluation for index 49: list index out of range


 51%|█████     | 51/100 [03:41<02:48,  3.45s/it]

Error parsing evaluation for index 50: list index out of range


 52%|█████▏    | 52/100 [03:45<02:56,  3.68s/it]

Error parsing evaluation for index 51: list index out of range


 53%|█████▎    | 53/100 [03:49<03:00,  3.85s/it]

Error parsing evaluation for index 52: list index out of range


 54%|█████▍    | 54/100 [03:53<02:59,  3.91s/it]

Error parsing evaluation for index 53: list index out of range


 55%|█████▌    | 55/100 [04:01<03:41,  4.92s/it]

Error parsing evaluation for index 54: invalid literal for int() with base 10: 'GPT-4.0의 답변은 젖병의 청결 상태와 아기의 건강을 고려하여 정확한 정보를 제공하고 있습니다. 반면 Gemma2.0은 실제 위험을 감안하지 않고 경솔한 조언을 하고 있어 정확성이 낮습니다.'


 56%|█████▌    | 56/100 [04:03<02:58,  4.07s/it]

Error parsing evaluation for index 55: list index out of range


 57%|█████▋    | 57/100 [04:06<02:43,  3.81s/it]

Error parsing evaluation for index 56: list index out of range


 58%|█████▊    | 58/100 [04:10<02:38,  3.78s/it]

Error parsing evaluation for index 57: list index out of range


 59%|█████▉    | 59/100 [04:12<02:24,  3.52s/it]

Error parsing evaluation for index 58: list index out of range


 60%|██████    | 60/100 [04:19<02:55,  4.38s/it]

Error parsing evaluation for index 59: list index out of range


 61%|██████    | 61/100 [04:22<02:38,  4.07s/it]

Error parsing evaluation for index 60: list index out of range


 62%|██████▏   | 62/100 [04:25<02:23,  3.77s/it]

Error parsing evaluation for index 61: list index out of range


 63%|██████▎   | 63/100 [04:27<01:59,  3.22s/it]

Error parsing evaluation for index 62: list index out of range


 64%|██████▍   | 64/100 [04:30<01:51,  3.11s/it]

Error parsing evaluation for index 63: list index out of range


 65%|██████▌   | 65/100 [04:33<01:50,  3.15s/it]

Error parsing evaluation for index 64: list index out of range


 66%|██████▌   | 66/100 [04:35<01:34,  2.78s/it]

Error parsing evaluation for index 65: list index out of range


 67%|██████▋   | 67/100 [04:38<01:32,  2.81s/it]

Error parsing evaluation for index 66: list index out of range


 68%|██████▊   | 68/100 [04:41<01:34,  2.94s/it]

Error parsing evaluation for index 67: list index out of range


 69%|██████▉   | 69/100 [04:45<01:38,  3.17s/it]

Error parsing evaluation for index 68: list index out of range


 70%|███████   | 70/100 [04:54<02:29,  4.98s/it]

Error parsing evaluation for index 69: list index out of range


 71%|███████   | 71/100 [04:57<02:07,  4.39s/it]

Error parsing evaluation for index 70: list index out of range


 72%|███████▏  | 72/100 [05:00<01:52,  4.02s/it]

Error parsing evaluation for index 71: list index out of range


 73%|███████▎  | 73/100 [05:05<01:56,  4.33s/it]

Error parsing evaluation for index 72: list index out of range


 74%|███████▍  | 74/100 [05:09<01:46,  4.11s/it]

Error parsing evaluation for index 73: list index out of range


 75%|███████▌  | 75/100 [05:12<01:36,  3.88s/it]

Error parsing evaluation for index 74: list index out of range


 76%|███████▌  | 76/100 [05:16<01:27,  3.67s/it]

Error parsing evaluation for index 75: list index out of range


 77%|███████▋  | 77/100 [05:19<01:22,  3.60s/it]

Error parsing evaluation for index 76: list index out of range


 78%|███████▊  | 78/100 [05:22<01:15,  3.43s/it]

Error parsing evaluation for index 77: list index out of range


 79%|███████▉  | 79/100 [05:29<01:35,  4.56s/it]

Error parsing evaluation for index 78: list index out of range


 80%|████████  | 80/100 [05:33<01:26,  4.32s/it]

Error parsing evaluation for index 79: list index out of range


 81%|████████  | 81/100 [05:41<01:42,  5.41s/it]

Error parsing evaluation for index 80: invalid literal for int() with base 10: '8 Gemma2.0'


 82%|████████▏ | 82/100 [05:44<01:24,  4.71s/it]

Error parsing evaluation for index 81: list index out of range


 83%|████████▎ | 83/100 [05:48<01:14,  4.40s/it]

Error parsing evaluation for index 82: list index out of range


 84%|████████▍ | 84/100 [05:56<01:28,  5.55s/it]

Error parsing evaluation for index 83: list index out of range


 85%|████████▌ | 85/100 [05:59<01:13,  4.88s/it]

Error parsing evaluation for index 84: list index out of range


 86%|████████▌ | 86/100 [06:03<01:05,  4.69s/it]

Error parsing evaluation for index 85: list index out of range


 87%|████████▋ | 87/100 [06:07<00:57,  4.43s/it]

Error parsing evaluation for index 86: list index out of range


 88%|████████▊ | 88/100 [06:10<00:47,  3.97s/it]

Error parsing evaluation for index 87: list index out of range


 89%|████████▉ | 89/100 [06:12<00:36,  3.31s/it]

Error parsing evaluation for index 88: list index out of range


 90%|█████████ | 90/100 [06:20<00:47,  4.75s/it]

Error parsing evaluation for index 89: list index out of range


 91%|█████████ | 91/100 [06:25<00:42,  4.71s/it]

Error parsing evaluation for index 90: list index out of range


 92%|█████████▏| 92/100 [06:28<00:34,  4.28s/it]

Error parsing evaluation for index 91: list index out of range


 93%|█████████▎| 93/100 [06:37<00:40,  5.80s/it]

Error parsing evaluation for index 92: list index out of range


 94%|█████████▍| 94/100 [06:41<00:30,  5.15s/it]

Error parsing evaluation for index 93: list index out of range


 95%|█████████▌| 95/100 [06:44<00:22,  4.52s/it]

Error parsing evaluation for index 94: list index out of range


 96%|█████████▌| 96/100 [06:48<00:17,  4.29s/it]

Error parsing evaluation for index 95: list index out of range


 97%|█████████▋| 97/100 [06:53<00:13,  4.58s/it]

Error parsing evaluation for index 96: list index out of range


 98%|█████████▊| 98/100 [06:55<00:07,  3.87s/it]

Error parsing evaluation for index 97: list index out of range


 99%|█████████▉| 99/100 [06:59<00:03,  3.85s/it]

Error parsing evaluation for index 98: list index out of range


100%|██████████| 100/100 [07:03<00:00,  4.24s/it]

Error parsing evaluation for index 99: list index out of range

평가 완료. 평가 결과는 'qa_evaluation_results.csv' 파일에 저장되었습니다.
정확성 (평균): 0.09
관련성 (평균): 0.10
완전성 (평균): 0.10
명확성 (평균): 0.09



