In [None]:
import json
import pandas as pd
from tqdm import tqdm

task_type = "clustering"
file_name = "qwen_answer_" + task_type + "_set4.json"
with open(file_name, 'r', encoding='utf-8') as f:
    final_answer = json.load(f)

In [2]:
df_new_spotlight = pd.read_csv("df_new_clustering.csv")

In [3]:
import ast

for idx in range(len(df_new_spotlight)):
    df_new_spotlight["doc"][idx] = ast.literal_eval(df_new_spotlight["doc"][idx])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_new_spotlight["doc"][idx] = ast.literal_eval(df_new_spotlight["doc"][idx])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

In [None]:
# parse for the legal answer
import re
def extract_numbers(text):
    """提取文本中的所有数字"""
    numbers = re.findall(r'\d+', text)
    return [int(num) for num in numbers]

if task_type == "spotlight" or task_type == "comparison":
    for idx in range(len(df_new_spotlight)):
        type_query = df_new_spotlight["type"][idx]
        if type_query == "legal":
            answer = df_new_spotlight["answer"][idx]
            doc_lst = df_new_spotlight["doc"][idx]
            num_lst = extract_numbers(answer)
            answer_lst = []
            for num in num_lst:
                answer_lst.append(doc_lst[num-1])
            df_new_spotlight["answer"][idx] = answer_lst


In [5]:
instruction = 'We would like to request your feedback on the performance of the AI assistant in response to the user question displayed above according to the gold answer. Please use the following listed aspects and their descriptions as evaluation criteria: \n - Accuracy and Hallucinations: The assistant’s answer is semantically consistent with the gold answer; The numerical value and order need to be accurate, and there should be no hallucinations. \n- Completeness: Referring to the reference answers, the assistant’s answer should contain all the key points needed to answer the user’s question; further elaboration on these key points can be omitted. Please rate whether this answer is suitable for the question. Please note that the gold answer can be considered as a correct answer to the question. The assistant receives an overall score on a scale of 1 to 100, where a higher score indicates better overall performance.Please note that if the assistant’s answer and the gold answer fully meet the above criteria, its overall rating should be the full marks (100). Please first provide a comprehensive explanation of your evaluation, avoiding any potential bias.Then, output a line indicating the score of the Assistant. PLEASE OUTPUT WITH THE FOLLOWING FORMAT, WHERE THE SCORE IS A SCALE OF 1 TO 100 BY STRICTLY FOLLOWING THIS FORMAT: "[[score]]", FOR EXAMPLE "Rating: [[100]]": \n <Start Output>\nEvaluation evidence: your evluation explanation here, no more than 100 words Rating: [[score]]\n<End Output>\nNow, start your evaluation:'

In [6]:
import requests
def get_gpt_response(query):
    """发送最终提问，并获取 GPT 的回复"""
    conversation_history.append({"role": "user", "content": query})  # 加入提问
    
    data = {
        "model": "gpt-4",
        "messages": conversation_history,  # 传递完整的对话历史
        "temperature": 0
    }
    
    try:
        response = requests.post(url, headers=headers, data=json.dumps(data))
        response_json = response.json()
        
        if "choices" in response_json:
            answer = response_json["choices"][0]["message"]["content"]
            conversation_history.append({"role": "assistant", "content": answer})
            print("> Answer: ", answer)
            return answer
        else:
            print("⚠️ API 响应异常:", response_json)
            return "API_ERROR"
    
    except requests.exceptions.RequestException as e:
        print(f"⚠️ 请求异常: {e}")
        return "REQUEST_ERROR"
    except json.JSONDecodeError:
        print("⚠️ JSON 解析错误")
        return "JSON_ERROR"



url = "https://gpt-api.hkust-gz.edu.cn/v1/chat/completions"
headers = { 
    "Content-Type": "application/json", 
    "Authorization": "67d8c990fa5e43f0b4607a6a23c84db82b5f66691dbe42e0bd81f12a199abc26"
}

# 存储对话历史
conversation_history = [
    {"role": "system", "content": "你是一个有帮助的助手。"}  # 设定初始角色
]

def add_context(text):
    """向 GPT 添加多段长文本作为上下文"""
    conversation_history.append({"role": "user", "content": text})

In [11]:
df_new_spotlight_set1 = df_new_spotlight[df_new_spotlight["set"] == 4].reset_index(False)

In [12]:
len(final_answer), len(df_new_spotlight_set1)

(88, 88)

In [None]:
score = []
for idx in range(0, len(df_new_spotlight_set1)):
    conversation_history.clear()  # 清空对话历史
    question = df_new_spotlight_set1["question"][idx]
    gold_answer = str(df_new_spotlight_set1["answer"][idx])
    type = df_new_spotlight_set1["type"][idx]
    llm_predict = final_answer[idx]
    add_context("[The given question]: " + question)
    if type == "legal":
        if task_type == "spotlight" or task_type == "comparison":
            add_context("[Gold Answer]: " + gold_answer)
        else:
            add_context("[Gold Answer]: " + gold_answer + " 其中序号对应的判决书列表如下： " + str(df_new_spotlight_set1["doc"][idx]))
    else:
        add_context("[Gold Answer]: " + gold_answer)
    add_context("[The start of Assistant's predicted Answer]\n" + llm_predict + "[The End of Assistant’s Predicted Answer]")
    response = get_gpt_response(instruction)
    score.append(response)

> Answer:  <Start Output>
Evaluation evidence: The assistant's answer is incomplete and inaccurate compared to the gold answer. It fails to list any references, whereas the gold answer includes five references. Additionally, the assistant's citation list is incorrect, missing two citations from the gold answer and including two that are not present in the gold answer. The assistant's response also contains hallucinations, as it lists citations that are not in the gold answer. Therefore, the assistant's response lacks accuracy and completeness. Rating: [[30]]
<End Output>


In [None]:
score_num = []
for output in score:
    score_num.append(extract_numbers(output)[-1])

In [None]:
def average(lst):
    return sum(lst) / len(lst) if lst else 0  # 避免除零错误

In [None]:
average(score_num)

71.7132867132867

In [None]:
def calculate_percentage(lst):
    count_100 = lst.count(100)
    total = len(lst)
    percentage = (count_100 / total) * 100 if total > 0 else 0
    return f"{percentage:.2f}%"

print(calculate_percentage(score_num))


31.47%
