In [None]:
import pandas as pd
import numpy as np
import json

import torch
import transformers

from concurrent.futures import ThreadPoolExecutor
import requests

import os
from openai import OpenAI

pd.set_option('display.max_columns', None)

from collections import defaultdict
from rouge import Rouge
import spacy
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
nlp = spacy.load('zh_core_web_sm')
import nltk
from transformers import BartTokenizer, BartForConditionalGeneration
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.nist_score import sentence_nist
from bert_score import BERTScorer,score
from sacrebleu.metrics import TER
import textstat

In [None]:
def calculate_embedding_scores(
    df,
    nlp,
    candidate_cols=('1.5B-方法1', '1.5B-方法2', '7B-方法1', '7B-方法2', '72B-方法1', '72B-方法2'),
    reference_col='标准答案',
):
    """Score every candidate column against the reference by embedding-average cosine similarity.

    Each text is passed through ``nlp``, the ``.vector`` of every resulting
    token is averaged, and the candidate's mean vector is compared with the
    reference's mean vector using ``cosine_similarity``.

    Args:
        df: DataFrame holding ``reference_col`` and every column in
            ``candidate_cols``.
        nlp: Callable that turns a text into an iterable of tokens exposing
            ``.vector`` (the notebook passes a spaCy pipeline).
        candidate_cols: Columns containing the answers to evaluate.
        reference_col: Column containing the reference answer.

    Returns:
        dict mapping ``'<col>-Embedding Average score'`` to a list with one
        similarity per row of ``df``, in row order. A row whose reference or
        candidate text yields no tokens gets ``nan``.
    """
    def _mean_vector(text):
        # None marks a text without tokens; np.mean([]) would otherwise give
        # NaN, which cosine_similarity rejects.
        vectors = [token.vector for token in nlp(text)]
        return np.mean(vectors, axis=0) if vectors else None

    embedding_avg_scores = {f'{col}-Embedding Average score': [] for col in candidate_cols}
    # The reference embedding is shared by all candidate columns: compute it once per row.
    reference_embeddings = [_mean_vector(text) for text in df[reference_col]]
    for col in candidate_cols:
        column_scores = embedding_avg_scores[f'{col}-Embedding Average score']
        for reference_embedding, candidate_text in zip(reference_embeddings, df[col]):
            candidate_embedding = _mean_vector(candidate_text)
            if reference_embedding is None or candidate_embedding is None:
                column_scores.append(float('nan'))
                continue
            column_scores.append(
                cosine_similarity([reference_embedding], [candidate_embedding])[0][0]
            )
    return embedding_avg_scores

# Embedding-average scores for the multi-column frame.
# NOTE(review): `data_2` is not created in any visible cell — confirm it is loaded earlier.
embedding_avg_scores = calculate_embedding_scores(df=data_2, nlp=nlp)

In [None]:
def calculate_bert_scores(
    df,
    candidate_cols=('1.5B-方法1', '1.5B-方法2', '7B-方法1', '7B-方法2', '72B-方法1', '72B-方法2'),
    reference_col='标准答案',
):
    """Compute the BERTScore recall of every candidate column against the reference.

    The result dict is created locally and each column is stored under the
    single key ``'<col>-BERT'``, so the function no longer depends on a
    pre-existing global ``bert_scores`` or on two different key spellings.

    Args:
        df: DataFrame holding ``reference_col`` and every column in
            ``candidate_cols``.
        candidate_cols: Columns containing the answers to evaluate.
        reference_col: Column containing the reference answer.

    Returns:
        dict mapping ``'<col>-BERT'`` to a list with one recall value per row
        of ``df``, in row order (empty lists for an empty frame).
    """
    bert_scores = {f'{col}-BERT': [] for col in candidate_cols}
    if len(df) == 0:
        # Nothing to score: skip constructing the scorer altogether.
        return bert_scores

    bertscorer = BERTScorer(lang="zh", rescale_with_baseline=True)
    references = df[reference_col].tolist()
    for col in candidate_cols:
        # One batched call per column; score() returns (P, R, F1) and only the
        # recall is kept, as in the per-row version this replaces.
        _, recall, _ = bertscorer.score(df[col].tolist(), references)
        bert_scores[f'{col}-BERT'] = recall.tolist()
    return bert_scores

In [None]:
def calculate_bert_scores(df, candidate_col='概率-原始输出', reference_col='标准答案'):
    """Compute the BERTScore recall of one answer column against the reference.

    Args:
        df: DataFrame holding ``reference_col`` and ``candidate_col``.
        candidate_col: Column containing the answers to evaluate. Exposed as a
            parameter so other answer columns can be scored without
            redefining the function.
        reference_col: Column containing the reference answer.

    Returns:
        ``{'AI答案-BERT': [...]}`` with one recall value per row of ``df``, in
        row order (an empty list for an empty frame).
    """
    bert_scores = {'AI答案-BERT': []}
    if len(df) == 0:
        # Nothing to score: skip constructing the scorer altogether.
        return bert_scores

    bertscorer = BERTScorer(lang="zh", rescale_with_baseline=True)
    # A single batched call instead of one model invocation per row;
    # score() returns (P, R, F1) and only the recall is kept.
    _, recall, _ = bertscorer.score(df[candidate_col].tolist(), df[reference_col].tolist())
    bert_scores['AI答案-BERT'] = recall.tolist()
    return bert_scores
# BERTScore recall for the answer column handled by the function defined in this cell.
bert_scores = calculate_bert_scores(df=data)

In [None]:
def calculate_embedding_scores(df, nlp, candidate_col='概率-原始输出', reference_col='标准答案'):
    """Embedding-average cosine similarity between one answer column and the reference.

    Each text is passed through ``nlp``, the ``.vector`` of every resulting
    token is averaged, and the two mean vectors are compared with
    ``cosine_similarity``.

    Args:
        df: DataFrame holding ``reference_col`` and ``candidate_col``.
        nlp: Callable that turns a text into an iterable of tokens exposing
            ``.vector`` (the notebook passes a spaCy pipeline).
        candidate_col: Column containing the answers to evaluate. Exposed as a
            parameter so other answer columns can be scored without
            redefining the function.
        reference_col: Column containing the reference answer.

    Returns:
        list with one similarity per row of ``df``, in row order. A row whose
        reference or candidate text yields no tokens gets ``nan``.
    """
    def _mean_vector(text):
        # None marks a text without tokens; np.mean([]) would otherwise give
        # NaN, which cosine_similarity rejects.
        vectors = [token.vector for token in nlp(text)]
        return np.mean(vectors, axis=0) if vectors else None

    embedding_avg_scores = []
    for reference_text, candidate_text in zip(df[reference_col], df[candidate_col]):
        reference_embedding = _mean_vector(reference_text)
        candidate_embedding = _mean_vector(candidate_text)
        if reference_embedding is None or candidate_embedding is None:
            embedding_avg_scores.append(float('nan'))
            continue
        embedding_avg_scores.append(
            cosine_similarity([reference_embedding], [candidate_embedding])[0][0]
        )
    return embedding_avg_scores
# Embedding-average similarity for the answer column handled by the function defined in this cell.
embedding_avg_scores = calculate_embedding_scores(df=data, nlp=nlp)

In [None]:
def calculate_bert_scores(df, candidate_col='结果-LLM', reference_col='标准答案'):
    """Compute the BERTScore recall of one answer column against the reference.

    Args:
        df: DataFrame holding ``reference_col`` and ``candidate_col``.
        candidate_col: Column containing the answers to evaluate. Exposed as a
            parameter so other answer columns can be scored without
            redefining the function.
        reference_col: Column containing the reference answer.

    Returns:
        ``{'AI答案-BERT': [...]}`` with one recall value per row of ``df``, in
        row order (an empty list for an empty frame).
    """
    bert_scores = {'AI答案-BERT': []}
    if len(df) == 0:
        # Nothing to score: skip constructing the scorer altogether.
        return bert_scores

    bertscorer = BERTScorer(lang="zh", rescale_with_baseline=True)
    # A single batched call instead of one model invocation per row;
    # score() returns (P, R, F1) and only the recall is kept.
    _, recall, _ = bertscorer.score(df[candidate_col].tolist(), df[reference_col].tolist())
    bert_scores['AI答案-BERT'] = recall.tolist()
    return bert_scores
# BERTScore recall for the answer column handled by the function defined in this cell.
bert_scores = calculate_bert_scores(df=data)

In [None]:
def calculate_embedding_scores(df, nlp, candidate_col='结果-LLM', reference_col='标准答案'):
    """Embedding-average cosine similarity between one answer column and the reference.

    Each text is passed through ``nlp``, the ``.vector`` of every resulting
    token is averaged, and the two mean vectors are compared with
    ``cosine_similarity``.

    Args:
        df: DataFrame holding ``reference_col`` and ``candidate_col``.
        nlp: Callable that turns a text into an iterable of tokens exposing
            ``.vector`` (the notebook passes a spaCy pipeline).
        candidate_col: Column containing the answers to evaluate. Exposed as a
            parameter so other answer columns can be scored without
            redefining the function.
        reference_col: Column containing the reference answer.

    Returns:
        list with one similarity per row of ``df``, in row order. A row whose
        reference or candidate text yields no tokens gets ``nan``.
    """
    def _mean_vector(text):
        # None marks a text without tokens; np.mean([]) would otherwise give
        # NaN, which cosine_similarity rejects.
        vectors = [token.vector for token in nlp(text)]
        return np.mean(vectors, axis=0) if vectors else None

    embedding_avg_scores = []
    for reference_text, candidate_text in zip(df[reference_col], df[candidate_col]):
        reference_embedding = _mean_vector(reference_text)
        candidate_embedding = _mean_vector(candidate_text)
        if reference_embedding is None or candidate_embedding is None:
            embedding_avg_scores.append(float('nan'))
            continue
        embedding_avg_scores.append(
            cosine_similarity([reference_embedding], [candidate_embedding])[0][0]
        )
    return embedding_avg_scores
# Embedding-average similarity for the answer column handled by the function defined in this cell.
embedding_avg_scores = calculate_embedding_scores(df=data, nlp=nlp)

In [None]:
# Ask qwen3-32b (through DashScope's OpenAI-compatible endpoint) every question
# in data['问题'] and keep one raw result string per question in `results`.

# System prompt sent with every request; kept byte-for-byte as a runtime string.
SYSTEM_PROMPT = "作为消化科医生，请用精简的中文回答，不要透露AI身份。直接给出结论，不要重复问题。请严格地以连续的自然段形式作答，不要分条列举。输出生成的内容前，请仔细检查文本内容，禁止使用###、##、#、***、**、*、 这样的格式标记内容，禁止重点标记、标题、空格。禁止回复答案来源，例如‘根据xxx研究’、‘根据搜索材料’、‘根据提供的信息’、‘基于提供的资料’之类的话术"

client = OpenAI(
    # Read the credential from the environment instead of storing it in the
    # notebook; raises KeyError immediately when the variable is not set.
    api_key=os.environ["DASHSCOPE_API_KEY"],
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

results = []
for question in data['问题']:
    try:
        completion = client.chat.completions.create(
            model="qwen3-32b",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": question},
            ],
            extra_body={"enable_thinking": False},
        )
        results.append(completion.model_dump_json())
    except Exception as e:
        # Best-effort batch: one failed request must not abort the run. The
        # error text is stored in place of the JSON so `results` stays aligned
        # with the rows of `data`.
        results.append(f"Error processing question {question}: {e}")

In [None]:
# Pull the assistant's text out of every raw result string, keeping `replies`
# aligned one-to-one with `results`.
replies = []
for result in results:
    try:
        # Named `payload`, not `data`: rebinding `data` here would replace the
        # DataFrame that the other cells read from.
        payload = json.loads(result)
        reply = payload['choices'][0]['message']['content']
    except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
        # Entries that are not valid JSON, or JSON without the expected
        # choices[0].message.content path, get a placeholder instead.
        print(f"Error processing result: {e}")
        replies.append("Error extracting content")
    else:
        replies.append(reply)