### 1. 引入依赖

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import pandas as pd
from tqdm import tqdm

### 2. 加载模型

In [2]:
# Path the tokenizer and the model weights are both loaded from.
model_name = "/root/autodl-tmp/outputs/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# dtype selection and device placement are both left to the library ("auto").
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

### 3. 情感分析

In [3]:
def get_sentiment(text):
    """
    Ask the chat model to classify the sentiment of a hotel review.

    The system prompt instructs the model to answer with a JSON object whose
    "sentiment" key is either "正面" (positive) or "负面" (negative).

    Args:
        text: The review text, sent as the user message.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are stripped), with special tokens removed. The string is
        returned as-is; it is not parsed or validated here.
    """
    sys_prompt = """
    你是一个情感识别专家！请对用户输入的酒店评论做情感分析！
    如果是正面的评论，请输出：
    {
        "sentiment": "正面"
    }，
    如果是负面的评论，请输出：
    {
        "sentiment": "负面"
    }
    """
    conversation = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": text},
    ]

    # Render the conversation to a prompt string that ends with the
    # generation prompt, then tokenize it as a batch of one.
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    output_ids = model.generate(**model_inputs, max_new_tokens=512)

    # The generated sequence starts with the prompt tokens; keep only what
    # follows them.
    prompt_length = model_inputs.input_ids.shape[1]
    completion_ids = output_ids[0][prompt_length:]

    return tokenizer.decode(completion_ids, skip_special_tokens=True)

In [4]:
# Tab-separated, UTF-8 encoded evaluation file; later cells read its
# "评论" and "情感" columns.
data = pd.read_csv("test.tsv", sep="\t", encoding="utf8")

In [5]:
# Row and column counts of the loaded table.
num_samples = data.shape[0]
num_features = data.shape[1]

In [6]:
# Run the model over every sample and record whether its prediction matches
# the label. Responses that cannot be scored are counted in `num_errors` and
# skipped, so they do not contribute to `results`.
results = []
num_errors = 0
for idx in tqdm(range(num_samples)):
    # `idx` is a position in 0..num_samples-1, so index positionally.
    sample = data.iloc[idx]
    comment = sample["评论"]
    sentiment = sample["情感"]
    result = get_sentiment(text=comment)
    try:
        result = json.loads(s=result)
        # Keep the key lookup inside the guard: valid JSON that is not an
        # object with a "sentiment" key (a list, a string, null, or an
        # object missing the key) is a format error too, and must not abort
        # the whole run.
        predicted = result["sentiment"]
    except (ValueError, KeyError, TypeError):
        # json.JSONDecodeError is a subclass of ValueError.
        print(f"json 解析错误：{comment}")
        num_errors += 1
        continue
    results.append(predicted == sentiment)

100%|██████████| 1500/1500 [03:32<00:00,  7.06it/s]


In [7]:
# from tqdm import tqdm
acc = sum(results) / len(results)
acc

0.9173333333333333

In [8]:
# 格式错误
# Number of samples skipped because the model response had a format error
num_errors

0