In [1]:
!pip install modelscope
from modelscope import snapshot_download
model_dir = snapshot_download('Xenova/bart-large-cnn')

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0mDownloading Model to directory: /root/.cache/modelscope/hub/Xenova/bart-large-cnn


In [2]:
import torch
import torch.nn as nn
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration
from typing import List
import numpy as np


class BARTScorer:
    def __init__(self, device='cuda:0', max_length=1024, model_dir=".cache/modelscope/hub/Xenova/bart-large-cnn"):
        """ 
        device: 运行设备 (cuda:0 / cpu)
        max_length: 最大序列长度
        model_dir: 本地模型路径，默认使用 modelscope 下载的路径
        """
        self.device = device
        self.max_length = max_length

        # 如果没有提供 model_dir，则先下载模型
        # if model_dir is None:
        #     model_dir = snapshot_download('Xenova/bart-large-cnn')

        # 从本地加载 tokenizer 和 model
        self.tokenizer = BartTokenizer.from_pretrained(model_dir)
        self.model = BartForConditionalGeneration.from_pretrained(model_dir)

        self.model.eval()
        self.model.to(device)

        # 设置损失函数
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def score(self, srcs, tgts, batch_size=16):
        """ 计算文本相似性得分 """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                traceback.print_exc()
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                exit(0)
        return score_list

    def test(self, batch_size=1):
        """ 测试方法 """
        src_list = ["Within the past decade, humans have watched in awe as technology has improved exponentially, and learned to take on tasks once thought only humans could accomplish. Technology has been taught to communicate, entertain, solve problems, along with many other functions, and now it may be able to teach as well. Though we have made use of every technological advancement in the past, this may be one that we should not capitalize on. Before taking this enormous step in technological advancement, humans must consider the flaws within this newly developed system, whether we are ready to replace teachers with computers. Though this new system is an astonishing feat in technology, people must recognize the flaws within this system as well. As described in paragraph 6, a student's facial expression has the power to change a lesson. If this is true, could a student not simply fake an emotion to get out of assignments or slack off by acting confused. The article claims that the software can pick up on fake smiles, but can it detect false expressions for other emotions as well? Another flaw within this system is the emotions the technology is capable of perceiving, paragraph 6 discusses the modification of a lesson based on boredom or confusion detected by the computer, however, neither of these two feelings are listed in the 6 emotions that the software is capable of detecting (paragraph 3). These two major flaws within the software lead to only one conclusion, the technology is simply not advanced enough to take on the role of a teacher. Humans must also consider this issue from an ethical standpoint, are we ready to take away the role of teachers and replace them with software? Though machines have already taken away menial jobs, such as factory work, teaching is a career which requires a four year college degree and a license. Countless people aspire to become teachers, are we ready to take that away? Even if teachers were to work hand in hand with this technology there would still be"]

        tgt_list = ['Discuss the potential flaws and ethical dilemmas associated with the integration of emotion detecting software in the classroom, and argue whether or not humans are ready to replace teachers with computers.']
        
        score=self.score(src_list, tgt_list, batch_size)
        print(score)
        hum['prompt score']=score


# 运行测试代码
scorer = BARTScorer(device="cuda:0")  # 如果没有 GPU，可改为 "cpu"
scorer.test(batch_size=3)


[-2.3933444023132324]


NameError: name 'hum' is not defined