使用 transformers 库进行文本分类：只指定任务名称（不指定模型），由 pipeline 自动加载该任务的默认模型。

In [None]:
from transformers import pipeline

# Only the task is named here, so the pipeline falls back to its default
# model for that task.
classifier = pipeline(task="sentiment-analysis")

# Last expression of the cell: the notebook displays the classification
# result for this single input string.
classifier("你好啊")

In [None]:
# Classify several sentences with one pipeline call; `results` holds one
# entry per input sentence.
sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
results = classifier(sentences)

# Report the label and the score (rounded to 4 decimals) of each entry.
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

使用支持多语言的模型进行分类：指定模型名称，分别加载模型和 tokenizer，再用它们构建 pipeline。

In [None]:
# `pipeline` is imported here as well so this cell runs on its own (for
# example after a kernel restart) instead of relying on an earlier cell's
# import.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Checkpoint shared by the model and the tokenizer loaded below.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the sentiment pipeline from the explicitly loaded model and tokenizer
# rather than letting the pipeline choose its default model.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Test sentences in several languages, stored as (language label, sentence) pairs.
texts = [
    ("英语", "I am very happy to show you the 🤗 Transformers library."),
    ("中文", "我很高兴能使用Transformers库来处理自然语言。"),
    ("法语", "Je suis très heureux de vous présenter la bibliothèque Transformers."),
    ("日语", "Transformersライブラリを使用できて本当に嬉しいです。"),
    ("德语", "Ich bin sehr glücklich, Ihnen die Transformers-Bibliothek zu zeigen."),
    ("西班牙语", "Estoy muy feliz de mostrarles la biblioteca Transformers."),
    ("阿拉伯语", "أنا سعيد جدا لأظهر لكم مكتبة المحولات."),
]

# Keep only the sentence (second element) of every pair, in the same order.
text_list = [pair[1] for pair in texts]

# Encode the whole list of sentences in a single tokenizer call, with padding
# and truncation enabled, a maximum length of 512, and "pt" tensors requested.
encode_options = {
    "padding": True,
    "truncation": True,
    "max_length": 512,
    "return_tensors": "pt",
}
pt_batch = tokenizer(text_list, **encode_options)
# Classify every sentence on its own and print it together with its row of
# token ids from the batch encoding.
for i, (language, text) in enumerate(texts):
    # Indexing with [0] takes the first entry of what the pipeline returns
    # for this single input.
    result = classifier(text)[0]
    encoded_row = pt_batch['input_ids'][i].tolist()
    report_lines = (
        f"语言: {language}",
        f"文本: {text}",
        f"编码: {encoded_row}",
        f"情感分析结果: {result['label']}",
        f"置信度: {result['score']:.4f}\n",
    )
    # One print with a newline separator emits the same five lines as five
    # separate prints; the trailing "\n" keeps the blank line between reports.
    print(*report_lines, sep="\n")
    

直接使用 AutoModel（AutoModelForSequenceClassification）：把 tokenizer 编码得到的批次输入模型，再对输出的 logits 做 softmax，得到各类别的概率。


In [None]:
import torch
from torch import nn
from transformers import AutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# torch_dtype="auto" leaves the choice of weight dtype to from_pretrained
# instead of fixing one here (per the Transformers docs it is derived from
# the checkpoint -- confirm for the installed version).
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype="auto")

# This cell only runs inference, so disable autograd recording for the
# forward pass: no graph is kept alive and the outputs carry no grad_fn.
# `pt_batch` is the batch encoding produced by the tokenizer in an earlier cell.
with torch.no_grad():
    pt_outputs = pt_model(**pt_batch)

# Softmax over the last dimension turns each row of logits into values that
# sum to 1 across the classes.
pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)


SequenceClassifierOutput(loss=None, logits=tensor([[-2.5757, -2.7271, -0.9195,  1.9724,  3.2714],
        [-1.9059, -1.8607, -0.5966,  1.1100,  2.5102],
        [-2.3211, -2.4435, -0.5569,  1.6607,  2.8887],
        [-2.3028, -1.7910,  0.1935,  1.5138,  1.7481],
        [-2.2007, -2.4555, -0.9112,  1.6622,  3.0401],
        [-2.2117, -2.4682, -0.6222,  1.7106,  2.8585],
        [-0.7225, -0.3669,  0.4493,  0.4018,  0.1423]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
