In [1]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

## tokenizer
- tokenizer 工作原理
    - tokenizer = AutoTokenizer.from_pretrained(model_name)
    - tokenizer.vocab: 字典，储存了token和id的映射关系
        - tokenizer.special_tokens_map
    - tokenizer.encode()
    - tokenizer.decode()
    - tokenizer.tokenize() + tokenizer.convert_tokens_to_ids()（反向：tokenizer.convert_ids_to_tokens()）


In [None]:
# Basic tokenizer usage.
# The input can be a single sentence, or a batch of sentences passed in as a list.
Sentences = ["today is not a bad day.", "today is so bad."]

# Checkpoint location handed to from_pretrained below.
model_name = "/root/autodl-fs/distilbert/distilbert-base-uncased"

# Instantiate the tokenizer and the sequence-classification model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenize the batch and convert the tokens to input_ids, with truncation
# and padding enabled and PyTorch tensors ("pt") as the return type.
batch_inputs = tokenizer(
    Sentences,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [4]:
# Checkpoint location that is passed to from_pretrained when loading the tokenizer and model.
model_name = "/root/autodl-fs/distilbert/distilbert-base-uncased"

In [5]:
# Load the tokenizer and a sequence-classification model from model_name.
# The warning printed by this cell reports that the classifier / pre_classifier
# weights were not found in the checkpoint and are newly initialized.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /root/autodl-fs/distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# Encode every sentence in Sentences in one call, with truncation and padding
# enabled, returning PyTorch tensors.
batch_inputs = tokenizer(Sentences,truncation=True,padding=True,return_tensors="pt")

In [17]:
# Decode a single token id back to text; for this tokenizer id 102 is the '[SEP]' special token.
tokenizer.decode(102)

'[SEP]'

In [10]:
# Display the encoded batch: input_ids plus attention_mask. The shorter sentence
# is padded with id 0, and those padded positions have attention_mask 0.
batch_inputs

{'input_ids': tensor([[ 101, 2651, 2003, 2025, 1037, 2919, 2154, 1012,  102],
        [ 101, 2651, 2003, 2061, 2919, 1012,  102,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0]])}

In [45]:
# Compare Sentences with batch_inputs.
# The sentences listed here must be the ones the ids below were produced from:
# ids 2025, 1037, 2919, 2154 are the tokens "not", "a", "bad", "day", so the
# first sentence is "today is not a bad day." Keeping the text in sync also
# avoids rebinding Sentences to something batch_inputs was not built from.
Sentences = [
    "today is not a bad day.",
    "today is so bad.",
]

# input_ids for the two sentences above: 101 = [CLS], 102 = [SEP], 0 = padding.
[
    [ 101, 2651, 2003, 2025, 1037, 2919, 2154, 1012,  102],
    [ 101, 2651, 2003, 2061, 2919, 1012,  102,    0,    0]
]


[[101, 2651, 2003, 2025, 1037, 2919, 2154, 1012, 102],
 [101, 2651, 2003, 2061, 2919, 1012, 102, 0, 0]]

In [18]:
# Forward pass: the encoded batch (input_ids, attention_mask) is unpacked into
# keyword arguments. This call is not wrapped in torch.no_grad(), so the
# returned logits carry a grad_fn.
model(**batch_inputs)


SequenceClassifierOutput(loss=None, logits=tensor([[-0.0185,  0.1882],
        [-0.0411,  0.1832]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [21]:
# Two-step encoding: split the sentence into tokens, then map each token to its id.
# Unlike calling tokenizer(...) directly, the result has no [CLS] (101) / [SEP] (102) ids.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("today is not a bad day."))

[2651, 2003, 2025, 1037, 2919, 2154, 1012]

In [19]:
# Map a full id sequence (including the special ids 101 and 102) back to token strings.
tokenizer.convert_ids_to_tokens([ 101, 2651, 2003, 2025, 1037, 2919, 2154, 1012,  102])

['[CLS]', 'today', 'is', 'not', 'a', 'bad', 'day', '.', '[SEP]']

In [24]:
# Index the vocab mapping directly by token text to obtain the token id.
tokenizer.vocab["today"]

2651

In [27]:
# Look up a value (token id) by its key (token text).
vocab = tokenizer.vocab
vocab.get("today")

2651

In [32]:
# Look up a key (token text) by its value (token id): find the position of 2651
# in the list of values, then take the key at that same position.
list(vocab.keys())[list(vocab.values()).index(2651)]

'today'

## 通过值来找键（1）
- 把键和值分别转换为列表，先用 index 找到值的位置，再用同一位置取键：`list(d.keys())[list(d.values()).index(value)]`（值不存在时会抛出 ValueError）
## 通过值来找键（2）
- 遍历键值对，取出值相等的键：`[k for k, v in d.items() if v == value]`



In [29]:
# Small example dict used to demonstrate looking up a key from a value.
student = {
    '小萌': '1001',
    '小智': '1002',
    '小强': '1003',
    '小明': '1004',
}
# Materialize the keys view as a list.
list(student.keys())

['小萌', '小智', '小强', '小明']

In [30]:
# Position of the value '1004' within the dict's values; the outer brackets
# simply wrap that index in a one-element list.
[list (student.values()).index ('1004')]

[3]

In [33]:
def get_key_from_value(dic, value, default=None):
    """Return the first key in ``dic`` whose value equals ``value``.

    Args:
        dic: Mapping to search; its items are scanned in iteration order.
        value: Value to look for, compared with ``==``.
        default: Returned when no item matches. Defaults to ``None``, which
            is also what a two-argument call returns on a miss.

    Returns:
        The first matching key, or ``default`` when there is no match.
    """
    # next() stops at the first hit and falls back to `default` on a miss.
    return next((k for k, v in dic.items() if v == value), default)

In [34]:
# Reverse lookup in the vocab mapping: which token has id 2651?
get_key_from_value(vocab,2651)

'today'

## Model 模型调用

In [35]:
import torch
import torch.nn.functional as F

In [49]:
# Class names; a name's position in this list is its class index.
process_types = ["Positive", "Negative"]
# index -> name
id2label = dict(enumerate(process_types))
# name -> index (the inverse of id2label)
label2id = {name: idx for idx, name in id2label.items()}

In [50]:
# Reload the checkpoint with a classification head sized to the label set and
# with readable label names stored in the config.
# num_labels is derived from id2label rather than hard-coded, so the head size
# and the label mapping cannot disagree if a class is added or removed.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /root/autodl-fs/distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# Inspect the model configuration; the id2label / label2id mappings passed at load time show up here.
model.config

DistilBertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/root/autodl-fs/distilbert/distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "Positive",
    "1": "Negative"
  },
  "initializer_range": 0.02,
  "label2id": {
    "Negative": 1,
    "Positive": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "vocab_size": 30522
}

In [52]:
# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**batch_inputs)
    print(outputs)
    # Normalize the logits along the last dimension.
    scores = outputs.logits.softmax(dim=-1)
    print(scores)
    # Index of the highest score in each row.
    labels = scores.argmax(dim=-1)
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0528, -0.0186],
        [ 0.0729, -0.0407]]), hidden_states=None, attentions=None)
tensor([[0.5178, 0.4822],
        [0.5284, 0.4716]])
tensor([0, 0])
