1. 分词器（Tokenizer）的使用

In [1]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the pretrained `bert-base-chinese` checkpoint.
# `cache_dir` and `force_download` are spelled out explicitly so they are easy to tweak.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-chinese',
    force_download=False,
    cache_dir=None,
)

# Sample sentences that the following cells encode (indexed as sents[0], sents[1], ...).
sents = [
    '选择珠江花园的原因就是方便。',
    '笔记本的键盘确实爽。',
    '房间太小。其他的都一般。',
    '今天才知道这书还有第6卷,真有点郁闷.',
    '机器背面似乎被撕了张什么标签，残胶还在。',
]

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Encode a sentence pair into one fixed-length sequence of token ids.
input_ids = tokenizer.encode(
    text=sents[0],            # first sentence
    text_pair=sents[1],       # second sentence
    add_special_tokens=True,
    max_length=30,            # target sequence length
    padding='max_length',     # always pad up to max_length
    truncation=True,          # cut the input when it is longer than max_length
    return_tensors=None,      # return a plain Python list
)

In [9]:
# Show the raw token ids, then decode them back to text so the special tokens
# ([CLS], [SEP], [PAD] in the recorded output) become visible.
print('编码之后的句子:\n',input_ids)
print('被解码:\n',tokenizer.decode(input_ids))

编码之后的句子:
 [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
被解码:
 [CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]


In [None]:
# Extended encoding.
# encode_plus converts text into the inputs a model consumes; besides the token
# ids it also returns the auxiliary sequences requested through the flags below.
# NOTE(review): the Transformers docs appear to recommend calling the tokenizer
# directly (`tokenizer(...)`) instead of `encode_plus` — confirm for the installed version.
out = tokenizer.encode_plus(
    text=sents[0],                    # first sentence
    text_pair=sents[1],               # second sentence
    add_special_tokens=True,
    max_length=30,                    # target sequence length
    padding='max_length',             # always pad up to max_length
    truncation=True,                  # cut the input when it is longer than max_length
    return_tensors=None,              # tf / pt / np are also accepted; the default returns lists
    return_token_type_ids=True,       # include token_type_ids
    return_attention_mask=True,       # include attention_mask
    return_special_tokens_mask=True,  # include special_tokens_mask (marks special tokens)
    return_length=True,               # include length
    # offset_mapping gives the start/end position of every token; per the original
    # author this flag can only be used with BertTokenizerFast, so it stays disabled:
    # return_offsets_mapping=True,
)

In [None]:
# Print every field of the encoding as "name : value", one field per line.
for k, v in out.items():
    print(k, ':', v)

# input_ids: ids of the encoded tokens
# token_type_ids: 0 for [CLS], the first sentence and its [SEP]; 1 for the second
#   sentence and its closing [SEP]; padding positions are 0 in the recorded output
# special_tokens_mask: 1 at special-token positions ([CLS], [SEP], [PAD]), 0 elsewhere
# attention_mask: 0 at [PAD] positions, 1 everywhere else
# length: length of the encoded sequence (30 in the recorded output, i.e. the padded length)

input_ids : [101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 511, 102, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]
token_type_ids : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
special_tokens_mask : [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
attention_mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
length : 30


In [12]:
# Decode the ids stored in `out` back into text; as the last expression of the
# cell, the resulting string is displayed as the cell output.
tokenizer.decode(out['input_ids'])

'[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 。 [SEP] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]'

In [15]:
# Batch extended encoding.
# tokenizer.batch_encode_plus() encodes several inputs in a single call.
# NOTE(review): the Transformers docs appear to recommend calling the tokenizer
# directly (`tokenizer(...)`) instead of `batch_encode_plus` — confirm for the installed version.
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[0], sents[1]],  # a batch of two separate sentences
    add_special_tokens=True,          # insert the special tokens ([CLS] / [SEP] in the recorded output)
    max_length=15,                    # target sequence length
    padding='max_length',             # always pad up to max_length
    truncation=True,                  # cut a sentence when it is longer than max_length
    return_tensors=None,              # tf / pt / np are also accepted; the default returns lists
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,  # 1 at special-token positions, 0 elsewhere
    return_length=True,               # [15, 12] in the recorded output: token counts before padding
)

# Print every field of the batch encoding as "name : value".
for field, values in out.items():
    print(field, ':', values)

# Decode both encoded sentences; in the recorded output the first one lost its
# final punctuation mark to truncation and the second one was padded.
for encoded_ids in out['input_ids']:
    print(tokenizer.decode(encoded_ids))

input_ids : [[101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221, 3175, 912, 102], [101, 5011, 6381, 3315, 4638, 7241, 4669, 4802, 2141, 4272, 511, 102, 0, 0, 0]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
special_tokens_mask : [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]]
length : [15, 12]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
[CLS] 选 择 珠 江 花 园 的 原 因 就 是 方 便 [SEP]
[CLS] 笔 记 本 的 键 盘 确 实 爽 。 [SEP] [PAD] [PAD] [PAD]


In [19]:
# Fetch the vocabulary as a dict and check whether 'yueguan' is already a known token.
# NOTE(review): the recorded output shows True, which suggests this cell had already
# been executed once in that kernel — on a fresh kernel presumably False; confirm.
zidian = tokenizer.get_vocab()
print(type(zidian), 'yueguan' in zidian)

# Extend the tokenizer in place: first one ordinary token, then one special token.
# The order matters for the ids that get assigned (21128 then 21129 in the recorded output).
tokenizer.add_tokens(['yueguan'])
tokenizer.add_special_tokens({'eos_token': '[EOS]'})

# Re-read the vocabulary so the new entries can be looked up.
zidian = tokenizer.get_vocab()
print(len(zidian), zidian['yueguan'], zidian['[EOS]'])

<class 'dict'> True
21130 21128 21129


In [20]:
# Encode a string made of the two newly added tokens; in the recorded output each
# of them maps to a single id (21128 and 21129).
# Note that `out` is rebound here from the earlier batch encoding to a plain list of ids.
out = tokenizer.encode(
    text='yueguan[EOS]',
    text_pair=None,
    max_length=8,             # target sequence length
    padding='max_length',     # always pad up to max_length
    truncation=True,          # cut the input when it is longer than max_length
    return_tensors=None,      # return a plain Python list
)
print(out)
print(tokenizer.decode(out))

[101, 21128, 21129, 102, 0, 0, 0, 0]
[CLS] yueguan [EOS] [SEP] [PAD] [PAD] [PAD] [PAD]


In [None]:
# Download every file of the model repository into the local "bert-base-uncased" directory.
# The code is kept commented out. The cells further down load the model from
# "./bert-base-uncased", so
# NOTE(review): presumably this has to be uncommented and run once on a fresh checkout — confirm.

# from huggingface_hub import snapshot_download

# local_dir = snapshot_download(
#     repo_id="google-bert/bert-base-uncased",
#     local_dir="bert-base-uncased",
#     force_download=True
#     )

# print(f"Model directory downloaded to: {local_dir}")

Fetching 16 files: 100%|██████████| 16/16 [00:51<00:00,  3.23s/it]

Model directory downloaded to: /home/jovyan/my_code/LLM_Fine_Tuning_Molecular_Properties/bert-base-uncased





2. 加载并使用 Transformers 预训练模型

In [None]:
# Load the pretrained model and its tokenizer from the local ./bert-base-uncased directory.
import torch
from transformers import BertModel, BertTokenizer

model = BertModel.from_pretrained("./bert-base-uncased")
# This rebinds `tokenizer`, replacing the tokenizer used in section 1.
tokenizer = BertTokenizer.from_pretrained("./bert-base-uncased")

# return_tensors="pt" asks the tokenizer for PyTorch tensors so the result can be
# unpacked straight into the model call.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# This is a pure inference pass: disable gradient tracking so no autograd graph
# is built and kept alive for the outputs.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Use a pretrained model for a downstream NLP task — here: text classification.
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("./bert-base-uncased")

# num_labels=2 requests a two-class classification head.
# NOTE(review): the head is presumably not part of the base checkpoint and would be
# freshly initialised, so predictions are not meaningful before fine-tuning — confirm.
model = BertForSequenceClassification.from_pretrained(
    "./bert-base-uncased",
    num_labels=2,
)

2024-10-31 08:31:36.338402: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-31 08:31:36.343167: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-31 08:31:36.351293: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730363496.365227   93714 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730363496.369315   93714 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been regist

In [5]:
# Classify one sentence with the sequence-classification model and print the
# index of the highest-scoring class.
import torch

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Inference only: skip gradient tracking so no autograd graph is built.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Take the argmax along the class dimension. Without `dim`, torch.argmax works on
# the flattened tensor, which is only a class index when the batch holds one row.
# `.item()` then converts the single-element result to a Python int (it raises if
# more than one sentence was passed, instead of silently returning a flat index).
prediction = torch.argmax(logits, dim=-1).item()
print(prediction)

0
