# 预处理数据 (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [3]:
!pip install datasets evaluate transformers[sentencepiece]



In [5]:
import torch
# transformers.AdamW is deprecated and no longer shipped in recent transformers
# releases; the PyTorch implementation is the supported replacement.
# NOTE: its defaults (e.g. weight decay, eps) differ slightly from the old class.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before: load the tokenizer and the classification model from one checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new: add one label per sequence so the model output carries a loss.
batch["labels"] = torch.tensor([1, 1])

# A single optimisation step: forward pass, backward pass, parameter update.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

# Inspect the batch that was fed to the model: full content, then per-key shapes.
print(f"batch is: \n{batch}")
for k in batch.keys():
  print(f"{k}:{batch[k].shape}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


batch is: 
{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2023,  2607,  2003,  6429,   999,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 1])}
input_ids:torch.Size([2, 16])
token_type_ids:torch.Size([2, 16])
attention_mask:torch.Size([2, 16])
labels:torch.Size([2])


In [6]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset; the result holds the
# train / validation / test splits shown in the cell output.
raw_datasets = load_dataset(path="glue", name="mrpc")
# Bare last expression so the notebook displays the DatasetDict summary.
raw_datasets

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [7]:
# Select the training split and display its first row (a dict with the
# sentence pair, the label and the row index).
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [8]:
# Show the column types; `label` is a ClassLabel whose `names` list gives the
# meaning of each integer label.
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Look each text column up once and reuse it, instead of re-evaluating
# raw_datasets["train"][...] for every use below.
sentences_1 = raw_datasets["train"]["sentence1"]
sentences_2 = raw_datasets["train"]["sentence2"]

# Tokenize the two columns independently (no padding, no truncation).
tokenized_sentences_1 = tokenizer(sentences_1)
tokenized_sentences_2 = tokenizer(sentences_2)

print(
  "sentence1: ", type(sentences_1), len(sentences_1[:2]), "\n",
  "sentence2: ", type(sentences_2), len(sentences_2[:2]),
)


def get_lens(seqs):
  """Return a list with the length of every item in ``seqs``."""
  return [len(x) for x in seqs]


print(
  "tokenized_sentences_1: ", get_lens(tokenized_sentences_1["input_ids"][:2]), "\n",
  "tokenized_sentences_2: ", get_lens(tokenized_sentences_2["input_ids"][:2]),
)

# raw_datasets["train"]["sentence1"] holds all of the `sentence1` strings.
# Tokenized on their own, the sentences produce input_ids of different lengths
# (see the lengths printed above), so the output of
# tokenizer(raw_datasets["train"]["sentence1"]) is not padded here and cannot
# be converted to a tensor as-is.


sentence1:  <class 'list'> 2 
 sentence2:  <class 'list'> 2
tokenized_sentences_1:  [25, 27] 
 tokenized_sentences_2:  [26, 33]


In [14]:
# Passing two sentences makes the tokenizer encode them together as one pair.
inputs = tokenizer(
    text="This is the first sentence.",
    text_pair="This is the second one.",
)
# NOTE: token_type_ids is not something every model's inputs contain;
# it is specific to BERT in this example.
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
# Map the ids back to token strings to see where the special tokens
# ([CLS], [SEP]) were placed around the two sentences.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [18]:
# Tokenize every (sentence1, sentence2) pair of the training split in one call.
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    truncation=True,
    padding=True,
)
# Notes:
# 1. Padding the whole dataset in one go aligns every element of the list to
#    the maximum length found in the dataset.
# 2. tokenized_dataset is a dict-like object here (see the printed type), not a
#    Dataset, which is not ideal.
first_key = next(iter(tokenized_dataset.keys()))
print(type(tokenized_dataset))
print(len(tokenized_dataset))
print(len(tokenized_dataset[first_key]))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
3
3668


In [36]:
# How can the same tokenization be done through dataset.map instead?
# raw_datasets is a DatasetDict.
import random  # only referenced by the commented-out experiment inside the function


def tokenized_example(example):
  """Tokenize the sentence pair(s) held in ``example``.

  ``example`` represents either one row or a batch of rows:
  for a single row, ``example['sentence1']`` is one sentence;
  for a batch, ``example['sentence1']`` is a list of sentences.

  The returned mapping supplies the new keys to add to each sample.
  Padding is deliberately NOT applied here: the dataloader shuffles the
  samples first and only then groups them into batches, so padding has to
  wait until the collate stage.
  """
  # The lines below must not be enabled inside a map function, otherwise map
  # raises an error, because the added / removed columns are then no longer
  # consistent across calls.
  # num = random.choice([0, 1])
  # if num < 1:
  #   return {}

  # Passing the two sentence columns positionally is the tokenizer's
  # predefined way of encoding sentence pairs.
  return tokenizer(
    example["sentence1"],
    example["sentence2"],
    truncation=True,
  )


# Apply the tokenization to every split; with batched=True the function
# receives a batch of rows per call.
tokenized_dataset = raw_datasets.map(
  tokenized_example, batched=True
)
print(tokenized_dataset)

# Drop the raw text columns. remove_columns does this directly, without
# running a no-op map over every row of every split.
tokenized_dataset = tokenized_dataset.remove_columns(["sentence1", "sentence2"])
print(tokenized_dataset)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [26]:
def tokenize_function(example):
    """Tokenize the ``sentence1`` / ``sentence2`` pair(s) of ``example``.

    Truncation is enabled; no padding is applied here.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

In [27]:
# Run tokenize_function over every split, one batch of rows per call; the new
# columns are added next to the original ones (see the output).
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)
tokenized_datasets

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [29]:
# https://huggingface.co/docs/transformers/v4.49.0/zh/main_classes/data_collator
# The page above describes the data collators that are available.
from transformers import DataCollatorWithPadding

# The tokenizer is passed in because the collator needs tokenizer settings
# such as the padding token.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [31]:
# Take the first 8 rows of the tokenized training split.
samples = tokenized_datasets["train"][:8]
# Keep only the columns that will be handed to the collator: drop the row
# index and the raw text columns.
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
print(type(samples))
# Keep the lengths as the LAST expression of the cell: a bare expression
# anywhere else is evaluated and silently discarded, so the per-sample
# lengths were never displayed before.
[len(x) for x in samples["input_ids"]]

<class 'dict'>


In [34]:
# Let the collator pad the samples and assemble them into one batch.
batch = data_collator(samples)
# batch is still dict-like, i.e. data_collator is able to pad dict-typed input.
# Input:
#   samples: Dict[str, List[...]] (one list per column).
# Returns:
#   batch: Dict[str, torch.Tensor]
print(type(batch))
{name: tensor.shape for name, tensor in batch.items()}

<class 'transformers.tokenization_utils_base.BatchEncoding'>


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}