# Pytorch Tutorial

このチュートリアルは以下の2つの内容を含みます.


1.   BERTを使った含意分類モデルのfine tuning
2.   BERT2BERTを用いた含意文生成モデルのfine tuning



In [7]:
!pip install transformers
!pip install datasets
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 7.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

# Imports

In [8]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EncoderDecoderModel,
    Seq2SeqTrainer,     
    Seq2SeqTrainingArguments
) 
import transformers
import torch
from tqdm import tqdm
from datasets import load_dataset
import random
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# Avoid load model warnings
import logging
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

# Setup

In [9]:
""" optional settings
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ['TRANSFORMERS_CACHE'] = "/home/is/kai-yo/work3/cache_transformers"
os.environ['HF_DATASETS_CACHE'] = "/home/is/kai-yo/work3/cache_hf_datasets"
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"
"""
CUDA_AVAILABLE = False
if torch.cuda.is_available():
    CUDA_AVAILABLE = True
    print("CUDA IS AVAILABLE")
else:
    print("CUDA NOT AVAILABLE")
#device = torch.device('cpu')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

CUDA IS AVAILABLE


# 分類モデルの学習

In [10]:
def tokenize(batch):
    return tokenizer(batch["premise"], batch["hypothesis"], padding="max_length", truncation=True, max_length=256)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
BATCH_SIZE = 8
MAX_LENGTH = 128
NUM_EPOCHS = 2

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
raw_datasets = load_dataset("multi_nli")
tokenized_datasets = raw_datasets.map(tokenize, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(
    ['promptID','pairID', 'premise','premise_binary_parse','premise_parse', 'hypothesis','hypothesis_binary_parse', 'hypothesis_parse','genre']
)
tokenized_datasets.set_format("torch")

train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["validation_matched"]
train_dataset, test_dataset

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset multi_nli/default (download: 216.34 MiB, generated: 410.92 MiB, post-processed: Unknown size, total: 627.27 MiB) to /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39...


Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Dataset multi_nli downloaded and prepared to /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/393 [00:00<?, ?ba/s]

# train dataの数を減らす処理を書く

In [28]:
model.train()
training_args = TrainingArguments(
    output_dir="./",          # 出力フォルダ
    num_train_epochs=NUM_EPOCHS,              # エポック数
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    do_train=True,
    warmup_steps=1000,
    weight_decay=0.01,
    #evaluate_during_training=True,
    logging_dir='./outputs/models/logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()
trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 4
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2


KeyError: ignored

# 生成モデルの学習

In [None]:
def logits_to_label(pred):
    pred = pred.logits.softmax(dim=-1).tolist()[0]
    label = 0 if pred[0]>pred[1] else 1
    return label

def concat(personas:list):
    concated = "[CLS]"
    for persona in personas:
        concated += " " + persona + " [SEP]"
    return concated

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [29]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
generation_model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a

In [38]:
generation_datasets = raw_datasets.filter(lambda x:x["label"]==1)
generation_datasets["train"][0]

  0%|          | 0/393 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

{'genre': 'government',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'hypothesis_binary_parse': '( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) )',
 'hypothesis_parse': '(ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .)))',
 'label': 1,
 'pairID': '31193n',
 'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'premise_binary_parse': '( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) )',
 'premise_parse': '(ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .)))',
 'promptID': 31193}

In [39]:
generation_datasets = generation_datasets.map(tokenize, batched=True)
generation_datasets = generation_datasets.remove_columns(
    ['promptID','pairID', 'premise','premise_binary_parse','premise_parse', 'hypothesis','hypothesis_binary_parse', 'hypothesis_parse','genre']
)
generation_datasets.set_format("torch")

train_dataset_for_generation = tokenized_datasets["train"]
test_dataset_for_generation = tokenized_datasets["validation_matched"]
train_dataset_for_generation[0], test_dataset_for_generation[0]

  0%|          | 0/131 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

(Dataset({
     features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 392702
 }), Dataset({
     features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 9815
 }))

In [40]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
model.config.vocab_size = model.config.decoder.vocab_size
model.config.max_length = 100
model.config.min_length = 20
model.config.no_repeat_ngram_size = 1
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 20


# In[9]:


rouge = datasets.load_metric("rouge")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

ImportError: ignored

In [45]:
# Train Param
batch_size = 8
model.train()
# https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./',
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=10,
    save_steps=1000,
    eval_steps=5000,
    warmup_steps=1000,
    overwrite_output_dir=True,
    save_total_limit=5,
    fp16=False,
    num_train_epochs=3,
    no_cuda=not CUDA_AVAILABLE
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset_for_generation,
    eval_dataset=test_dataset_for_generation
)
trainer.train()

"""
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python train_bert.py 
jupyter nbconvert --to script train_bert.ipynb
"""



PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 392702
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 147264
The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.


IndexError: ignored

In [None]:
generation_model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints

In [None]:
inputs = tokenizer.batch_encode_plus(
    df_input["r"].tolist(),
    return_tensors="pt", 
    add_special_tokens=True,
    truncation=True,
    padding="max_length",
    max_length=256
    )
    
labels = tokenizer.batch_encode_plus(
    df_input["concated"].tolist(),
    return_tensors="pt", 
    add_special_tokens=False,
    truncation=True,
    padding="max_length",
    max_length=256
    )
train_data = []
for i in range(len(inputs["input_ids"])):
    train_data.append(
        {
            "input_ids":inputs["input_ids"][i],
            "token_type_ids":inputs["token_type_ids"][i],
            "attention_mask":inputs["attention_mask"][i],
            "label":labels["input_ids"][i] 
        }
    )
random.shuffle(train_data)
train_size = int(len(train_data)*0.98)
eval_data = train_data[train_size:]
train_data = train_data[:train_size]


In [None]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
model.config.vocab_size = model.config.decoder.vocab_size
model.config.max_length = 100
model.config.min_length = 20
model.config.no_repeat_ngram_size = 1
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 20