<a href="https://colab.research.google.com/github/Exe-dev/M1-Pytorch-Tutorial/blob/main/Pytorch_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pytorch Tutorial

このチュートリアルは以下の2つの内容を含みます.


1.   BERTを使った含意分類モデルのfine tuning
2.   BERT2BERTを用いた含意文生成モデルのfine tuning



In [50]:
!pip install transformers
!pip install datasets
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Imports

In [51]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EncoderDecoderModel,
    Seq2SeqTrainer,     
    Seq2SeqTrainingArguments
) 
import transformers
import torch
from tqdm import tqdm
from datasets import load_dataset
import random
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import pandas as pd
# Avoid load model warnings
import logging
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

# Setup

In [52]:
""" optional settings
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ['TRANSFORMERS_CACHE'] = "/home/is/kai-yo/work3/cache_transformers"
os.environ['HF_DATASETS_CACHE'] = "/home/is/kai-yo/work3/cache_hf_datasets"
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"
"""
CUDA_AVAILABLE = False
if torch.cuda.is_available():
    CUDA_AVAILABLE = True
    print("CUDA IS AVAILABLE")
else:
    print("CUDA NOT AVAILABLE")
#device = torch.device('cpu')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

CUDA IS AVAILABLE


# 分類モデルの学習

In [53]:
def tokenize(batch):
    return tokenizer(batch["premise"], batch["hypothesis"], padding="max_length", truncation=True, max_length=256)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [54]:
BATCH_SIZE = 8
MAX_LENGTH = 128
NUM_EPOCHS = 2

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a

In [57]:
raw_datasets = load_dataset("multi_nli")
tokenized_datasets = raw_datasets.map(tokenize, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(
    ['promptID','pairID', 'premise','premise_binary_parse','premise_parse', 'hypothesis','hypothesis_binary_parse', 'hypothesis_parse','genre']
)


train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["validation_matched"]
train_dataset, test_dataset

Using custom data configuration default
Reusing dataset multi_nli (/root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/393 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-38cc1a2c2aa9d1d7.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-112c5676e505bb61.arrow


(Dataset({
     features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 392702
 }), Dataset({
     features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 9815
 }))

# train dataの数を減らす処理を書く

In [58]:
model.train()
training_args = TrainingArguments(
    output_dir="./",          # 出力フォルダ
    num_train_epochs=NUM_EPOCHS,              # エポック数
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    do_train=True,
    warmup_steps=1000,
    weight_decay=0.01,
    #evaluate_during_training=True,
    logging_dir='./outputs/models/logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()
trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 392702
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 98176


Step,Training Loss
500,1.0216
1000,0.8037
1500,0.7757
2000,0.7156
2500,0.7295
3000,0.7176
3500,0.6955
4000,0.6837
4500,0.6777
5000,0.6656


Saving model checkpoint to ./checkpoint-500
Saving model checkpoint to ./checkpoint-1000
Saving model checkpoint to ./checkpoint-1500
Saving model checkpoint to ./checkpoint-2000
Saving model checkpoint to ./checkpoint-2500
Saving model checkpoint to ./checkpoint-3000
Saving model checkpoint to ./checkpoint-3500
Saving model checkpoint to ./checkpoint-4000
Saving model checkpoint to ./checkpoint-4500
Saving model checkpoint to ./checkpoint-5000
Saving model checkpoint to ./checkpoint-5500


KeyboardInterrupt: ignored

# 生成モデルの学習

In [4]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [5]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
generation_model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints

In [9]:
generation_datasets = raw_datasets.filter(lambda x:x["label"]==1)
generation_datasets = generation_datasets.remove_columns(
    ["promptID","pairID","premise_binary_parse","premise_parse","hypothesis_binary_parse", "hypothesis_parse","genre", "label"]
)
generation_datasets = generation_datasets.rename_column("hypothesis", "input")
generation_datasets = generation_datasets.rename_column("premise", "label")
generation_datasets["train"][0]

Loading cached processed dataset at /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-85f791176ff67f6d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-a8aad63bd314eda0.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-4aae539223d7a5ab.arrow


{'input': 'Product and geography are what make cream skimming work. ',
 'label': 'Conceptually cream skimming has two basic dimensions - product and geography.'}

In [30]:
df_dataset = pd.DataFrame({
    "inputs":generation_datasets["train"]["input"],
    "label":generation_datasets["train"]["label"]
})
df_dataset = df_dataset.sample(100).reset_index(drop=True)
df_dataset.head(1)

Unnamed: 0,inputs,label
0,"Stevens was a talkative guy, and many couldn't...",You Stevens shut your trap! Muller's roar brou...


In [22]:
inputs = tokenizer.batch_encode_plus(
    df_dataset["inputs"].tolist(),
    return_tensors="pt", 
    add_special_tokens=False,
    truncation=True,
    padding="max_length",
    max_length=256
    )
labels = tokenizer.batch_encode_plus(
    df_dataset["label"].tolist(),
    return_tensors="pt", 
    add_special_tokens=True,
    truncation=True,
    padding="max_length",
    max_length=256
    )
train_data = []
for i in range(len(inputs["input_ids"])):
    train_data.append(
        {
            "input_ids":inputs["input_ids"][i],
            "token_type_ids":inputs["token_type_ids"][i],
            "attention_mask":inputs["attention_mask"][i],
            "label":labels["input_ids"][i] 
        }
    )
random.shuffle(train_data)
train_size = int(len(train_data)*0.98)
eval_data = train_data[train_size:]

In [23]:
generation_model.config.decoder_start_token_id = tokenizer.cls_token_id
generation_model.config.eos_token_id = tokenizer.sep_token_id
generation_model.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
generation_model.config.vocab_size = generation_model.config.decoder.vocab_size
generation_model.config.max_length = 100
generation_model.config.min_length = 20
generation_model.config.no_repeat_ngram_size = 1
generation_model.config.early_stopping = True
generation_model.config.length_penalty = 2.0
generation_model.config.num_beams = 20


In [25]:
# Train Param
batch_size = 8
generation_model.train()
# https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./',
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=10,
    save_steps=30,
    eval_steps=5000,
    warmup_steps=1000,
    overwrite_output_dir=True,
    save_total_limit=5,
    fp16=False,
    num_train_epochs=3,
    no_cuda=not CUDA_AVAILABLE
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=generation_model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data
)
trainer.train()

"""
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python pytorch_tutorial.py 
jupyter nbconvert --to script pytorch_tutorial.ipynb
"""

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39
The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss


Saving model checkpoint to ./checkpoint-30
tokenizer config file saved in ./checkpoint-30/tokenizer_config.json
Special tokens file saved in ./checkpoint-30/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




'\nCUDA_VISIBLE_DEVICES=0,1,2,3 python pytorch_tutorial.py \njupyter nbconvert --to script train_bert.ipynb\n'

In [26]:
created_model = generation_model.from_pretrained("./checkpoint-30")

In [47]:
tokenized = tokenizer(df_dataset["inputs"][0], return_tensors="pt", truncation=True, padding=True, max_length=256)
pred = created_model.generate(tokenized["input_ids"])
pred

tensor([[  101,  2441,  2701,  3844,  2330,  2992, 16813,  3423,  2457,  8689,
          3813, 10250, 12582, 11781, 13962, 12643, 19487,  3042,  2275,  9841,
          7752, 16475, 16615, 26165, 11345,  5923, 25724, 28509, 14355,  2529,
          3382,  2110,  3627,  8391, 15593,  7273,  7734, 19792, 12693,  2668,
         14468,  5122,  3142,  4393, 10038, 15097, 24417, 19472,  8734, 28350,
         16255,  9831,  3401,  6129,  2542, 17772, 11844, 24474, 19438,  2715,
          3147, 28248, 25021, 25931, 19649, 15969,  8108,  9033,  2100,  3218,
          2892,  4867,  3413, 26033, 18282, 10237, 19425,  4356,  2152,  2659,
          2677,  5698,  2465,  8268,  2593, 12828,  8982, 13373, 20056,  8998,
         24956,  3876,  2013,  2433,  3670,  2084,  2069,  2130,  7783,  8893]])

In [49]:
df_dataset["inputs"][0], tokenizer.decode(pred[0], skip_special_tokens=True, truncation=True, padding=True, max_length=256)

("Stevens was a talkative guy, and many couldn't stand him.",
 'opened closed shut open raised upheld legal court judgment firm cal jude erin brennan aidan solemn phone set dish sink dev sioux frankish pie bone tully warringlier human chance state rule realmais thai dakotajunstan blood burmese pale roman vampire saxon bavarian cambodian khmer byzantine sava wren feastcecing living hopeful grim vigor thom modern cold iced balthazar stil curran quentin dare siy practice cross narrow pass gall mold lancasterolate sight high low mouth demon class judicial either daemon grain norm precedent doctrine someplace settled from form expression than only even intermediate hybrid')