# **89. 事前学習済み言語モデルからの転移学習**

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive (requires the
# google.colab package, i.e. a Colab runtime).
# NOTE(review): no later cell visible in this notebook reads from or writes to
# /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# データダウンロード・Transformerのインストール
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip

! pip install pytorch-lightning==1.8.0
! pip install transformers==4.24.0

--2023-06-22 13:13:28--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘NewsAggregatorDataset.zip’

NewsAggregatorDatas     [   <=>              ]  27.87M  59.8MB/s    in 0.5s    

2023-06-22 13:13:28 (59.8 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203]

Archive:  NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning==1.8.0
  Downloading pytorch_lig

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw corpus. The file is a header-less TSV, so column names are given here.
# quoting=3 is csv.QUOTE_NONE: a double quote is treated as an ordinary character.
# With pandas' default quoting, a field that starts with '"' is read as a quoted
# field, so an unbalanced quote in a headline swallows the following tabs and
# newlines and several records collapse into one bogus TITLE.
# NOTE(review): the example counts printed below change once rows are no longer merged.
df = pd.read_csv(
    "newsCorpora.csv",
    sep="\t",
    quoting=3,
    names=("ID","TITLE","URL","PUBLISHER","CATEGORY","STORY","HOSTNAME","TIMESTAMP"),
)

# Keep only the five target publishers and the two columns the classifier uses.
data = df.loc[df["PUBLISHER"].isin(["Reuters","Huffington Post","Businessweek","Contactmusic.com","Daily Mail"]), ["TITLE","CATEGORY"]]

# Split train : dev : test = 8 : 1 : 1 (fixed random_state so the split is repeatable).
train, others = train_test_split(data, test_size=0.2, random_state=0, shuffle=True)
dev, test = train_test_split(others, test_size=0.5, random_state=0, shuffle=True)

# Sanity check: the three splits partition the filtered data.
assert len(train) + len(dev) + len(test) == len(data)

# Save each split as a TSV file with a header row and no index column.
train.to_csv("train.txt", sep="\t", index=False)
dev.to_csv("dev.txt", sep="\t", index=False)
test.to_csv("test.txt", sep="\t", index=False)

# Show the number of examples per category in each split.
print(f'学習データの事例数\n{train["CATEGORY"].value_counts()}\n')
print(f'検証データの事例数\n{dev["CATEGORY"].value_counts()}\n')
print(f'テストデータの事例数\n{test["CATEGORY"].value_counts()}\n')

学習データの事例数
b    4481
e    4240
t    1214
m     737
Name: CATEGORY, dtype: int64

検証データの事例数
b    575
e    528
t    137
m     94
Name: CATEGORY, dtype: int64

テストデータの事例数
b    571
e    511
t    173
m     79
Name: CATEGORY, dtype: int64



In [None]:
# ====================
# ライブラリの読み込み
# ====================

import glob
import random
from tqdm import tqdm

import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

In [None]:
# ====================
# 前処理：データローダの作成
# ====================

# Pretrained checkpoint name; the tokenizer is loaded from it.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Maximum sequence length handed to the tokenizer.
max_length = 128

# Category code -> integer class id (b=0, t=1, e=2, m=3).
category_dict = {code: idx for idx, code in enumerate(("b", "t", "e", "m"))}

def make_dataset(tokenizer, max_length, fname, label_map=None):
    """Read a two-column (title, category) TSV file and encode every row as tensors.

    The file is parsed with the ``csv`` module, so fields that were quoted when
    the file was written (titles containing a double quote come back wrapped in
    quotes with the inner quotes doubled) are decoded to the original text
    instead of being fed to the tokenizer with the quoting artifacts.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True)``
            and returning a dict-like object of equal-length sequences.
        max_length: Sequences are padded / truncated to this many tokens.
        fname: Path of the TSV file; the first record is a header and is skipped.
        label_map: Mapping from category string to integer id. Defaults to the
            module-level ``category_dict``.

    Returns:
        A list with one dict per kept row: the tokenizer's fields plus
        ``"labels"``, every value converted to a ``torch.Tensor``.

    Raises:
        KeyError: If a row's category is not a key of ``label_map``.
    """
    import csv  # local import so this cell does not depend on another cell's imports

    if label_map is None:
        label_map = category_dict

    dataset_for_loader = []

    # `with` closes the file even if tokenization raises; utf-8 matches what
    # pandas' to_csv writes by default; newline="" is what the csv module expects.
    with open(fname, "r", encoding="utf-8", newline="") as fin:
        reader = csv.reader(fin, delimiter="\t")
        next(reader, None)  # skip the header; tolerate an empty file

        for row in reader:
            # Malformed records are skipped rather than guessed at.
            if len(row) != 2:
                continue
            text, label = row[0].strip(), row[1].strip()

            # A title that is empty or spans several lines / columns is the
            # residue of mis-parsed upstream records, not a real headline.
            if not text or any(ch in text for ch in "\t\r\n"):
                continue

            # Tokenize: pad short titles up to max_length, cut long ones down to it.
            encoding = tokenizer(text, max_length=max_length, padding="max_length", truncation=True)

            # The model expects the gold class id under the key "labels".
            encoding["labels"] = label_map[label]

            # Convert every field to a tensor so DataLoader can batch them.
            dataset_for_loader.append(
                {key: torch.tensor(value) for key, value in encoding.items()}
            )

    return dataset_for_loader

# Encode the three splits with the same tokenizer and length limit.
dataset_train, dataset_val, dataset_test = (
    make_dataset(tokenizer, max_length, split_file)
    for split_file in ("train.txt", "dev.txt", "test.txt")
)

# Wrap each split in a DataLoader. Only the training split is shuffled.
# Validation and test use a larger batch size (256 vs. 32) because no
# gradients are computed for them.
dataloader_train = DataLoader(dataset_train, shuffle=True, batch_size=32)
dataloader_val = DataLoader(dataset_val, shuffle=False, batch_size=256)
dataloader_test = DataLoader(dataset_test, shuffle=False, batch_size=256)

In [None]:
# ====================
# BERTによるテキスト分類
# ====================

class Bert4Classification(pl.LightningModule):
    """LightningModule that fine-tunes ``BertForSequenceClassification``.

    The loss is chosen by the wrapped model from ``num_labels``:
    ``num_labels == 1`` is treated as regression (MSE loss), ``num_labels > 1``
    as classification (cross-entropy loss).

    Each batch is a dict that is unpacked into the model's ``forward``; it must
    contain a ``"labels"`` entry so that the model returns a loss.
    """

    def __init__(self, model_name, num_labels, lr):
        """Load the pretrained model and remember the hyperparameters.

        Args:
            model_name: Name or path passed to ``from_pretrained``.
            num_labels: Number of output classes of the classification head.
            lr: Learning rate used by ``configure_optimizers``.
        """
        super().__init__()
        # Stores the constructor arguments on self.hparams (e.g. self.hparams.lr).
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch; return it for backprop."""
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log("train_loss", loss, batch_size=batch["labels"].size(0))
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        output = self.bert_sc(**batch)
        # Explicit batch_size states how this batch is weighted in the logged mean.
        self.log("val_loss", output.loss, batch_size=batch["labels"].size(0))

    def test_step(self, batch, batch_idx):
        """Predict labels for one batch and log the fraction predicted correctly."""
        # Read the labels without removing them: the batch dict is left intact.
        labels = batch["labels"]
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        # Mean of the element-wise matches == num_correct / batch size; kept as a
        # tensor so no host synchronisation is needed.
        accuracy = (labels_predicted == labels).float().mean()
        self.log("accuracy", accuracy, batch_size=labels.size(0))

    def configure_optimizers(self):
        """Optimize all parameters with Adam at the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [None]:
# ====================
# 訓練
# ====================

# Seed the Python, NumPy and PyTorch RNGs before the model is built, so the
# freshly initialised classification head and the shuffling of the training
# data are repeatable from run to run.
pl.seed_everything(0, workers=True)

model = Bert4Classification(model_name, num_labels=len(category_dict), lr=1e-5)

# Checkpointing: rank checkpoints by validation loss (lower is better) and keep
# the best three, weights only, under "model/".
# NOTE(review): with max_epochs=3 below, "top 3" can retain every epoch's
# checkpoint; the best one is still reported via best_model_path.
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor="val_loss", mode="min", save_top_k=3,
    save_weights_only=True, dirpath="model/"
)

# Train on a single GPU. accelerator/devices replaces the deprecated `gpus=1` flag.
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=3, callbacks=[checkpoint])
trainer.fit(model, dataloader_train, dataloader_val)

# Report which checkpoint scored best on the validation data.
print("ベストモデル: ", checkpoint.best_model_path)
print("ベストモデルの検証用データにおける損失: ", checkpoint.best_model_score)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


ベストモデル:  /content/model/epoch=1-step=668.ckpt
ベストモデルの検証用データにおける損失:  tensor(0.2011, device='cuda:0')


In [None]:
# ====================
# 評価
# ====================

# Evaluate on the test dataloader using the best checkpoint of the preceding fit.
# ckpt_path="best" makes that choice explicit instead of relying on the default
# (which emits a warning). The result gets its own name so it does not rebind
# `test`, which holds the test-split DataFrame in the data-preparation cell.
test_results = trainer.test(dataloaders=dataloader_test, ckpt_path="best")
print("Test accuracy = %.3f" % (test_results[0]["accuracy"]))

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/model/epoch=1-step=668.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from checkpoint at /content/model/epoch=1-step=668.ckpt


Testing: 0it [00:00, ?it/s]

Test accuracy = 0.933
