In [None]:
!git clone https://github.com/coaxsoft/pytorch_bert.git

Cloning into 'pytorch_bert'...
remote: Enumerating objects: 150, done.[K
remote: Counting objects: 100% (150/150), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 150 (delta 81), reused 141 (delta 74), pack-reused 0[K
Receiving objects: 100% (150/150), 29.05 MiB | 7.34 MiB/s, done.
Resolving deltas: 100% (81/81), done.


In [None]:
%cd pytorch_bert/

/content/pytorch_bert


In [None]:
!pip install -r requirements.txt

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->-r requirements.txt (line 3))
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->-r requirements.txt (line 3))
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->-r requirements.txt (line 3))
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->-r requirements.txt (line 3))
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->-r requirements.txt (line 3))
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->-r requirements.txt (line 3))
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-man

In [34]:
import random
import typing
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import torch

from tqdm import tqdm
from torch.utils.data import Dataset
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class IMDBBertDataset(Dataset):
    CLS = '[CLS]'
    PAD = '[PAD]'
    SEP = '[SEP]'
    MASK = '[MASK]'
    UNK = '[UNK]'

    MASK_PERCENTAGE = 0.15

    MASKED_INDICES_COLUMN = 'masked_indices'
    TARGET_COLUMN = 'indices'
    NSP_TARGET_COLUMN = 'is_next'
    TOKEN_MASK_COLUMN = 'token_mask'

    OPTIMAL_LENGTH_PERCENTILE = 70

    def __init__(self, path, ds_from=None, ds_to=None, should_include_text=False):
        # 读取CSV文件，并将'review'列赋值给self.ds
        self.ds: pd.Series = pd.read_csv(path)['review']

        # 如果指定了ds_from或ds_to，则对self.ds进行切片操作
        if ds_from is not None or ds_to is not None:
            self.ds = self.ds[ds_from:ds_to]

        # 获取基本英文分词器，并赋值给self.tokenizer
        self.tokenizer = get_tokenizer('basic_english')
        # 初始化计数器self.counter
        self.counter = Counter()
        # 初始化词汇表self.vocab，初始值为None
        self.vocab = None

        # 初始化最优句子长度self.optimal_sentence_length，初始值为None
        self.optimal_sentence_length = None
        # 初始化是否包含文本的标志self.should_include_text，并赋值
        self.should_include_text = should_include_text

        # 如果should_include_text为True，则设置self.columns为包含文本的列名列表
        if should_include_text:
            self.columns = ['masked_sentence', self.MASKED_INDICES_COLUMN, 'sentence', self.TARGET_COLUMN,
                            self.TOKEN_MASK_COLUMN,
                            self.NSP_TARGET_COLUMN]
        # 否则，设置self.columns为不包含文本的列名列表
        else:
            self.columns = [self.MASKED_INDICES_COLUMN, self.TARGET_COLUMN, self.TOKEN_MASK_COLUMN,
                            self.NSP_TARGET_COLUMN]

        # 调用prepare_dataset方法准备数据集，并将结果赋值给self.df
        self.df = self.prepare_dataset()
        print(len(self.df))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # 根据索引从数据框中获取数据项
        item = self.df.iloc[idx]

        # 将被遮盖的索引列转换为长整型张量
        inp = torch.Tensor(item[self.MASKED_INDICES_COLUMN]).long()
        # 将令牌掩码列转换为布尔型张量
        token_mask = torch.Tensor(item[self.TOKEN_MASK_COLUMN]).bool()

        # 将目标列转换为长整型张量，并用令牌掩码填充为0
        mask_target = torch.Tensor(item[self.TARGET_COLUMN]).long()
        mask_target = mask_target.masked_fill_(token_mask, 0)

        # 生成注意力掩码，判断输入张量中的元素是否等于填充符PAD
        attention_mask = (inp == self.vocab[self.PAD]).unsqueeze(0)

        # 根据NSP目标列的值，生成NSP目标张量
        if item[self.NSP_TARGET_COLUMN] == 0:
            t = [1, 0]
        else:
            t = [0, 1]
        nsp_target = torch.Tensor(t)

        return (
            # 将输入张量、注意力掩码、令牌掩码、目标张量和NSP目标张量转移到指定设备上
            inp.to(device),
            attention_mask.to(device),
            token_mask.to(device),
            mask_target.to(device),
            nsp_target.to(device)
        )

    def prepare_dataset(self) -> pd.DataFrame:
        # 存储句子的列表
        sentences = []
        # 存储NSP的列表
        nsp = []
        # 存储句子长度的列表
        sentence_lens = []

        # 将数据集按句子进行分割
        # Split dataset on sentences
        for review in self.ds:
            # 将每条评论按'. '分割成句子列表
            review_sentences = review.split('. ')
            # 将句子列表添加到sentences中
            sentences += review_sentences
            # 更新句子长度的列表
            self._update_length(review_sentences, sentence_lens)
        # 找到最优的句子长度
        self.optimal_sentence_length = self._find_optimal_sentence_length(sentence_lens)

        print("找到最优的句子长度:")
        print(self.optimal_sentence_length)

        print("Create vocabulary")
        # 遍历句子列表
        for sentence in tqdm(sentences):
            # 对句子进行分词
            s = self.tokenizer(sentence)
            # 更新词汇计数器
            self.counter.update(s)

        # 填充词汇表
        self._fill_vocab()

        print("词表词汇数：")
        print(len(self.vocab))

        print("Preprocessing dataset")
        # 遍历数据集
        for review in tqdm(self.ds):
            # 将评论按'. '分割成句子列表
            review_sentences = review.split('. ')
            # 如果句子数量大于1
            if len(review_sentences) > 1:
                # 遍历句子列表中的每个句子（除了最后一个）
                for i in range(len(review_sentences) - 1):
                    # 创建一个真正的NSP项
                    # True NSP item
                    first, second = self.tokenizer(review_sentences[i]), self.tokenizer(review_sentences[i + 1])
                    # 将NSP项添加到nsp列表中
                    nsp.append(self._create_item(first, second, 1))

                    # 创建一个虚假的NSP项
                    # False NSP item
                    first, second = self._select_false_nsp_sentences(sentences)
                    first, second = self.tokenizer(first), self.tokenizer(second)
                    # 将NSP项添加到nsp列表中
                    nsp.append(self._create_item(first, second, 0))

        # 将nsp列表转换为DataFrame
        df = pd.DataFrame(nsp, columns=self.columns)
        return df

    def _update_length(self, sentences: typing.List[str], lengths: typing.List[int]):
        for v in sentences:
            l = len(v.split())
            lengths.append(l)
        return lengths

    def _find_optimal_sentence_length(self, lengths: typing.List[int]):
        # 将长度列表转换为NumPy数组
        arr = np.array(lengths)
        # 计算数组中指定百分位数的值，并转换为整数返回
        return int(np.percentile(arr, self.OPTIMAL_LENGTH_PERCENTILE))

    def _fill_vocab(self):
        # specials= argument is only in 0.12.0 version
        # specials=[self.CLS, self.PAD, self.MASK, self.SEP, self.UNK]
        self.vocab = vocab(self.counter, min_freq=2)

        # 0.11.0 uses this approach to insert specials
        self.vocab.insert_token(self.CLS, 0)
        self.vocab.insert_token(self.PAD, 1)
        self.vocab.insert_token(self.MASK, 2)
        self.vocab.insert_token(self.SEP, 3)
        self.vocab.insert_token(self.UNK, 4)
        self.vocab.set_default_index(4)

    def _create_item(self, first: typing.List[str], second: typing.List[str], target: int = 1):
        # 创建带有掩码的句子项
        # Create masked sentence item
        updated_first, first_mask = self._preprocess_sentence(first.copy())
        updated_second, second_mask = self._preprocess_sentence(second.copy())

        nsp_sentence = updated_first + [self.SEP] + updated_second
        nsp_indices = self.vocab.lookup_indices(nsp_sentence)
        inverse_token_mask = first_mask + [True] + second_mask

        # 创建未随机掩码单词的句子项
        # Create sentence item without masking random words
        first, _ = self._preprocess_sentence(first.copy(), should_mask=False)
        second, _ = self._preprocess_sentence(second.copy(), should_mask=False)
        original_nsp_sentence = first + [self.SEP] + second
        original_nsp_indices = self.vocab.lookup_indices(original_nsp_sentence)

        if self.should_include_text:
            return (
                # 带有掩码的句子
                nsp_sentence,
                # 带有掩码的句子索引
                nsp_indices,
                # 原始句子
                original_nsp_sentence,
                # 原始句子索引
                original_nsp_indices,
                # 逆向标记掩码
                inverse_token_mask,
                # 目标值
                target
            )
        else:
            return (
                # 带有掩码的句子索引
                nsp_indices,
                # 原始句子索引
                original_nsp_indices,
                # 逆向标记掩码
                inverse_token_mask,
                # 目标值
                target
            )

    def _select_false_nsp_sentences(self, sentences: typing.List[str]):
        """Select sentences to create false NSP item

        Args:
            sentences: list of all sentences

        Returns:
            tuple of two sentences. The second one NOT the next sentence
        """
        sentences_len = len(sentences)
        sentence_index = random.randint(0, sentences_len - 1)
        next_sentence_index = random.randint(0, sentences_len - 1)

        # 确保它不是真正的下一句
        # To be sure that it's not real next sentence
        while next_sentence_index == sentence_index + 1:
            next_sentence_index = random.randint(0, sentences_len - 1)

        return sentences[sentence_index], sentences[next_sentence_index]

    def _preprocess_sentence(self, sentence: typing.List[str], should_mask: bool = True):
        # 初始化一个空的inverse_token_mask变量
        inverse_token_mask = None

        # 如果should_mask为True，则对句子进行掩码处理
        if should_mask:
            # 调用_mask_sentence方法对句子进行掩码处理，并返回处理后的句子和掩码后的inverse_token_mask
            sentence, inverse_token_mask = self._mask_sentence(sentence)

        # 确保 inverse_token_mask 是一个列表
        if inverse_token_mask is None:
            inverse_token_mask = []

        # 在句子开头添加[CLS]标记，并将inverse_token_mask的开头添加一个True
        # 调用_pad_sentence方法对句子进行填充处理，并返回处理后的句子和填充后的inverse_token_mask
        sentence, inverse_token_mask = self._pad_sentence([self.CLS] + sentence, [True] + inverse_token_mask)

        return sentence, inverse_token_mask

    def _mask_sentence(self, sentence: typing.List[str]):
        """Replace MASK_PERCENTAGE (15%) of words with special [MASK] symbol
        or with random word from vocabulary

        Args:
            sentence: sentence to process

        Returns:
            tuple of processed sentence and inverse token mask
        """
        len_s = len(sentence)
        inverse_token_mask = [True for _ in range(max(len_s, self.optimal_sentence_length))]

        # 计算需要遮盖的词的数量
        mask_amount = round(len_s * self.MASK_PERCENTAGE)
        for _ in range(mask_amount):
            # 随机选择一个词的索引
            i = random.randint(0, len_s - 1)

            # 以0.8的概率用[MASK]符号替换选中的词
            if random.random() < 0.8:
                sentence[i] = self.MASK
            else:
                # 以0.2的概率从词汇表中随机选择一个词替换选中的词
                # 注意：索引小于5的是特殊标记，参见self._insert_specials方法
                # All is below 5 is special token
                # see self._insert_specials method
                j = random.randint(5, len(self.vocab) - 1)
                sentence[i] = self.vocab.lookup_token(j)

            # 将遮盖词的索引位置在inverse_token_mask中标记为False
            inverse_token_mask[i] = False
        return sentence, inverse_token_mask

    def _pad_sentence(self, sentence: typing.List[str], inverse_token_mask: typing.List[bool] = None):
        len_s = len(sentence)

        # 如果句子长度大于等于最优句子长度
        if len_s >= self.optimal_sentence_length:
            # 取句子前最优句子长度的部分
            s = sentence[:self.optimal_sentence_length]
        else:
            # 否则，在句子末尾补充PAD，直到长度为最优句子长度
            s = sentence + [self.PAD] * (self.optimal_sentence_length - len_s)

        # 如果提供了inverse_token_mask，也需要进行填充
        # inverse token mask should be padded as well
        if inverse_token_mask:
            len_m = len(inverse_token_mask)

            # 如果inverse_token_mask的长度大于等于最优句子长度
            if len_m >= self.optimal_sentence_length:
                # 取inverse_token_mask前最优句子长度的部分
                inverse_token_mask = inverse_token_mask[:self.optimal_sentence_length]
            else:
                # 否则，在inverse_token_mask末尾补充True，直到长度为最优句子长度
                inverse_token_mask = inverse_token_mask + [True] * (self.optimal_sentence_length - len_m)

        return s, inverse_token_mask





In [39]:
import torch

from torch import nn
import torch.nn.functional as f


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class JointEmbedding(nn.Module):

    def __init__(self, vocab_size, size):
        # 调用父类的初始化方法
        super(JointEmbedding, self).__init__()

        # 初始化成员变量 size
        self.size = size

        # 初始化 token 的嵌入层，大小为 vocab_size x size
        self.token_emb = nn.Embedding(vocab_size, size)
        # 初始化 segment 的嵌入层，大小为 vocab_size x size
        self.segment_emb = nn.Embedding(vocab_size, size)

        # 初始化层归一化层，输入特征维度为 size
        self.norm = nn.LayerNorm(size)

    def forward(self, input_tensor):
        # 获取输入张量的句子大小
        sentence_size = input_tensor.size(-1)
        # 调用 attention_position 方法，传入 self.size 和 input_tensor，得到位置张量
        pos_tensor = self.attention_position(self.size, input_tensor)

        # 创建一个与 input_tensor 相同形状的零张量，并将其设备设为 device
        segment_tensor = torch.zeros_like(input_tensor).to(device)
        # 将 segment_tensor 中后半部分的元素设为 1
        segment_tensor[:, sentence_size // 2 + 1:] = 1

        # 对 input_tensor 进行 token_emb 嵌入，与 segment_tensor 进行 segment_emb 嵌入，并与 pos_tensor 相加，得到输出张量
        output = self.token_emb(input_tensor) + self.segment_emb(segment_tensor) + pos_tensor
        # 对输出张量进行归一化处理
        return self.norm(output)

    def attention_position(self, dim, input_tensor):
        # 获取输入张量的批处理大小
        batch_size = input_tensor.size(0)
        # 获取输入张量的句子长度
        sentence_size = input_tensor.size(-1)

        # 创建一个从0到句子长度的长整型张量，并指定设备
        pos = torch.arange(sentence_size, dtype=torch.long).to(device)
        # 创建一个从0到维度大小的长整型张量，并指定设备
        d = torch.arange(dim, dtype=torch.long).to(device)
        # 将维度张量转换为范围在0到2之间的浮点数张量
        d = (2 * d / dim)

        # 将位置张量增加一个新的维度
        pos = pos.unsqueeze(1)
        # 根据维度张量对位置张量进行缩放
        pos = pos / (1e4 ** d)

        # 对位置张量的偶数位置应用正弦函数
        pos[:, ::2] = torch.sin(pos[:, ::2])
        # 对位置张量的奇数位置应用余弦函数
        pos[:, 1::2] = torch.cos(pos[:, 1::2])

        # 将位置张量扩展为与输入张量相同大小的张量
        return pos.expand(batch_size, *pos.size())

    def numeric_position(self, dim, input_tensor):
        # 创建一个从0到dim-1的一维张量，数据类型为long，并将其转移到当前设备
        pos_tensor = torch.arange(dim, dtype=torch.long).to(device)
        # 将pos_tensor扩展为与input_tensor形状相同的张量
        return pos_tensor.expand_as(input_tensor)


class AttentionHead(nn.Module):

    def __init__(self, dim_inp, dim_out):
        # 调用父类的构造函数
        super(AttentionHead, self).__init__()

        # 初始化输入维度
        self.dim_inp = dim_inp

        # 初始化查询向量转换层
        self.q = nn.Linear(dim_inp, dim_out)
        # 初始化键向量转换层
        self.k = nn.Linear(dim_inp, dim_out)
        # 初始化值向量转换层
        self.v = nn.Linear(dim_inp, dim_out)

    def forward(self, input_tensor: torch.Tensor, attention_mask: torch.Tensor = None):
        # 获取query、key和value
        query, key, value = self.q(input_tensor), self.k(input_tensor), self.v(input_tensor)

        # 计算缩放因子
        scale = query.size(1) ** 0.5
        # 计算注意力分数
        scores = torch.bmm(query, key.transpose(1, 2)) / scale

        # 如果提供了注意力掩码，则使用掩码填充注意力分数
        scores = scores.masked_fill_(attention_mask, -1e9)
        # 对注意力分数应用softmax函数
        attn = f.softmax(scores, dim=-1)
        # 计算上下文向量
        context = torch.bmm(attn, value)

        return context


class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, dim_inp, dim_out):
        # 调用父类构造函数
        super(MultiHeadAttention, self).__init__()

        # 创建一个AttentionHead模块列表，根据num_heads的值初始化每个模块的参数
        self.heads = nn.ModuleList([
            # 创建一个AttentionHead模块，输入维度为dim_inp，输出维度为dim_out
            AttentionHead(dim_inp, dim_out) for _ in range(num_heads)
        ])
        # 创建一个线性层，输入维度为dim_out * num_heads，输出维度为dim_inp
        self.linear = nn.Linear(dim_out * num_heads, dim_inp)
        # 创建一个层归一化层，输入维度为dim_inp
        self.norm = nn.LayerNorm(dim_inp)

    def forward(self, input_tensor: torch.Tensor, attention_mask: torch.Tensor):
        # 遍历模型中的每个注意力头，并应用注意力机制
        s = [head(input_tensor, attention_mask) for head in self.heads]
        # 将所有注意力头的输出在最后一个维度上进行拼接
        scores = torch.cat(s, dim=-1)
        # 对拼接后的分数进行线性变换
        scores = self.linear(scores)
        # 对线性变换后的分数进行归一化处理
        return self.norm(scores)


class Encoder(nn.Module):

    def __init__(self, dim_inp, dim_out, attention_heads=4, dropout=0.1):
        super(Encoder, self).__init__()

        # 初始化多头注意力机制模块
        # 输入参数为：多头注意力头数、输入维度、输出维度
        # 输出维度为：batch_size x sentence size x dim_inp
        self.attention = MultiHeadAttention(attention_heads, dim_inp, dim_out)  # batch_size x sentence size x dim_inp

        # 初始化前馈神经网络模块
        # 包含线性层、dropout层、GELU激活函数、线性层、dropout层
        self.feed_forward = nn.Sequential(
            nn.Linear(dim_inp, dim_out),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(dim_out, dim_inp),
            nn.Dropout(dropout)
        )

        # 初始化层归一化模块
        # 输入参数为：输入维度
        self.norm = nn.LayerNorm(dim_inp)

    def forward(self, input_tensor: torch.Tensor, attention_mask: torch.Tensor):
        # 调用self.attention方法，传入input_tensor和attention_mask作为参数，得到context
        context = self.attention(input_tensor, attention_mask)
        # 调用self.feed_forward方法，传入context作为参数，得到res
        res = self.feed_forward(context)
        # 调用self.norm方法，传入res作为参数，得到最终结果并返回
        return self.norm(res)


class BERT(nn.Module):

    def __init__(self, vocab_size, dim_inp, dim_out, attention_heads=4):
        super(BERT, self).__init__()

        # 初始化联合嵌入层
        self.embedding = JointEmbedding(vocab_size, dim_inp)

        # 初始化编码器
        self.encoder = Encoder(dim_inp, dim_out, attention_heads)

        # 初始化单词预测层
        # 将输入维度dim_inp映射到词汇表大小vocab_size的线性层
        self.token_prediction_layer = nn.Linear(dim_inp, vocab_size)

        # 初始化Softmax层
        # 对输出进行对数Softmax运算
        self.softmax = nn.LogSoftmax(dim=-1)

        # 初始化分类层
        # 将输入维度dim_inp映射到2个类别的线性层
        self.classification_layer = nn.Linear(dim_inp, 2)

    def forward(self, input_tensor: torch.Tensor, attention_mask: torch.Tensor):
        # 将输入张量进行嵌入操作
        embedded = self.embedding(input_tensor)
        print("embedded shape")
        print(embedded.shape)

        # 将嵌入后的张量通过编码器进行编码
        encoded = self.encoder(embedded, attention_mask)

        # 对编码后的张量进行令牌预测
        token_predictions = self.token_prediction_layer(encoded)

        # 提取编码后张量的第一个词向量
        first_word = encoded[:, 0, :]

        # 对令牌预测结果应用softmax操作
        # 对第一个词向量进行分类
        return self.softmax(token_predictions), self.classification_layer(first_word)


In [38]:
import time
from datetime import datetime
from pathlib import Path

import torch

from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def percentage(batch_size: int, max_index: int, current_index: int):
    """Calculate epoch progress percentage

    Args:
        batch_size: batch size
        max_index: max index in epoch
        current_index: current index

    Returns:
        Passed percentage of dataset
    """
    batched_max = max_index // batch_size
    return round(current_index / batched_max * 100, 2)


def nsp_accuracy(result: torch.Tensor, target: torch.Tensor):
    """Calculate NSP accuracy between two tensors

    Args:
        result: result calculated by model
        target: real target

    Returns:
        NSP accuracy
    """
    s = (result.argmax(1) == target.argmax(1)).sum()
    return round(float(s / result.size(0)), 2)


def token_accuracy(result: torch.Tensor, target: torch.Tensor, inverse_token_mask: torch.Tensor):
    """Calculate MLM accuracy between ONLY masked words

    Args:
        result: result calculated by model
        target: real target
        inverse_token_mask: well-known inverse token mask

    Returns:
        MLM accuracy
    """
    r = result.argmax(-1).masked_select(~inverse_token_mask)
    t = target.masked_select(~inverse_token_mask)
    s = (r == t).sum()
    return round(float(s / (result.size(0) * result.size(1))), 2)


class BertTrainer:

    def __init__(self,
                 model: BERT,
                 dataset: IMDBBertDataset,
                 log_dir: Path,
                 checkpoint_dir: Path = None,
                 print_progress_every: int = 10,
                 print_accuracy_every: int = 50,
                 batch_size: int = 24,
                 learning_rate: float = 0.005,
                 epochs: int = 5,
                 ):
        self.model = model
        self.dataset = dataset

        self.batch_size = batch_size
        self.epochs = epochs
        self.current_epoch = 0

        self.loader = DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True)

        self.writer = SummaryWriter(str(log_dir))
        self.checkpoint_dir = checkpoint_dir

        self.criterion = nn.BCEWithLogitsLoss().to(device)
        self.ml_criterion = nn.NLLLoss(ignore_index=0).to(device)
        self.optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.015)

        self._splitter_size = 35

        self._ds_len = len(self.dataset)
        self._batched_len = self._ds_len // self.batch_size

        self._print_every = print_progress_every
        self._accuracy_every = print_accuracy_every

    def print_summary(self):
        ds_len = len(self.dataset)

        print("Model Summary\n")
        print('=' * self._splitter_size)
        print(f"Device: {device}")
        print(f"Training dataset len: {ds_len}")
        print(f"Max / Optimal sentence len: {self.dataset.optimal_sentence_length}")
        print(f"Vocab size: {len(self.dataset.vocab)}")
        print(f"Batch size: {self.batch_size}")
        print(f"Batched dataset len: {self._batched_len}")
        print('=' * self._splitter_size)
        print()

    def __call__(self):
        for self.current_epoch in range(self.current_epoch, self.epochs):
            loss = self.train(self.current_epoch)
            self.save_checkpoint(self.current_epoch, step=-1, loss=loss)

    def train(self, epoch: int):
        print(f"Begin epoch {epoch}")

        prev = time.time()
        average_nsp_loss = 0
        average_mlm_loss = 0
        for i, value in enumerate(self.loader):
            index = i + 1
            inp, mask, inverse_token_mask, token_target, nsp_target = value
            self.optimizer.zero_grad()
            print("inp shape")
            print(inp.shape)
            token, nsp = self.model(inp, mask)

            tm = inverse_token_mask.unsqueeze(-1).expand_as(token)
            token = token.masked_fill(tm, 0)

            loss_token = self.ml_criterion(token.transpose(1, 2), token_target)  # 1D tensor as target is required
            loss_nsp = self.criterion(nsp, nsp_target)

            loss = loss_token + loss_nsp
            average_nsp_loss += loss_nsp
            average_mlm_loss += loss_token

            loss.backward()
            self.optimizer.step()

            if index % self._print_every == 0:
                elapsed = time.gmtime(time.time() - prev)
                s = self.training_summary(elapsed, index, average_nsp_loss, average_mlm_loss)

                if index % self._accuracy_every == 0:
                    s += self.accuracy_summary(index, token, nsp, token_target, nsp_target, inverse_token_mask)

                print(s)

                average_nsp_loss = 0
                average_mlm_loss = 0
        return loss

    def training_summary(self, elapsed, index, average_nsp_loss, average_mlm_loss):
        passed = percentage(self.batch_size, self._ds_len, index)
        global_step = self.current_epoch * len(self.loader) + index

        print_nsp_loss = average_nsp_loss / self._print_every
        print_mlm_loss = average_mlm_loss / self._print_every

        s = f"{time.strftime('%H:%M:%S', elapsed)}"
        s += f" | Epoch {self.current_epoch + 1} | {index} / {self._batched_len} ({passed}%) | " \
             f"NSP loss {print_nsp_loss:6.2f} | MLM loss {print_mlm_loss:6.2f}"

        self.writer.add_scalar("NSP loss", print_nsp_loss, global_step=global_step)
        self.writer.add_scalar("MLM loss", print_mlm_loss, global_step=global_step)
        return s

    def accuracy_summary(self, index, token, nsp, token_target, nsp_target, inverse_token_mask):
        global_step = self.current_epoch * len(self.loader) + index
        nsp_acc = nsp_accuracy(nsp, nsp_target)
        token_acc = token_accuracy(token, token_target, inverse_token_mask)

        self.writer.add_scalar("NSP train accuracy", nsp_acc, global_step=global_step)
        self.writer.add_scalar("Token train accuracy", token_acc, global_step=global_step)

        return f" | NSP accuracy {nsp_acc} | Token accuracy {token_acc}"

    def save_checkpoint(self, epoch, step, loss):
        if not self.checkpoint_dir:
            return

        prev = time.time()
        name = f"bert_epoch{epoch}_step{step}_{datetime.utcnow().timestamp():.0f}.pt"

        torch.save({
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': loss,
        }, self.checkpoint_dir.joinpath(name))

        print()
        print('=' * self._splitter_size)
        print(f"Model saved as '{name}' for {time.time() - prev:.2f}s")
        print('=' * self._splitter_size)
        print()

    def load_checkpoint(self, path: Path):
        print('=' * self._splitter_size)
        print(f"Restoring model {path}")
        checkpoint = torch.load(path)
        self.current_epoch = checkpoint['epoch']
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print("Model is restored.")
        print('=' * self._splitter_size)


In [40]:
import datetime

import torch

from pathlib import Path



EMB_SIZE = 64
HIDDEN_SIZE = 36
EPOCHS = 4
BATCH_SIZE = 12
NUM_HEADS = 4



timestamp = datetime.datetime.utcnow().timestamp()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    torch.cuda.empty_cache()

if __name__ == '__main__':
    print("Prepare dataset")
    ds = IMDBBertDataset('./data/imdb.csv', ds_from=0, ds_to=1000)

    bert = BERT(len(ds.vocab), EMB_SIZE, HIDDEN_SIZE, NUM_HEADS).to(device)
    trainer = BertTrainer(
        model=bert,
        dataset=ds,
        log_dir='data/logs/bert_experiment',
        checkpoint_dir='./data/bert_checkpoints',
        print_progress_every=20,
        print_accuracy_every=200,
        batch_size=BATCH_SIZE,
        learning_rate=0.00007,
        epochs=15
    )

    trainer.print_summary()
    trainer()


Prepare dataset
找到最优的句子长度:
27
Create vocabulary


100%|██████████| 9561/9561 [00:00<00:00, 42218.94it/s]


词表词汇数：
9626
Preprocessing dataset


100%|██████████| 1000/1000 [00:01<00:00, 560.38it/s]


17122
Model Summary

Device: cpu
Training dataset len: 17122
Max / Optimal sentence len: 27
Vocab size: 9626
Batch size: 12
Batched dataset len: 1426

Begin epoch 0
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Size([12, 55, 64])
inp shape
torch.Size([12, 55])
embedded shape
torch.Si

KeyboardInterrupt: 