Source: https://medium.com/data-and-beyond/complete-guide-to-building-bert-model-from-sratch-3e6562228891

In [None]:
# Prepare Dataset
# install packages
!pip install transformers datasets tokenizers
!wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
!unzip -qq cornell_movie_dialogs_corpus.zip
!rm cornell_movie_dialogs_corpus.zip
!mkdir datasets
!mv cornell\ movie-dialogs\ corpus/movie_conversations.txt ./datasets
!mv cornell\ movie-dialogs\ corpus/movie_lines.txt ./datasets

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manyl

In [None]:
## # import
import ast
import itertools
import math
import os
import random
import re
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F
import tqdm
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

In [None]:
MAX_LEN = 64

# Read both corpus files completely into memory, one list entry per raw line
corpus_movie_conv = './datasets/movie_conversations.txt'
corpus_movie_lines = './datasets/movie_lines.txt'

with open(corpus_movie_conv, mode='r', encoding='iso-8859-1') as conv_file:
    conv = list(conv_file)

with open(corpus_movie_lines, mode='r', encoding='iso-8859-1') as lines_file:
    lines = list(lines_file)

In [None]:
### index every corpus line by its first field
# Records are split on the " +++$+++ " separator; the first field becomes the
# key and the last field (still carrying its trailing newline) the value.
lines_dic = {
    fields[0]: fields[-1]
    for fields in (raw_line.split(" +++$+++ ") for raw_line in lines)
}

In [None]:
### generate question answer pairs
pairs = []
for con in conv:
    # The last field of a conversation record is the text of a Python list of
    # line ids. It comes from a downloaded file, so parse it as a literal with
    # ast.literal_eval instead of executing it with eval.
    ids = ast.literal_eval(con.split(" +++$+++ ")[-1].strip())

    # pair every line of the conversation with the line that follows it
    for first_id, second_id in zip(ids, ids[1:]):
        first = lines_dic[first_id].strip()
        second = lines_dic[second_id].strip()

        # keep at most MAX_LEN whitespace-separated words per sentence
        pairs.append([
            ' '.join(first.split()[:MAX_LEN]),
            ' '.join(second.split()[:MAX_LEN]),
        ])

##  WordPiece Tokenization

In [None]:
# WordPiece tokenizer

### save data as txt file
# exist_ok=True keeps the cell re-runnable when the directory already exists
os.makedirs('./data', exist_ok=True)
text_data = []
file_count = 0

for sample in tqdm.tqdm([x[0] for x in pairs]):
    text_data.append(sample)

    # once we hit the 10K mark, save to file
    if len(text_data) == 10000:
        with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# The number of samples is rarely a multiple of 10K: write the remaining
# samples as well, otherwise they never reach the tokenizer training files.
if text_data:
    with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))
    text_data = []
    file_count += 1

paths = [str(x) for x in Path('./data').glob('**/*.txt')]

### training own tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    # '[PAD]' is listed first so that it receives id 0, which the rest of the
    # notebook uses as padding / ignore index
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )

# save the vocabulary, then reload it as a transformers BertTokenizer
os.makedirs('./bert-it-1', exist_ok=True)
tokenizer.save_model('./bert-it-1', 'bert-it')
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)

100%|██████████| 221616/221616 [00:00<00:00, 1559733.73it/s]


In [None]:
class BERTDataset(Dataset):
    """Turns sentence pairs into fixed-length BERT pre-training samples.

    Every item holds a sentence pair (the second sentence is replaced by a
    randomly drawn one about half of the time), randomly masked token ids,
    the matching prediction targets, segment ids and the is-next flag.
    """

    def __init__(self, data_pair, tokenizer, seq_len=64):
        """
        :param data_pair: indexable collection of [first_sentence, second_sentence]
        :param tokenizer: callable returning a dict with 'input_ids'; must expose a
            `vocab` mapping containing '[PAD]', '[CLS]', '[SEP]' and '[MASK]'
        :param seq_len: length every returned sequence is cut or padded to
        """
        self.lines = data_pair
        self.corpus_lines = len(data_pair)
        self.seq_len = seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of sentence pairs."""
        return self.corpus_lines

    def __getitem__(self, item):
        """Build the sample for pair `item` as a dict of tensors."""
        # Step 1: sentence pair plus its is-next flag (1 = real pair, 0 = random second sentence)
        first, second, is_next_label = self.get_sent(item)

        # Step 2: mask / corrupt random words of both sentences
        first_ids, first_targets = self.random_word(first)
        second_ids, second_targets = self.random_word(second)

        vocab = self.tokenizer.vocab
        cls_id, sep_id, pad_id = vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']

        # Step 3: wrap with [CLS] / [SEP]; the special positions get PAD as target
        first_ids = [cls_id] + first_ids + [sep_id]
        second_ids = second_ids + [sep_id]
        first_targets = [pad_id] + first_targets + [pad_id]
        second_targets = second_targets + [pad_id]

        # Step 4: join both sentences, cut to seq_len and fill the rest with PAD
        limit = self.seq_len
        bert_input = (first_ids + second_ids)[:limit]
        bert_label = (first_targets + second_targets)[:limit]
        segment_label = ([1] * len(first_ids) + [2] * len(second_ids))[:limit]
        padding = [pad_id] * (limit - len(bert_input))

        sample = {"bert_input": bert_input + padding,
                  "bert_label": bert_label + padding,
                  "segment_label": segment_label + padding,
                  "is_next": is_next_label}

        return {name: torch.tensor(ids) for name, ids in sample.items()}

    def random_word(self, sentence):
        """Tokenize `sentence` word by word and corrupt about 15% of the words.

        Returns (ids, targets) of equal length. Targets are 0 for untouched
        positions and the original token id for selected positions.
        """
        corrupted_ids = []
        target_ids = []

        for word in sentence.split():
            draw = random.random()

            # word-piece ids of this word without the surrounding [CLS] / [SEP]
            piece_ids = self.tokenizer(word)['input_ids'][1:-1]
            n_pieces = len(piece_ids)

            # 85% of the words are left alone and carry no target
            if draw >= 0.15:
                corrupted_ids.extend(piece_ids)
                target_ids.extend([0] * n_pieces)
                continue

            # rescale the draw to [0, 1) to choose how the word is corrupted
            draw /= 0.15
            if draw < 0.8:
                # 80%: replace every piece with [MASK]
                corrupted_ids.extend([self.tokenizer.vocab['[MASK]']] * n_pieces)
            elif draw < 0.9:
                # 10%: replace every piece with a random vocabulary id
                corrupted_ids.extend(
                    random.randrange(len(self.tokenizer.vocab)) for _ in range(n_pieces))
            else:
                # 10%: keep the original pieces
                corrupted_ids.extend(piece_ids)

            target_ids.extend(piece_ids)

        assert len(corrupted_ids) == len(target_ids)
        return corrupted_ids, target_ids

    def get_sent(self, index):
        '''return the pair at `index`, or its first sentence with a random second one'''
        first, second = self.get_corpus_line(index)

        # positive pair when the draw exceeds 0.5, negative pair otherwise
        keep_pair = random.random() > 0.5
        if keep_pair:
            return first, second, 1
        return first, self.get_random_line(), 0

    def get_corpus_line(self, item):
        '''return both sentences of pair `item`'''
        pair = self.lines[item]
        return pair[0], pair[1]

    def get_random_line(self):
        '''return the second sentence of a randomly chosen pair'''
        choice = random.randrange(len(self.lines))
        return self.lines[choice][1]

In [None]:
# Build the dataset and its loader, then look at one batch and one raw sample.
train_data = BERTDataset(
   pairs, seq_len=MAX_LEN, tokenizer=tokenizer)
# (a stray line-continuation backslash after this call used to glue it to the
# next statement and made the whole cell a SyntaxError)
train_loader = DataLoader(
   train_data, batch_size=32, shuffle=True, pin_memory=True)
sample_data = next(iter(train_loader))

print(train_data[random.randrange(len(train_data))])

SyntaxError: ignored

In [None]:
class PositionalEmbedding(torch.nn.Module):
    """Fixed (non-trainable) sinusoidal positional encoding.

    For position ``pos`` and even dimension index ``2k``::

        pe[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
        pe[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

    The table is stored as a non-persistent buffer: it follows the module
    across ``.to(device)`` calls but adds no entry to ``state_dict()``.
    """

    def __init__(self, d_model, max_len=128):
        """
        :param d_model: size of the encoding vector of one position
        :param max_len: number of positions to precompute
        """
        super().__init__()

        # (max_len, 1) column of positions and one frequency per sin/cos pair
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float)
        div_term = torch.pow(10000.0, even_dims / d_model)
        angles = position / div_term

        pe = torch.zeros(max_len, d_model).float()
        pe[:, 0::2] = torch.sin(angles)
        # an odd d_model has one cosine column fewer than sine columns
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # include the batch size: (1, max_len, d_model)
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return the encodings for the positions of ``x``.

        :param x: assumed to be token ids of shape (batch, seq_len) or (seq_len,);
            only its sequence length is used
        :return: tensor of shape (1, min(seq_len, max_len), d_model)
        """
        seq_len = x.size(1) if x.dim() > 1 else x.size(0)
        return self.pe[:, :seq_len]

class BERTEmbedding(torch.nn.Module):
    """
    Input embedding of BERT: the element-wise sum of three parts
        1. token embedding      : learned lookup table over the vocabulary
        2. positional embedding : position signal produced by PositionalEmbedding
        3. segment embedding    : learned lookup for the sentence id (sent_A:1, sent_B:2)
    Dropout is applied to the summed result.
    """

    def __init__(self, vocab_size, embed_size, seq_len=64, dropout=0.1):
        """
        :param vocab_size: number of rows of the token embedding table
        :param embed_size: width of every embedding vector
        :param seq_len: number of positions handed to PositionalEmbedding as max_len
        :param dropout: dropout probability applied to the summed embedding
        """
        super().__init__()
        self.embed_size = embed_size

        # index 0 is the padding index of both tables: its row is not updated by training
        embedding = torch.nn.Embedding
        self.token = embedding(vocab_size, embed_size, padding_idx=0)
        self.segment = embedding(3, embed_size, padding_idx=0)
        self.position = PositionalEmbedding(d_model=embed_size, max_len=seq_len)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, sequence, segment_label):
        # (m, seq_len) --> (m, seq_len, embed_size)
        token_part = self.token(sequence)
        position_part = self.position(sequence)
        segment_part = self.segment(segment_label)
        return self.dropout(token_part + position_part + segment_part)

In [None]:
### attention layers
class MultiHeadedAttention(torch.nn.Module):
    """Multi-head scaled dot-product attention with a mask on the scores."""

    def __init__(self, heads, d_model, dropout=0.1):
        """
        :param heads: number of attention heads; must divide d_model
        :param d_model: width of the input and output vectors
        :param dropout: dropout probability applied to the attention weights
        """
        super().__init__()

        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = torch.nn.Dropout(dropout)

        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)
        self.output_linear = torch.nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask):
        """
        query, key, value of shape: (batch_size, max_len, d_model)
        mask: broadcastable to (batch_size, h, max_len, max_len); positions
        where it equals 0 receive no attention
        """
        batch_size = query.shape[0]

        def split_heads(projected):
            # (batch_size, max_len, d_model) --> (batch_size, h, max_len, d_k)
            return projected.view(projected.shape[0], -1, self.heads, self.d_k).transpose(1, 2)

        q = split_heads(self.query(query))
        k = split_heads(self.key(key))
        v = split_heads(self.value(value))

        # scaled dot products: (batch_size, h, max_len, max_len)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)

        # a very negative score turns into a ~0 weight after the softmax
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.dropout(F.softmax(scores, dim=-1))

        # weighted sum of the values: (batch_size, h, max_len, d_k)
        context = torch.matmul(weights, v)

        # merge the heads again: (batch_size, max_len, d_model)
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.heads * self.d_k)
        return self.output_linear(merged)

class FeedForward(torch.nn.Module):
    "Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear."

    def __init__(self, d_model, middle_dim=2048, dropout=0.1):
        """
        :param d_model: width of the input and output vectors
        :param middle_dim: width of the hidden layer
        :param dropout: dropout probability applied to the hidden activations
        """
        super().__init__()

        self.fc1 = torch.nn.Linear(d_model, middle_dim)
        self.fc2 = torch.nn.Linear(middle_dim, d_model)
        self.dropout = torch.nn.Dropout(dropout)
        self.activation = torch.nn.GELU()

    def forward(self, x):
        hidden = self.fc1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

class EncoderLayer(torch.nn.Module):
    """One Transformer encoder block.

    Self-attention and the feed-forward network are each followed by dropout,
    a residual connection and layer normalization.
    """

    def __init__(
        self,
        d_model=768,
        heads=12,
        feed_forward_hidden=768 * 4,
        dropout=0.1
        ):
        """
        :param d_model: width of the token vectors
        :param heads: number of attention heads
        :param feed_forward_hidden: hidden width of the feed-forward network
        :param dropout: dropout probability used by the block and its sub-layers
        """
        super(EncoderLayer, self).__init__()
        # NOTE: this single LayerNorm (one set of scale/shift parameters) is
        # applied after both sub-layers; kept so the parameter layout is unchanged.
        self.layernorm = torch.nn.LayerNorm(d_model)
        # pass `dropout` on: without it the sub-layers silently stay at their
        # own default of 0.1 whatever value the caller asked for
        self.self_multihead = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = FeedForward(d_model, middle_dim=feed_forward_hidden, dropout=dropout)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        # embeddings: (batch_size, max_len, d_model)
        # mask: passed unchanged to the attention layer
        # result: (batch_size, max_len, d_model)
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        # residual connection around the attention, then normalization
        interacted = self.layernorm(interacted + embeddings)
        # feed-forward sub-layer with its own residual connection
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded

In [None]:
class BERT(torch.nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, d_model=768, n_layers=12, heads=12, dropout=0.1, seq_len=64):
        """
        :param vocab_size: vocab_size of total words
        :param d_model: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param heads: number of attention heads
        :param dropout: dropout rate, used by the embedding and every encoder block
        :param seq_len: number of positions the positional embedding is built for
        """

        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads

        # paper noted they used 4 * hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = d_model * 4

        # embedding for BERT, sum of positional, segment, token embeddings;
        # seq_len and dropout are forwarded instead of being left at the
        # embedding's own defaults
        self.embedding = BERTEmbedding(
            vocab_size=vocab_size, embed_size=d_model, seq_len=seq_len, dropout=dropout)

        # multi-layers transformer blocks, deep network
        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(d_model, heads, self.feed_forward_hidden, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info):
        """
        :param x: token ids, (batch_size, seq_len); id 0 is treated as padding
        :param segment_info: segment ids, same shape as x
        :return: encoded sequence, (batch_size, seq_len, d_model)
        """
        # attention masking for padded token: (batch_size, 1, 1, seq_len).
        # It broadcasts over heads and query positions inside the attention,
        # so the (seq_len x seq_len) copy per sample is not needed.
        mask = (x > 0).unsqueeze(1).unsqueeze(2)

        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(x, segment_info)

        # running over multiple transformer blocks; call the module itself
        # (not .forward) so that registered hooks run
        for encoder in self.encoder_blocks:
            x = encoder(x, mask)
        return x

class NextSentencePrediction(torch.nn.Module):
    """
    Two-class head (is_next / is_not_next) that returns log-probabilities
    computed from the first position of the encoded sequence.
    """

    def __init__(self, hidden):
        """
        :param hidden: width of the vectors produced by the BERT model
        """
        super().__init__()
        self.linear = torch.nn.Linear(hidden, 2)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # position 0 holds the [CLS] token
        cls_state = x[:, 0]
        logits = self.linear(cls_state)
        return self.softmax(logits)

class MaskedLanguageModel(torch.nn.Module):
    """
    Token-level head for the masked-word objective: maps every position to
    log-probabilities over the vocabulary (n-class problem, n = vocab_size).
    """

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: width of the vectors produced by the BERT model
        :param vocab_size: number of output classes
        """
        super().__init__()
        self.linear = torch.nn.Linear(hidden, vocab_size)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        vocab_logits = self.linear(x)
        return self.softmax(vocab_logits)

class BERTLM(torch.nn.Module):
    """
    BERT with both pre-training heads attached:
    next sentence prediction and masked language model.
    """

    def __init__(self, bert: BERT, vocab_size):
        """
        :param bert: BERT encoder to train; its d_model sets the input width of both heads
        :param vocab_size: number of classes of the masked language model head
        """

        super().__init__()
        self.bert = bert
        hidden = self.bert.d_model
        self.next_sentence = NextSentencePrediction(hidden)
        self.mask_lm = MaskedLanguageModel(hidden, vocab_size)

    def forward(self, x, segment_label):
        # encode once, then feed the same encoding to both heads
        encoded = self.bert(x, segment_label)
        next_sentence_scores = self.next_sentence(encoded)
        masked_lm_scores = self.mask_lm(encoded)
        return next_sentence_scores, masked_lm_scores

In [None]:
class ScheduledOptim:
    '''Wraps an optimizer and sets its learning rate before every step.

    lr = d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)
    '''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Set the learning rate for the new step, then step the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Clear the gradients held by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        '''Scale factor for the current step: the smaller of the decay and warm-up terms'''
        step = self.n_current_steps
        decay = np.power(step, -0.5)
        warmup = np.power(self.n_warmup_steps, -1.5) * step
        return np.min([decay, warmup])

    def _update_learning_rate(self):
        ''' Advance the step counter and write the new rate into every param group '''
        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr

In [None]:
class BERTTrainer:
    """Runs training and evaluation epochs for a BERT language model.

    Every batch is a dict with the tensors "bert_input", "segment_label",
    "bert_label" and "is_next". The optimized loss is the sum of the
    next-sentence-prediction loss and the masked-token loss.
    """

    def __init__(
        self,
        model,
        train_dataloader,
        test_dataloader=None,
        lr= 1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=10000,
        log_freq=10,
        device='cuda'
        ):
        """
        :param model: module returning (next_sentence_log_probs, masked_lm_log_probs)
            and exposing `bert.d_model`
        :param train_dataloader: batches used by `train`
        :param test_dataloader: batches used by `test`
        :param lr: initial Adam learning rate; the warm-up schedule overwrites it
            on every step
        :param weight_decay: Adam weight decay
        :param betas: Adam betas
        :param warmup_steps: number of warm-up steps of the schedule
        :param log_freq: a log line is written every `log_freq` batches
        :param device: device the model and every batch are moved to
        """

        self.device = device
        # the batches are moved to `device` in `iteration`, so the parameters
        # have to live there as well
        self.model = model.to(device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, self.model.bert.d_model, n_warmup_steps=warmup_steps
            )

        # Masked-token loss: label 0 marks positions without a prediction
        # target, so it is ignored.
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        # Next-sentence loss: class 0 ("not next") is a real label. Sharing the
        # criterion above would drop every negative pair from the loss.
        self.next_criterion = torch.nn.NLLLoss()
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        """Run one optimization pass over the training data."""
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        """Run one evaluation pass over the test data (no parameter updates)."""
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """Loop once over `data_loader`, logging running loss and NSP accuracy.

        :param epoch: epoch number, used for logging only
        :param data_loader: iterable of batch dicts; must support len()
        :param train: when True the parameters are updated after every batch
        """

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        mode = "train" if train else "test"

        # dropout is active only while training
        self.model.train(train)

        # progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        # no autograd graph is built when only evaluating
        with torch.set_grad_enabled(train):
            for i, data in data_iter:

                # 0. batch_data will be sent into the device(GPU or cpu)
                data = {key: value.to(self.device) for key, value in data.items()}

                # 1. forward the next_sentence_prediction and masked_lm model
                next_sent_output, mask_lm_output = self.model(data["bert_input"], data["segment_label"])

                # 2-1. NLL(negative log likelihood) loss of is_next classification result
                next_loss = self.next_criterion(next_sent_output, data["is_next"])

                # 2-2. NLLLoss of predicting masked token word
                # transpose to (m, vocab_size, seq_len) vs (m, seq_len)
                mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

                # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
                loss = next_loss + mask_loss

                # 3. backward and optimization only in train
                if train:
                    self.optim_schedule.zero_grad()
                    loss.backward()
                    self.optim_schedule.step_and_update_lr()

                # next sentence prediction accuracy
                correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
                avg_loss += loss.item()
                total_correct += correct
                total_element += data["is_next"].nelement()

                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "avg_loss": avg_loss / (i + 1),
                    "avg_acc": total_correct / total_element * 100,
                    "loss": loss.item()
                }

                if i % self.log_freq == 0:
                    data_iter.write(str(post_fix))
        print(
            f"EP{epoch}, {mode}: \
            avg_loss={avg_loss / len(data_iter)}, \
            total_acc={total_correct * 100.0 / total_element}"
        )

In [None]:
'''test run'''

# dataset and shuffled, batched loader over the sentence pairs
train_data = BERTDataset(pairs, tokenizer=tokenizer, seq_len=MAX_LEN)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, pin_memory=True)

# small two-layer encoder with both pre-training heads on top
bert_model = BERT(
    vocab_size=len(tokenizer.vocab),
    d_model=768,
    n_layers=2,
    heads=12,
    dropout=0.1,
)
bert_lm = BERTLM(bert_model, len(tokenizer.vocab))

# train on the CPU for a fixed number of epochs
bert_trainer = BERTTrainer(bert_lm, train_loader, device='cpu')
epochs = 20

for epoch in range(epochs):
    bert_trainer.train(epoch)

Total Parameters: 46699434


EP_train:0:   0%|| 1/6926 [00:11<22:31:59, 11.71s/it]

{'epoch': 0, 'iter': 0, 'avg_loss': 10.911469459533691, 'avg_acc': 56.25, 'loss': 10.911469459533691}


EP_train:0:   0%|| 11/6926 [01:05<10:13:09,  5.32s/it]

{'epoch': 0, 'iter': 10, 'avg_loss': 10.816637125882236, 'avg_acc': 51.98863636363637, 'loss': 10.71433162689209}


EP_train:0:   0%|| 21/6926 [01:57<10:03:55,  5.25s/it]

{'epoch': 0, 'iter': 20, 'avg_loss': 10.78262674240839, 'avg_acc': 51.19047619047619, 'loss': 10.68201732635498}


EP_train:0:   0%|| 31/6926 [02:50<10:00:56,  5.23s/it]

{'epoch': 0, 'iter': 30, 'avg_loss': 10.729352243484989, 'avg_acc': 51.20967741935484, 'loss': 10.538477897644043}


EP_train:0:   1%|| 41/6926 [03:43<9:56:13,  5.20s/it]

{'epoch': 0, 'iter': 40, 'avg_loss': 10.669340877998167, 'avg_acc': 52.0579268292683, 'loss': 10.434049606323242}


EP_train:0:   1%|| 51/6926 [04:37<10:01:37,  5.25s/it]

{'epoch': 0, 'iter': 50, 'avg_loss': 10.604647599014582, 'avg_acc': 51.28676470588235, 'loss': 10.317838668823242}


EP_train:0:   1%|| 61/6926 [05:31<10:35:37,  5.56s/it]

{'epoch': 0, 'iter': 60, 'avg_loss': 10.545363082260382, 'avg_acc': 51.63934426229508, 'loss': 10.150988578796387}


EP_train:0:   1%|| 71/6926 [06:25<9:58:43,  5.24s/it] 

{'epoch': 0, 'iter': 70, 'avg_loss': 10.490524493472677, 'avg_acc': 50.96830985915493, 'loss': 10.086946487426758}


EP_train:0:   1%|| 81/6926 [07:19<10:23:30,  5.47s/it]

{'epoch': 0, 'iter': 80, 'avg_loss': 10.440557833071109, 'avg_acc': 51.23456790123457, 'loss': 10.0697021484375}


EP_train:0:   1%|| 91/6926 [08:12<9:54:27,  5.22s/it] 

{'epoch': 0, 'iter': 90, 'avg_loss': 10.390255225883735, 'avg_acc': 50.755494505494504, 'loss': 9.871966361999512}


EP_train:0:   1%|| 101/6926 [09:06<10:12:57,  5.39s/it]

{'epoch': 0, 'iter': 100, 'avg_loss': 10.34093991836699, 'avg_acc': 50.99009900990099, 'loss': 9.806042671203613}


EP_train:0:   2%|| 111/6926 [10:03<11:24:36,  6.03s/it]

{'epoch': 0, 'iter': 110, 'avg_loss': 10.292244060619458, 'avg_acc': 50.929054054054056, 'loss': 9.830770492553711}


EP_train:0:   2%|| 121/6926 [10:56<9:59:46,  5.29s/it] 

{'epoch': 0, 'iter': 120, 'avg_loss': 10.237472077046544, 'avg_acc': 50.981404958677686, 'loss': 9.56406307220459}


EP_train:0:   2%|| 131/6926 [11:51<10:32:22,  5.58s/it]

{'epoch': 0, 'iter': 130, 'avg_loss': 10.182017319075024, 'avg_acc': 51.14503816793893, 'loss': 9.40656566619873}


EP_train:0:   2%|| 141/6926 [12:44<9:56:47,  5.28s/it] 

{'epoch': 0, 'iter': 140, 'avg_loss': 10.12651527350676, 'avg_acc': 51.10815602836879, 'loss': 9.193609237670898}


EP_train:0:   2%|| 151/6926 [13:38<10:12:33,  5.42s/it]

{'epoch': 0, 'iter': 150, 'avg_loss': 10.065764787181324, 'avg_acc': 50.95198675496688, 'loss': 9.024138450622559}


EP_train:0:   2%|| 161/6926 [14:31<9:40:18,  5.15s/it]

{'epoch': 0, 'iter': 160, 'avg_loss': 10.004589637614185, 'avg_acc': 51.08695652173913, 'loss': 9.243189811706543}


EP_train:0:   2%|| 171/6926 [15:25<10:11:38,  5.43s/it]

{'epoch': 0, 'iter': 170, 'avg_loss': 9.944770857604624, 'avg_acc': 51.27923976608187, 'loss': 9.09669303894043}


EP_train:0:   3%|| 181/6926 [16:20<10:10:26,  5.43s/it]

{'epoch': 0, 'iter': 180, 'avg_loss': 9.88215657228923, 'avg_acc': 51.07044198895028, 'loss': 8.827354431152344}


EP_train:0:   3%|| 191/6926 [17:14<10:10:16,  5.44s/it]

{'epoch': 0, 'iter': 190, 'avg_loss': 9.819761181376991, 'avg_acc': 51.014397905759154, 'loss': 8.287186622619629}


EP_train:0:   3%|| 201/6926 [18:07<9:49:31,  5.26s/it]

{'epoch': 0, 'iter': 200, 'avg_loss': 9.760896744419686, 'avg_acc': 50.82400497512438, 'loss': 8.625864028930664}


EP_train:0:   3%|| 211/6926 [19:01<9:57:36,  5.34s/it] 

{'epoch': 0, 'iter': 210, 'avg_loss': 9.695868469527547, 'avg_acc': 50.622037914691944, 'loss': 8.24477481842041}


EP_train:0:   3%|| 221/6926 [19:54<9:50:10,  5.28s/it]

{'epoch': 0, 'iter': 220, 'avg_loss': 9.638503048754385, 'avg_acc': 50.77771493212669, 'loss': 8.20501708984375}


EP_train:0:   3%|| 231/6926 [20:48<9:51:23,  5.30s/it] 

{'epoch': 0, 'iter': 230, 'avg_loss': 9.583613032386417, 'avg_acc': 50.879329004329, 'loss': 8.420318603515625}


EP_train:0:   3%|| 241/6926 [21:40<9:55:13,  5.34s/it]

{'epoch': 0, 'iter': 240, 'avg_loss': 9.525782866102036, 'avg_acc': 50.71317427385892, 'loss': 8.227885246276855}


EP_train:0:   4%|| 251/6926 [22:36<10:03:44,  5.43s/it]

{'epoch': 0, 'iter': 250, 'avg_loss': 9.464566794999568, 'avg_acc': 50.74701195219124, 'loss': 7.548211097717285}


EP_train:0:   4%|| 261/6926 [23:30<10:20:46,  5.59s/it]

{'epoch': 0, 'iter': 260, 'avg_loss': 9.410918275971978, 'avg_acc': 50.92193486590039, 'loss': 7.743703365325928}


EP_train:0:   4%|| 271/6926 [24:26<10:03:08,  5.44s/it]

{'epoch': 0, 'iter': 270, 'avg_loss': 9.357718638388434, 'avg_acc': 50.89944649446494, 'loss': 7.604986190795898}


EP_train:0:   4%|| 281/6926 [25:21<10:06:00,  5.47s/it]

{'epoch': 0, 'iter': 280, 'avg_loss': 9.311280510179513, 'avg_acc': 51.06761565836299, 'loss': 7.713982582092285}


EP_train:0:   4%|| 291/6926 [26:15<9:58:46,  5.41s/it]

{'epoch': 0, 'iter': 290, 'avg_loss': 9.266376523217794, 'avg_acc': 50.998711340206185, 'loss': 8.136502265930176}


EP_train:0:   4%|| 301/6926 [27:10<9:56:25,  5.40s/it] 

{'epoch': 0, 'iter': 300, 'avg_loss': 9.221592515013938, 'avg_acc': 51.0797342192691, 'loss': 7.937389373779297}


EP_train:0:   4%|| 311/6926 [28:05<10:28:37,  5.70s/it]

{'epoch': 0, 'iter': 310, 'avg_loss': 9.173889127958242, 'avg_acc': 50.86414790996785, 'loss': 7.576677322387695}


EP_train:0:   5%|| 321/6926 [29:00<9:37:44,  5.25s/it]

{'epoch': 0, 'iter': 320, 'avg_loss': 9.127501514470465, 'avg_acc': 50.866433021806856, 'loss': 7.559384346008301}


EP_train:0:   5%|| 331/6926 [29:57<10:58:35,  5.99s/it]

{'epoch': 0, 'iter': 330, 'avg_loss': 9.080808919001923, 'avg_acc': 50.698640483383684, 'loss': 7.584811210632324}


EP_train:0:   5%|| 341/6926 [30:52<9:52:44,  5.40s/it] 

{'epoch': 0, 'iter': 340, 'avg_loss': 9.037170626900412, 'avg_acc': 50.78812316715543, 'loss': 7.140135288238525}


EP_train:0:   5%|| 351/6926 [31:47<9:52:41,  5.41s/it] 

{'epoch': 0, 'iter': 350, 'avg_loss': 8.994889495719192, 'avg_acc': 50.79237891737892, 'loss': 7.248659610748291}


EP_train:0:   5%|| 361/6926 [32:43<10:15:18,  5.62s/it]

{'epoch': 0, 'iter': 360, 'avg_loss': 8.957462071711998, 'avg_acc': 50.82236842105263, 'loss': 7.508889198303223}


EP_train:0:   5%|| 371/6926 [33:34<9:20:55,  5.13s/it]

{'epoch': 0, 'iter': 370, 'avg_loss': 8.916847146746283, 'avg_acc': 50.81704851752021, 'loss': 7.579354286193848}


EP_train:0:   6%|| 381/6926 [34:27<9:48:06,  5.39s/it]

{'epoch': 0, 'iter': 380, 'avg_loss': 8.878703803215126, 'avg_acc': 50.9022309711286, 'loss': 7.514769077301025}


EP_train:0:   6%|| 391/6926 [35:20<9:27:44,  5.21s/it]

{'epoch': 0, 'iter': 390, 'avg_loss': 8.839476053672069, 'avg_acc': 50.97506393861892, 'loss': 7.144907474517822}


EP_train:0:   6%|| 401/6926 [36:15<9:58:08,  5.50s/it] 

{'epoch': 0, 'iter': 400, 'avg_loss': 8.803268063989957, 'avg_acc': 50.935162094763086, 'loss': 7.673941135406494}


EP_train:0:   6%|| 411/6926 [37:08<9:32:25,  5.27s/it]

{'epoch': 0, 'iter': 410, 'avg_loss': 8.767126598497377, 'avg_acc': 50.88199513381995, 'loss': 7.32290506362915}


EP_train:0:   6%|| 421/6926 [38:01<9:26:46,  5.23s/it]

{'epoch': 0, 'iter': 420, 'avg_loss': 8.727823164570642, 'avg_acc': 50.93527315914489, 'loss': 6.71901273727417}


EP_train:0:   6%|| 431/6926 [38:54<9:28:49,  5.25s/it]

{'epoch': 0, 'iter': 430, 'avg_loss': 8.690767454160616, 'avg_acc': 51.09483758700696, 'loss': 7.224283218383789}


EP_train:0:   6%|| 441/6926 [39:46<9:23:57,  5.22s/it]

{'epoch': 0, 'iter': 440, 'avg_loss': 8.655209274248742, 'avg_acc': 51.16213151927438, 'loss': 7.236321449279785}


EP_train:0:   7%|| 451/6926 [40:39<9:41:05,  5.38s/it]

{'epoch': 0, 'iter': 450, 'avg_loss': 8.621393580130093, 'avg_acc': 51.15022172949002, 'loss': 7.106378078460693}


EP_train:0:   7%|| 461/6926 [41:32<9:23:42,  5.23s/it]

{'epoch': 0, 'iter': 460, 'avg_loss': 8.58861198963157, 'avg_acc': 51.05070498915401, 'loss': 7.180912494659424}


EP_train:0:   7%|| 471/6926 [42:25<9:40:47,  5.40s/it]

{'epoch': 0, 'iter': 470, 'avg_loss': 8.554730396108768, 'avg_acc': 51.08147558386412, 'loss': 7.176949977874756}


EP_train:0:   7%|| 481/6926 [43:16<9:14:07,  5.16s/it]

{'epoch': 0, 'iter': 480, 'avg_loss': 8.524636401456013, 'avg_acc': 51.01351351351351, 'loss': 7.1140007972717285}


EP_train:0:   7%|| 491/6926 [44:09<9:31:14,  5.33s/it]

{'epoch': 0, 'iter': 490, 'avg_loss': 8.493296294979555, 'avg_acc': 51.01196537678207, 'loss': 7.628721714019775}


EP_train:0:   7%|| 501/6926 [45:01<9:18:41,  5.22s/it]

{'epoch': 0, 'iter': 500, 'avg_loss': 8.459842394449991, 'avg_acc': 51.03542914171657, 'loss': 7.131675720214844}


EP_train:0:   7%|| 511/6926 [45:54<9:34:58,  5.38s/it]

{'epoch': 0, 'iter': 510, 'avg_loss': 8.429969276234129, 'avg_acc': 50.978473581213315, 'loss': 7.116626262664795}


EP_train:0:   8%|| 521/6926 [46:47<9:14:35,  5.20s/it]

{'epoch': 0, 'iter': 520, 'avg_loss': 8.39781265844539, 'avg_acc': 50.8697216890595, 'loss': 6.542564868927002}


EP_train:0:   8%|| 531/6926 [47:42<10:12:41,  5.75s/it]

{'epoch': 0, 'iter': 530, 'avg_loss': 8.366487942172983, 'avg_acc': 50.80037664783428, 'loss': 6.740990161895752}


EP_train:0:   8%|| 541/6926 [48:34<9:17:48,  5.24s/it]

{'epoch': 0, 'iter': 540, 'avg_loss': 8.33860232252731, 'avg_acc': 50.947319778188536, 'loss': 6.855205535888672}


EP_train:0:   8%|| 551/6926 [49:27<9:42:31,  5.48s/it]

{'epoch': 0, 'iter': 550, 'avg_loss': 8.309950023727279, 'avg_acc': 50.9641560798548, 'loss': 6.8635969161987305}


EP_train:0:   8%|| 561/6926 [50:19<9:04:06,  5.13s/it]

{'epoch': 0, 'iter': 560, 'avg_loss': 8.2812789961191, 'avg_acc': 50.991532976827095, 'loss': 6.333866596221924}


EP_train:0:   8%|| 571/6926 [51:12<9:39:09,  5.47s/it]

{'epoch': 0, 'iter': 570, 'avg_loss': 8.25473693206306, 'avg_acc': 50.97964098073555, 'loss': 6.908053874969482}


EP_train:0:   8%|| 581/6926 [52:07<10:08:03,  5.75s/it]

{'epoch': 0, 'iter': 580, 'avg_loss': 8.228953288467329, 'avg_acc': 51.00043029259896, 'loss': 6.338192462921143}


EP_train:0:   9%|| 591/6926 [53:03<9:49:36,  5.58s/it]

{'epoch': 0, 'iter': 590, 'avg_loss': 8.201602334701874, 'avg_acc': 51.02580372250423, 'loss': 6.377570629119873}


EP_train:0:   9%|| 601/6926 [53:59<9:43:16,  5.53s/it] 

{'epoch': 0, 'iter': 600, 'avg_loss': 8.176689946909315, 'avg_acc': 50.972337770382694, 'loss': 7.002769470214844}


EP_train:0:   9%|| 611/6926 [54:55<9:40:15,  5.51s/it] 

{'epoch': 0, 'iter': 610, 'avg_loss': 8.150368168389154, 'avg_acc': 50.99222585924713, 'loss': 6.239799976348877}


EP_train:0:   9%|| 620/6926 [55:44<9:25:07,  5.38s/it]