In [1]:
#!pip install transformers

In [2]:
import os
import copy
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import transformers
from transformers import (BertTokenizer, BertModel,
                          DistilBertTokenizer, DistilBertModel)

In [3]:
to_eng = { "wanita": "woman", "anak": "child", "bayi": "baby", "tas": "bag", "masker": "face mask", "pria": "men", "murah": "cheap", "tangan": "hand", "alat": "tool", "motif": "motive", "warna": "color", "bahan": "material", "celana": "pants", "baju": "clothes", "kaos": "t-shirt", "sepatu": "shoes", "rambut": "hair", "mainan": "toy", "sarung": "holster", "polos": "plain", "rak": "rack", "botol": "bottle", "sabun": "soap", "kain": "fabric", "panjang": "long", "kabel": "cable", "buku": "book", "plastik": "plastic", "mobil": "car", "hitam": "black", "karakter": "character", "putih": "white", "dompet": "purse", "kaki": "feet", "pembersih": "cleaners", "lipat": "folding", "silikon": "silicone", "minyak": "oil", "isi": "contents", "paket": "package", "susu": "milk", "gamis": "robe", "mandi": "bath", "madu": "honey", "kulit": "skin", "serbaguna": "multipurpose", "bisa": "can", "kacamata": "spectacles", "pendek": "short", "tali": "rope", "selempang": "sash", "topi": "hat", "obat": "drug", "gantungan": "hanger", "tahun": "year", "jilbab": "hijab", "dapur": "kitchen", "dinding": "wall", "kuas": "brush", "perempuan": "woman", "katun": "cotton", "sepeda": "bike", "lucu": "funny", "lengan": "arm", "kaca": "glass", "garansi": "warranty", "bunga": "flower", "handuk": "towel", "dewasa": "adult", "elektrik": "electric", "timbangan": "balance", "besar": "big", "bahan": "ingredient", "ransel": "backpack", "kertas": "paper"}
to_ind = {v: k for k, v in to_eng.items()}
to_ind_reg = {r'(\b){}(\b)'.format(k):r'\1{}\2'.format(v) for k,v in to_ind.items()}

In [4]:
train = pd.read_csv("folds.csv")
train['title'] = train.title.apply(lambda x: x.lower().replace('\\', ' '))
train['title'] = train['title'].replace(to_ind_reg, regex=True)
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,kertas tas victoria secret,249114794,0
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"double tape 3m vhb 12 mm x 4,5 m original / do...",2937985045,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,maling tts canned pork luncheon meat 397 gr,2395904891,0
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,daster batik lengan pendek - motif acak / camp...,4093212188,1
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,nescafe xc3 x89clair latte 220ml,3648931069,3


In [5]:
os.environ['CUDA_VISIBLE_DEVICES'] = '6, 7'

In [6]:
class CFG:
    DistilBERT = True # if set to False, BERT model will be used
    bert_hidden_size = 768
    
    batch_size = 512
    epochs = 100
    num_workers = 4
    learning_rate = 3e-5 #3e-5
    scheduler = "ReduceLROnPlateau"
    step = 'epoch'
    patience = 2
    factor = 0.8
    dropout = 0.5
    model_path = "./"
    max_length = 60
    model_save_name = "distil_bert.pt"
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [7]:
if CFG.DistilBERT:
    model_name='cahya/distilbert-base-indonesian'
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    bert_model = DistilBertModel.from_pretrained(model_name)
else:
    model_name='cahya/bert-base-indonesian-522M'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    bert_model = BertModel.from_pretrained(model_name)

In [8]:
text = train['title'].values[np.random.randint(0, len(train) - 1, 1)[0]]
print(f"Text of the title: {text}")
encoded_input = tokenizer(text, return_tensors='pt')
print(f"Input tokens: {encoded_input['input_ids']}")
decoded_input = tokenizer.decode(encoded_input['input_ids'][0])
print(f"Decoded tokens: {decoded_input}")
output = bert_model(**encoded_input)
print(f"last layer's output shape: {output.last_hidden_state.shape}")

Text of the title: kurma ajwa medina super/medina dates super (5kg)
Input tokens: tensor([[    3, 27294,  3044,  1627, 14795,  1990,  3830,    18, 14795,  1990,
         25288,  1018,  3830,    11,    24,  1028,  1014,    12,     1]])
Decoded tokens: [CLS] kurma ajwa medina super / medina dates super ( 5kg ) [SEP]
last layer's output shape: torch.Size([1, 19, 768])


In [9]:
lbl_encoder = LabelEncoder()
train['label_code'] = lbl_encoder.fit_transform(train['label_group'])
NUM_CLASSES = train['label_code'].nunique()

In [10]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.dataframe = dataframe
        if mode != "test":
            self.targets = dataframe['label_code'].values
        texts = list(dataframe['title'].apply(lambda o: str(o)).values)
        self.encodings = tokenizer(texts, 
                                   padding=True, 
                                   truncation=True, 
                                   max_length=max_length)
        self.mode = mode
        
        
        
    def __len__(self):
        return len(self.dataframe)
        
    def __getitem__(self, idx):
        # putting each tensor in front of the corresponding key from the tokenizer
        # HuggingFace tokenizers give you whatever you need to feed to the corresponding model
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # when testing, there are no targets so we won't do the following
        if self.mode != "test":
            item['labels'] = torch.tensor(self.targets[idx]).long()
        return item

In [11]:
class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance: :
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: norm of input feature
            m: margin
            cos(theta + m)
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device=CFG.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s
        # print(output)

        return output

In [12]:
class BertModel(nn.Module):
    def __init__(self, 
                 bert_model, 
                 num_classes=NUM_CLASSES, 
                 last_hidden_size=CFG.bert_hidden_size):
        
        super().__init__()
        self.bert_model = bert_model
        self.arc_margin = ArcMarginProduct(last_hidden_size, 
                                           num_classes, 
                                           s=30.0, 
                                           m=0.50, 
                                           easy_margin=False)
    
    def get_bert_features(self, batch):
        output = self.bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        last_hidden_state = output.last_hidden_state # shape: (batch_size, seq_length, bert_hidden_dim)
        CLS_token_state = last_hidden_state[:, 0, :] # obtaining CLS token state which is the first token.
        return CLS_token_state
    
    def forward(self, batch):
        CLS_hidden_state = self.get_bert_features(batch)
        output = self.arc_margin(CLS_hidden_state, batch['labels'])
        return output

In [13]:
class AvgMeter:
    def __init__(self, name="Metric"):
        self.name = name
        self.reset()
    
    def reset(self):
        self.avg, self.sum, self.count = [0]*3
    
    def update(self, val, count=1):
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count
    
    def __repr__(self):
        text = f"{self.name}: {self.avg:.4f}"
        return text

def one_epoch(model, 
              criterion, 
              loader,
              optimizer=None, 
              lr_scheduler=None, 
              mode="train", 
              step="batch"):
    
    loss_meter = AvgMeter()
    acc_meter = AvgMeter()
    
    tqdm_object = tqdm(loader, total=len(loader))
    for batch in tqdm_object:
        batch = {k: v.to(CFG.device) for k, v in batch.items()}
        preds = model(batch)
        loss = criterion(preds, batch['labels'])
        if mode == "train":
            optimizer.zero_grad()
            loss.mean().backward()
            optimizer.step()
            if step == "batch":
                lr_scheduler.step()
                
        count = batch['input_ids'].size(0)
        loss_meter.update(loss.item(), count)
        
        accuracy = get_accuracy(preds.detach(), batch['labels'])
        acc_meter.update(accuracy.item(), count)
        if mode == "train":
            tqdm_object.set_postfix(train_loss=loss_meter.avg, accuracy=acc_meter.avg, lr=get_lr(optimizer))
        else:
            tqdm_object.set_postfix(valid_loss=loss_meter.avg, accuracy=acc_meter.avg)
    
    return loss_meter, acc_meter

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group["lr"]

def get_accuracy(preds, targets):
    """
    preds shape: (batch_size, num_labels)
    targets shape: (batch_size)
    """
    preds = preds.argmax(dim=1)
    acc = (preds == targets).float().mean()
    return acc

In [14]:
def train_eval(epochs, model, train_loader, valid_loader, 
               criterion, optimizer, lr_scheduler=None):
    
    best_loss = float('inf')
    best_model_weights = copy.deepcopy(model.state_dict())
    
    for epoch in range(epochs):
        print("*" * 30)
        print(f"Epoch {epoch + 1}")
        current_lr = get_lr(optimizer)
        
        model.train()
        train_loss, train_acc = one_epoch(model, 
                                          criterion, 
                                          train_loader, 
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          mode="train",
                                          step=CFG.step)                     
        model.eval()
        with torch.no_grad():
            valid_loss, valid_acc = one_epoch(model, 
                                              criterion, 
                                              valid_loader, 
                                              optimizer=None,
                                              lr_scheduler=None,
                                              mode="valid")
        
        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            best_model_weights = copy.deepcopy(model.state_dict())
            torch.save(model.module.state_dict(), f'{CFG.model_path}/{CFG.model_save_name}')
            print("Saved best model!")
        
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(valid_loss.avg)
            if current_lr != get_lr(optimizer):
                print("Loading best model weights!")
                model.load_state_dict(torch.load(f'{CFG.model_path}/{CFG.model_save_name}')
                model = model.to(CFG.device)
        
        print("*" * 30)

In [15]:
# train_df = train[train.fold != 0]
train_df = train
valid_df = train[train.fold == 0]

train_dataset = TextDataset(train_df, tokenizer, max_length=CFG.max_length)
train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=CFG.batch_size, 
                                           num_workers=CFG.num_workers, 
                                           shuffle=True)

valid_dataset = TextDataset(valid_df, tokenizer, max_length=CFG.max_length)
valid_loader = torch.utils.data.DataLoader(valid_dataset, 
                                           batch_size=CFG.batch_size, 
                                           num_workers=CFG.num_workers, 
                                           shuffle=False)

In [16]:
model = torch.nn.DataParallel(BertModel(bert_model).to(CFG.device))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=CFG.learning_rate)
if CFG.scheduler == "ReduceLROnPlateau":
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                              mode="min", 
                                                              factor=CFG.factor, 
                                                              patience=CFG.patience)

train_eval(CFG.epochs, model, train_loader, valid_loader,
           criterion, optimizer, lr_scheduler=lr_scheduler)

  0%|          | 0/67 [00:00<?, ?it/s]

******************************
Epoch 1


100%|██████████| 67/67 [00:36<00:00,  1.86it/s, accuracy=0, lr=3e-5, train_loss=23.9]
100%|██████████| 14/14 [00:02<00:00,  5.07it/s, accuracy=0, valid_loss=23.1]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 2


100%|██████████| 67/67 [00:32<00:00,  2.05it/s, accuracy=0, lr=3e-5, train_loss=22.7]
100%|██████████| 14/14 [00:02<00:00,  5.04it/s, accuracy=0.000438, valid_loss=21.5]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 3


100%|██████████| 67/67 [00:32<00:00,  2.06it/s, accuracy=0.00269, lr=3e-5, train_loss=21.3] 
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.00701, valid_loss=20]  
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 4


100%|██████████| 67/67 [00:32<00:00,  2.05it/s, accuracy=0.00917, lr=3e-5, train_loss=20]  
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.0185, valid_loss=18.7]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 5


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.0165, lr=3e-5, train_loss=19]  
100%|██████████| 14/14 [00:02<00:00,  5.04it/s, accuracy=0.0296, valid_loss=17.7]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 6


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.0253, lr=3e-5, train_loss=18.1]
100%|██████████| 14/14 [00:02<00:00,  5.04it/s, accuracy=0.0393, valid_loss=16.7]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 7


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.0332, lr=3e-5, train_loss=17.2]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.0485, valid_loss=15.8]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 8


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.0412, lr=3e-5, train_loss=16.4]
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.0607, valid_loss=14.9]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 9


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.049, lr=3e-5, train_loss=15.7] 
100%|██████████| 14/14 [00:02<00:00,  5.00it/s, accuracy=0.0721, valid_loss=14.1]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 10


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.0573, lr=3e-5, train_loss=15]  
100%|██████████| 14/14 [00:02<00:00,  4.99it/s, accuracy=0.0857, valid_loss=13.4]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 11


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.0652, lr=3e-5, train_loss=14.4]
100%|██████████| 14/14 [00:02<00:00,  4.79it/s, accuracy=0.0975, valid_loss=12.7]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 12


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.0746, lr=3e-5, train_loss=13.8]
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.113, valid_loss=12.1]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 13


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.0846, lr=3e-5, train_loss=13.2]
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.127, valid_loss=11.4]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 14


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.0953, lr=3e-5, train_loss=12.6]
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.142, valid_loss=10.8]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 15


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.107, lr=3e-5, train_loss=12.1]
100%|██████████| 14/14 [00:02<00:00,  4.91it/s, accuracy=0.16, valid_loss=10.3] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 16


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.119, lr=3e-5, train_loss=11.6]
100%|██████████| 14/14 [00:02<00:00,  4.93it/s, accuracy=0.175, valid_loss=9.72]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 17


100%|██████████| 67/67 [00:33<00:00,  2.03it/s, accuracy=0.131, lr=3e-5, train_loss=11.1]
100%|██████████| 14/14 [00:02<00:00,  4.77it/s, accuracy=0.192, valid_loss=9.2] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 18


100%|██████████| 67/67 [00:33<00:00,  2.03it/s, accuracy=0.143, lr=3e-5, train_loss=10.6]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.209, valid_loss=8.7] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 19


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.157, lr=3e-5, train_loss=10.1]
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.229, valid_loss=8.2] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 20


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.169, lr=3e-5, train_loss=9.7] 
100%|██████████| 14/14 [00:02<00:00,  5.00it/s, accuracy=0.247, valid_loss=7.75]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 21


100%|██████████| 67/67 [00:33<00:00,  2.03it/s, accuracy=0.183, lr=3e-5, train_loss=9.27]
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.264, valid_loss=7.29]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 22


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.196, lr=3e-5, train_loss=8.87]
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.284, valid_loss=6.88]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 23


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.212, lr=3e-5, train_loss=8.47]
100%|██████████| 14/14 [00:02<00:00,  4.99it/s, accuracy=0.306, valid_loss=6.45]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 24


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.226, lr=3e-5, train_loss=8.08]
100%|██████████| 14/14 [00:02<00:00,  5.00it/s, accuracy=0.329, valid_loss=6.07]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 25


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.241, lr=3e-5, train_loss=7.71]
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.352, valid_loss=5.69]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 26


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.258, lr=3e-5, train_loss=7.34]
100%|██████████| 14/14 [00:02<00:00,  4.93it/s, accuracy=0.374, valid_loss=5.32]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 27


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.272, lr=3e-5, train_loss=7]   
100%|██████████| 14/14 [00:02<00:00,  5.00it/s, accuracy=0.402, valid_loss=4.98]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 28


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.29, lr=3e-5, train_loss=6.66] 
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.43, valid_loss=4.64] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 29


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.308, lr=3e-5, train_loss=6.33]
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.456, valid_loss=4.32]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 30


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.325, lr=3e-5, train_loss=6.02]
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.479, valid_loss=4.01]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 31


100%|██████████| 67/67 [00:33<00:00,  2.03it/s, accuracy=0.346, lr=3e-5, train_loss=5.71]
100%|██████████| 14/14 [00:02<00:00,  4.88it/s, accuracy=0.512, valid_loss=3.71]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 32


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.367, lr=3e-5, train_loss=5.41]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.541, valid_loss=3.44]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 33


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.388, lr=3e-5, train_loss=5.13]
100%|██████████| 14/14 [00:02<00:00,  5.06it/s, accuracy=0.57, valid_loss=3.18] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 34


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.413, lr=3e-5, train_loss=4.86]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.597, valid_loss=2.93]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 35


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.433, lr=3e-5, train_loss=4.6] 
100%|██████████| 14/14 [00:02<00:00,  5.04it/s, accuracy=0.623, valid_loss=2.7] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 36


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.458, lr=3e-5, train_loss=4.34]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.646, valid_loss=2.48]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 37


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.483, lr=3e-5, train_loss=4.1] 
100%|██████████| 14/14 [00:02<00:00,  4.77it/s, accuracy=0.67, valid_loss=2.28] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 38


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.506, lr=3e-5, train_loss=3.86]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.696, valid_loss=2.09]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 39


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.528, lr=3e-5, train_loss=3.64]
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.717, valid_loss=1.92]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 40


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.552, lr=3e-5, train_loss=3.43]
100%|██████████| 14/14 [00:02<00:00,  4.78it/s, accuracy=0.743, valid_loss=1.75]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 41


100%|██████████| 67/67 [00:33<00:00,  2.02it/s, accuracy=0.575, lr=3e-5, train_loss=3.23]
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.764, valid_loss=1.61]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 42


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.596, lr=3e-5, train_loss=3.03]
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.783, valid_loss=1.46]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 43


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.618, lr=3e-5, train_loss=2.85]
100%|██████████| 14/14 [00:02<00:00,  4.81it/s, accuracy=0.802, valid_loss=1.33]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 44


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.639, lr=3e-5, train_loss=2.67]
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.821, valid_loss=1.21]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 45


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.661, lr=3e-5, train_loss=2.5] 
100%|██████████| 14/14 [00:02<00:00,  5.04it/s, accuracy=0.836, valid_loss=1.1] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 46


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.679, lr=3e-5, train_loss=2.35]
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.852, valid_loss=1]    
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 47


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.698, lr=3e-5, train_loss=2.2] 
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.867, valid_loss=0.911]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 48


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.717, lr=3e-5, train_loss=2.06]
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.88, valid_loss=0.829] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 49


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.732, lr=3e-5, train_loss=1.93]
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.894, valid_loss=0.758]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 50


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.752, lr=3e-5, train_loss=1.8] 
100%|██████████| 14/14 [00:02<00:00,  5.00it/s, accuracy=0.905, valid_loss=0.685]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 51


100%|██████████| 67/67 [00:33<00:00,  2.03it/s, accuracy=0.771, lr=3e-5, train_loss=1.68]
100%|██████████| 14/14 [00:02<00:00,  5.00it/s, accuracy=0.917, valid_loss=0.621]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 52


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.785, lr=3e-5, train_loss=1.57]
100%|██████████| 14/14 [00:02<00:00,  4.99it/s, accuracy=0.926, valid_loss=0.567]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 53


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.8, lr=3e-5, train_loss=1.46]  
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.935, valid_loss=0.518]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 54


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.815, lr=3e-5, train_loss=1.37]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.943, valid_loss=0.468]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 55


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.828, lr=3e-5, train_loss=1.28]
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.949, valid_loss=0.426]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 56


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.843, lr=3e-5, train_loss=1.19]
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.956, valid_loss=0.389]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 57


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.856, lr=3e-5, train_loss=1.1] 
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.96, valid_loss=0.363] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 58


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.87, lr=3e-5, train_loss=1.03] 
100%|██████████| 14/14 [00:02<00:00,  4.91it/s, accuracy=0.964, valid_loss=0.326]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 59


100%|██████████| 67/67 [00:33<00:00,  2.03it/s, accuracy=0.88, lr=3e-5, train_loss=0.962] 
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.969, valid_loss=0.3]  
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 60


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.892, lr=3e-5, train_loss=0.894]
100%|██████████| 14/14 [00:02<00:00,  4.73it/s, accuracy=0.974, valid_loss=0.279]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 61


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.904, lr=3e-5, train_loss=0.833]
100%|██████████| 14/14 [00:02<00:00,  4.99it/s, accuracy=0.976, valid_loss=0.265]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 62


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.912, lr=3e-5, train_loss=0.781]
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.979, valid_loss=0.244]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 63


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.922, lr=3e-5, train_loss=0.723]
100%|██████████| 14/14 [00:02<00:00,  4.99it/s, accuracy=0.982, valid_loss=0.226]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 64


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.928, lr=3e-5, train_loss=0.678]
100%|██████████| 14/14 [00:02<00:00,  4.99it/s, accuracy=0.983, valid_loss=0.218]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 65


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.936, lr=3e-5, train_loss=0.635]
100%|██████████| 14/14 [00:02<00:00,  4.93it/s, accuracy=0.985, valid_loss=0.2]  
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 66


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.942, lr=3e-5, train_loss=0.594]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.986, valid_loss=0.189]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 67


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.948, lr=3e-5, train_loss=0.556]
100%|██████████| 14/14 [00:02<00:00,  5.05it/s, accuracy=0.987, valid_loss=0.18] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 68


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.953, lr=3e-5, train_loss=0.524]
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.987, valid_loss=0.176]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 69


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.959, lr=3e-5, train_loss=0.492]
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.987, valid_loss=0.173]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 70


100%|██████████| 67/67 [00:33<00:00,  2.03it/s, accuracy=0.962, lr=3e-5, train_loss=0.465]
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.988, valid_loss=0.163]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 71


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.966, lr=3e-5, train_loss=0.435]
100%|██████████| 14/14 [00:02<00:00,  5.00it/s, accuracy=0.989, valid_loss=0.158]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 72


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.969, lr=3e-5, train_loss=0.41] 
100%|██████████| 14/14 [00:02<00:00,  4.80it/s, accuracy=0.989, valid_loss=0.151]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 73


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.973, lr=3e-5, train_loss=0.389]
100%|██████████| 14/14 [00:02<00:00,  4.99it/s, accuracy=0.989, valid_loss=0.147]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 74


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.974, lr=3e-5, train_loss=0.372]
100%|██████████| 14/14 [00:02<00:00,  4.91it/s, accuracy=0.99, valid_loss=0.148] 
  0%|          | 0/67 [00:00<?, ?it/s]

******************************
******************************
Epoch 75


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.976, lr=3e-5, train_loss=0.355]
100%|██████████| 14/14 [00:02<00:00,  5.06it/s, accuracy=0.99, valid_loss=0.14]  
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 76


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.978, lr=3e-5, train_loss=0.338]
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.99, valid_loss=0.14]  
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 77


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.98, lr=3e-5, train_loss=0.322] 
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.991, valid_loss=0.14] 
  0%|          | 0/67 [00:00<?, ?it/s]

******************************
******************************
Epoch 78


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.981, lr=3e-5, train_loss=0.307]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.991, valid_loss=0.134]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 79


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.982, lr=3e-5, train_loss=0.296]
100%|██████████| 14/14 [00:02<00:00,  5.05it/s, accuracy=0.992, valid_loss=0.13] 
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 80


100%|██████████| 67/67 [00:33<00:00,  2.02it/s, accuracy=0.984, lr=3e-5, train_loss=0.286]
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.992, valid_loss=0.13] 
  0%|          | 0/67 [00:00<?, ?it/s]

******************************
******************************
Epoch 81


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.986, lr=3e-5, train_loss=0.271]
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.992, valid_loss=0.128]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 82


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.985, lr=3e-5, train_loss=0.26] 
100%|██████████| 14/14 [00:02<00:00,  5.05it/s, accuracy=0.992, valid_loss=0.126]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 83


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.986, lr=3e-5, train_loss=0.252]
100%|██████████| 14/14 [00:02<00:00,  5.04it/s, accuracy=0.993, valid_loss=0.122]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 84


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.987, lr=3e-5, train_loss=0.244]
100%|██████████| 14/14 [00:02<00:00,  5.01it/s, accuracy=0.993, valid_loss=0.123]
  0%|          | 0/67 [00:00<?, ?it/s]

******************************
******************************
Epoch 85


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.987, lr=3e-5, train_loss=0.24] 
100%|██████████| 14/14 [00:02<00:00,  5.02it/s, accuracy=0.993, valid_loss=0.123]
  0%|          | 0/67 [00:00<?, ?it/s]

******************************
******************************
Epoch 86


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.987, lr=3e-5, train_loss=0.232]
100%|██████████| 14/14 [00:02<00:00,  4.89it/s, accuracy=0.993, valid_loss=0.121]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 87


100%|██████████| 67/67 [00:33<00:00,  2.03it/s, accuracy=0.987, lr=3e-5, train_loss=0.225]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.993, valid_loss=0.122]
  0%|          | 0/67 [00:00<?, ?it/s]

******************************
******************************
Epoch 88


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.989, lr=3e-5, train_loss=0.216]
100%|██████████| 14/14 [00:02<00:00,  4.93it/s, accuracy=0.993, valid_loss=0.121]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 89


100%|██████████| 67/67 [00:32<00:00,  2.03it/s, accuracy=0.989, lr=3e-5, train_loss=0.209]
100%|██████████| 14/14 [00:02<00:00,  5.03it/s, accuracy=0.993, valid_loss=0.114]
  0%|          | 0/67 [00:00<?, ?it/s]

Saved best model!
******************************
******************************
Epoch 90


100%|██████████| 67/67 [00:33<00:00,  2.02it/s, accuracy=0.989, lr=3e-5, train_loss=0.204]
100%|██████████| 14/14 [00:02<00:00,  4.98it/s, accuracy=0.994, valid_loss=0.116]
  0%|          | 0/67 [00:00<?, ?it/s]

******************************
******************************
Epoch 91


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.989, lr=3e-5, train_loss=0.201]
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.994, valid_loss=0.119]
  0%|          | 0/67 [00:00<?, ?it/s]

******************************
******************************
Epoch 92


100%|██████████| 67/67 [00:32<00:00,  2.04it/s, accuracy=0.99, lr=3e-5, train_loss=0.199] 
100%|██████████| 14/14 [00:02<00:00,  4.97it/s, accuracy=0.993, valid_loss=0.116]


Loading best model weights!


RuntimeError: Error(s) in loading state_dict for DataParallel:
	Missing key(s) in state_dict: "module.bert_model.embeddings.word_embeddings.weight", "module.bert_model.embeddings.position_embeddings.weight", "module.bert_model.embeddings.LayerNorm.weight", "module.bert_model.embeddings.LayerNorm.bias", "module.bert_model.transformer.layer.0.attention.q_lin.weight", "module.bert_model.transformer.layer.0.attention.q_lin.bias", "module.bert_model.transformer.layer.0.attention.k_lin.weight", "module.bert_model.transformer.layer.0.attention.k_lin.bias", "module.bert_model.transformer.layer.0.attention.v_lin.weight", "module.bert_model.transformer.layer.0.attention.v_lin.bias", "module.bert_model.transformer.layer.0.attention.out_lin.weight", "module.bert_model.transformer.layer.0.attention.out_lin.bias", "module.bert_model.transformer.layer.0.sa_layer_norm.weight", "module.bert_model.transformer.layer.0.sa_layer_norm.bias", "module.bert_model.transformer.layer.0.ffn.lin1.weight", "module.bert_model.transformer.layer.0.ffn.lin1.bias", "module.bert_model.transformer.layer.0.ffn.lin2.weight", "module.bert_model.transformer.layer.0.ffn.lin2.bias", "module.bert_model.transformer.layer.0.output_layer_norm.weight", "module.bert_model.transformer.layer.0.output_layer_norm.bias", "module.bert_model.transformer.layer.1.attention.q_lin.weight", "module.bert_model.transformer.layer.1.attention.q_lin.bias", "module.bert_model.transformer.layer.1.attention.k_lin.weight", "module.bert_model.transformer.layer.1.attention.k_lin.bias", "module.bert_model.transformer.layer.1.attention.v_lin.weight", "module.bert_model.transformer.layer.1.attention.v_lin.bias", "module.bert_model.transformer.layer.1.attention.out_lin.weight", "module.bert_model.transformer.layer.1.attention.out_lin.bias", "module.bert_model.transformer.layer.1.sa_layer_norm.weight", "module.bert_model.transformer.layer.1.sa_layer_norm.bias", "module.bert_model.transformer.layer.1.ffn.lin1.weight", "module.bert_model.transformer.layer.1.ffn.lin1.bias", "module.bert_model.transformer.layer.1.ffn.lin2.weight", "module.bert_model.transformer.layer.1.ffn.lin2.bias", "module.bert_model.transformer.layer.1.output_layer_norm.weight", "module.bert_model.transformer.layer.1.output_layer_norm.bias", "module.bert_model.transformer.layer.2.attention.q_lin.weight", "module.bert_model.transformer.layer.2.attention.q_lin.bias", "module.bert_model.transformer.layer.2.attention.k_lin.weight", "module.bert_model.transformer.layer.2.attention.k_lin.bias", "module.bert_model.transformer.layer.2.attention.v_lin.weight", "module.bert_model.transformer.layer.2.attention.v_lin.bias", "module.bert_model.transformer.layer.2.attention.out_lin.weight", "module.bert_model.transformer.layer.2.attention.out_lin.bias", "module.bert_model.transformer.layer.2.sa_layer_norm.weight", "module.bert_model.transformer.layer.2.sa_layer_norm.bias", "module.bert_model.transformer.layer.2.ffn.lin1.weight", "module.bert_model.transformer.layer.2.ffn.lin1.bias", "module.bert_model.transformer.layer.2.ffn.lin2.weight", "module.bert_model.transformer.layer.2.ffn.lin2.bias", "module.bert_model.transformer.layer.2.output_layer_norm.weight", "module.bert_model.transformer.layer.2.output_layer_norm.bias", "module.bert_model.transformer.layer.3.attention.q_lin.weight", "module.bert_model.transformer.layer.3.attention.q_lin.bias", "module.bert_model.transformer.layer.3.attention.k_lin.weight", "module.bert_model.transformer.layer.3.attention.k_lin.bias", "module.bert_model.transformer.layer.3.attention.v_lin.weight", "module.bert_model.transformer.layer.3.attention.v_lin.bias", "module.bert_model.transformer.layer.3.attention.out_lin.weight", "module.bert_model.transformer.layer.3.attention.out_lin.bias", "module.bert_model.transformer.layer.3.sa_layer_norm.weight", "module.bert_model.transformer.layer.3.sa_layer_norm.bias", "module.bert_model.transformer.layer.3.ffn.lin1.weight", "module.bert_model.transformer.layer.3.ffn.lin1.bias", "module.bert_model.transformer.layer.3.ffn.lin2.weight", "module.bert_model.transformer.layer.3.ffn.lin2.bias", "module.bert_model.transformer.layer.3.output_layer_norm.weight", "module.bert_model.transformer.layer.3.output_layer_norm.bias", "module.bert_model.transformer.layer.4.attention.q_lin.weight", "module.bert_model.transformer.layer.4.attention.q_lin.bias", "module.bert_model.transformer.layer.4.attention.k_lin.weight", "module.bert_model.transformer.layer.4.attention.k_lin.bias", "module.bert_model.transformer.layer.4.attention.v_lin.weight", "module.bert_model.transformer.layer.4.attention.v_lin.bias", "module.bert_model.transformer.layer.4.attention.out_lin.weight", "module.bert_model.transformer.layer.4.attention.out_lin.bias", "module.bert_model.transformer.layer.4.sa_layer_norm.weight", "module.bert_model.transformer.layer.4.sa_layer_norm.bias", "module.bert_model.transformer.layer.4.ffn.lin1.weight", "module.bert_model.transformer.layer.4.ffn.lin1.bias", "module.bert_model.transformer.layer.4.ffn.lin2.weight", "module.bert_model.transformer.layer.4.ffn.lin2.bias", "module.bert_model.transformer.layer.4.output_layer_norm.weight", "module.bert_model.transformer.layer.4.output_layer_norm.bias", "module.bert_model.transformer.layer.5.attention.q_lin.weight", "module.bert_model.transformer.layer.5.attention.q_lin.bias", "module.bert_model.transformer.layer.5.attention.k_lin.weight", "module.bert_model.transformer.layer.5.attention.k_lin.bias", "module.bert_model.transformer.layer.5.attention.v_lin.weight", "module.bert_model.transformer.layer.5.attention.v_lin.bias", "module.bert_model.transformer.layer.5.attention.out_lin.weight", "module.bert_model.transformer.layer.5.attention.out_lin.bias", "module.bert_model.transformer.layer.5.sa_layer_norm.weight", "module.bert_model.transformer.layer.5.sa_layer_norm.bias", "module.bert_model.transformer.layer.5.ffn.lin1.weight", "module.bert_model.transformer.layer.5.ffn.lin1.bias", "module.bert_model.transformer.layer.5.ffn.lin2.weight", "module.bert_model.transformer.layer.5.ffn.lin2.bias", "module.bert_model.transformer.layer.5.output_layer_norm.weight", "module.bert_model.transformer.layer.5.output_layer_norm.bias", "module.arc_margin.weight". 
	Unexpected key(s) in state_dict: "bert_model.embeddings.word_embeddings.weight", "bert_model.embeddings.position_embeddings.weight", "bert_model.embeddings.LayerNorm.weight", "bert_model.embeddings.LayerNorm.bias", "bert_model.transformer.layer.0.attention.q_lin.weight", "bert_model.transformer.layer.0.attention.q_lin.bias", "bert_model.transformer.layer.0.attention.k_lin.weight", "bert_model.transformer.layer.0.attention.k_lin.bias", "bert_model.transformer.layer.0.attention.v_lin.weight", "bert_model.transformer.layer.0.attention.v_lin.bias", "bert_model.transformer.layer.0.attention.out_lin.weight", "bert_model.transformer.layer.0.attention.out_lin.bias", "bert_model.transformer.layer.0.sa_layer_norm.weight", "bert_model.transformer.layer.0.sa_layer_norm.bias", "bert_model.transformer.layer.0.ffn.lin1.weight", "bert_model.transformer.layer.0.ffn.lin1.bias", "bert_model.transformer.layer.0.ffn.lin2.weight", "bert_model.transformer.layer.0.ffn.lin2.bias", "bert_model.transformer.layer.0.output_layer_norm.weight", "bert_model.transformer.layer.0.output_layer_norm.bias", "bert_model.transformer.layer.1.attention.q_lin.weight", "bert_model.transformer.layer.1.attention.q_lin.bias", "bert_model.transformer.layer.1.attention.k_lin.weight", "bert_model.transformer.layer.1.attention.k_lin.bias", "bert_model.transformer.layer.1.attention.v_lin.weight", "bert_model.transformer.layer.1.attention.v_lin.bias", "bert_model.transformer.layer.1.attention.out_lin.weight", "bert_model.transformer.layer.1.attention.out_lin.bias", "bert_model.transformer.layer.1.sa_layer_norm.weight", "bert_model.transformer.layer.1.sa_layer_norm.bias", "bert_model.transformer.layer.1.ffn.lin1.weight", "bert_model.transformer.layer.1.ffn.lin1.bias", "bert_model.transformer.layer.1.ffn.lin2.weight", "bert_model.transformer.layer.1.ffn.lin2.bias", "bert_model.transformer.layer.1.output_layer_norm.weight", "bert_model.transformer.layer.1.output_layer_norm.bias", "bert_model.transformer.layer.2.attention.q_lin.weight", "bert_model.transformer.layer.2.attention.q_lin.bias", "bert_model.transformer.layer.2.attention.k_lin.weight", "bert_model.transformer.layer.2.attention.k_lin.bias", "bert_model.transformer.layer.2.attention.v_lin.weight", "bert_model.transformer.layer.2.attention.v_lin.bias", "bert_model.transformer.layer.2.attention.out_lin.weight", "bert_model.transformer.layer.2.attention.out_lin.bias", "bert_model.transformer.layer.2.sa_layer_norm.weight", "bert_model.transformer.layer.2.sa_layer_norm.bias", "bert_model.transformer.layer.2.ffn.lin1.weight", "bert_model.transformer.layer.2.ffn.lin1.bias", "bert_model.transformer.layer.2.ffn.lin2.weight", "bert_model.transformer.layer.2.ffn.lin2.bias", "bert_model.transformer.layer.2.output_layer_norm.weight", "bert_model.transformer.layer.2.output_layer_norm.bias", "bert_model.transformer.layer.3.attention.q_lin.weight", "bert_model.transformer.layer.3.attention.q_lin.bias", "bert_model.transformer.layer.3.attention.k_lin.weight", "bert_model.transformer.layer.3.attention.k_lin.bias", "bert_model.transformer.layer.3.attention.v_lin.weight", "bert_model.transformer.layer.3.attention.v_lin.bias", "bert_model.transformer.layer.3.attention.out_lin.weight", "bert_model.transformer.layer.3.attention.out_lin.bias", "bert_model.transformer.layer.3.sa_layer_norm.weight", "bert_model.transformer.layer.3.sa_layer_norm.bias", "bert_model.transformer.layer.3.ffn.lin1.weight", "bert_model.transformer.layer.3.ffn.lin1.bias", "bert_model.transformer.layer.3.ffn.lin2.weight", "bert_model.transformer.layer.3.ffn.lin2.bias", "bert_model.transformer.layer.3.output_layer_norm.weight", "bert_model.transformer.layer.3.output_layer_norm.bias", "bert_model.transformer.layer.4.attention.q_lin.weight", "bert_model.transformer.layer.4.attention.q_lin.bias", "bert_model.transformer.layer.4.attention.k_lin.weight", "bert_model.transformer.layer.4.attention.k_lin.bias", "bert_model.transformer.layer.4.attention.v_lin.weight", "bert_model.transformer.layer.4.attention.v_lin.bias", "bert_model.transformer.layer.4.attention.out_lin.weight", "bert_model.transformer.layer.4.attention.out_lin.bias", "bert_model.transformer.layer.4.sa_layer_norm.weight", "bert_model.transformer.layer.4.sa_layer_norm.bias", "bert_model.transformer.layer.4.ffn.lin1.weight", "bert_model.transformer.layer.4.ffn.lin1.bias", "bert_model.transformer.layer.4.ffn.lin2.weight", "bert_model.transformer.layer.4.ffn.lin2.bias", "bert_model.transformer.layer.4.output_layer_norm.weight", "bert_model.transformer.layer.4.output_layer_norm.bias", "bert_model.transformer.layer.5.attention.q_lin.weight", "bert_model.transformer.layer.5.attention.q_lin.bias", "bert_model.transformer.layer.5.attention.k_lin.weight", "bert_model.transformer.layer.5.attention.k_lin.bias", "bert_model.transformer.layer.5.attention.v_lin.weight", "bert_model.transformer.layer.5.attention.v_lin.bias", "bert_model.transformer.layer.5.attention.out_lin.weight", "bert_model.transformer.layer.5.attention.out_lin.bias", "bert_model.transformer.layer.5.sa_layer_norm.weight", "bert_model.transformer.layer.5.sa_layer_norm.bias", "bert_model.transformer.layer.5.ffn.lin1.weight", "bert_model.transformer.layer.5.ffn.lin1.bias", "bert_model.transformer.layer.5.ffn.lin2.weight", "bert_model.transformer.layer.5.ffn.lin2.bias", "bert_model.transformer.layer.5.output_layer_norm.weight", "bert_model.transformer.layer.5.output_layer_norm.bias", "arc_margin.weight". 