In [18]:
!git clone https://github.com/UKPLab/sentence-transformers.git

Cloning into 'sentence-transformers'...
remote: Enumerating objects: 4679, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (166/166), done.[K
remote: Total 4679 (delta 114), reused 145 (delta 58), pack-reused 4455[K
Receiving objects: 100% (4679/4679), 15.97 MiB | 17.34 MiB/s, done.
Resolving deltas: 100% (3209/3209), done.


In [1]:
import torch
import sys
import logging

sys.path.append("../KoBERT")
from kobert.pytorch_kobert import get_pytorch_kobert_model
from gluonnlp.data import SentencepieceTokenizer
from kobert.utils import get_tokenizer
import transformers

# Model, Tokenizer load

In [2]:
logging.info("load model and tokenizer")
# model, vocab load
model, vocab  = get_pytorch_kobert_model()

# tokenizer load
tok_path = get_tokenizer()
sp  = SentencepieceTokenizer(tok_path)

using cached model
using cached model
using cached model


# 학습 데이터 load

In [3]:
from sentence_transformers import InputExample, losses
from torch.utils.data import DataLoader
import logging
import pandas as pd

In [5]:
logging.info("Read AllNLI train dataset")

label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = []

train_df = pd.read_csv('../data/KorNLUDatasets/KorNLI/snli_1.0_train.ko.tsv', sep='\t', encoding="utf-8")

for s1, s2, labels in zip(train_df['sentence1'], train_df['sentence2'], train_df['gold_label']):
    label = label2int[labels.strip()]
    train_samples.append(InputExample(texts=[s1, s2], label=label))

In [6]:
vocab.token_to_idx['[SEP]']

3

In [7]:
# NLI Dataset 선언
class SentencesDataset(torch.utils.data.Dataset):
    def __init__(self, examples, tokenizer, vocab, max_seq_length):
        '''
        examples : List[InputExample]
        tokenizer : SentencepieceTokenizer
        vocab : vocab module
        max_seq_length : max sequence length.
        '''
        self.examples = examples
        self.label_type = torch.long if isinstance(self.examples[0].label, int) else torch.float
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_seq_length = max_seq_length
    
    def __getitem__(self, idx):
        item = dict()
        # tokenizing
        s1, s2 = [self.tokenizer(text) for text in self.examples[idx].texts]
        
        # slicing(consider special token)
        s1 = s1[:self.max_seq_length-2] if len(s1) > self.max_seq_length-2 else s1
        s2 = s2[:self.max_seq_length-2] if len(s2) > self.max_seq_length-2 else s2
        
        # add special token
        s1 = ['[CLS]'] + s1 + ['[SEP]']
        s2 = ['[CLS]'] + s2 + ['[SEP]']
        
        # token to index
        s1 = [vocab.token_to_idx[x] for x in s1]
        s2 = [vocab.token_to_idx[x] for x in s2]
        
        # add padding
        s1_ = s1 + [0]*(self.max_seq_length - len(s1)) if len(s1) < self.max_seq_length else s1[:self.max_seq_length]
        s2_ = s2 + [0]*(self.max_seq_length - len(s2)) if len(s2) < self.max_seq_length else s2[:self.max_seq_length]
        
        item['input_ids1'] = torch.tensor(s1_)
        item['input_ids2'] = torch.tensor(s2_)
        
        # attention mask 
        attention_mask1 = [1] * len(s1) + [0] * (self.max_seq_length - len(s1)) if len(s1) < self.max_seq_length else [1] * self.max_seq_length
        attention_mask2 = [1] * len(s2) + [0] * (self.max_seq_length - len(s2)) if len(s2) < self.max_seq_length else [1] * self.max_seq_length
        
        item['attention_mask1'] = torch.tensor(attention_mask1)
        item['attention_mask2'] = torch.tensor(attention_mask2)
        
        # label
        item['label'] = torch.tensor(self.examples[idx].label, dtype=self.label_type) 
        
        item['token_type_ids1'] = torch.tensor(self.max_seq_length * [0])
        item['token_type_ids2'] = torch.tensor(self.max_seq_length * [0])
        return item
            
    def __len__(self):
        return len(self.examples)

In [8]:
train_dataset = SentencesDataset(train_samples, sp, vocab, 30)
train_dataset[0]

{'input_ids1': tensor([   2, 1962, 4707, 2589,  993, 7178, 5663, 2545, 3554, 1868, 6964, 6115,
         5782,   54,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]),
 'input_ids2': tensor([   2, 4955, 2589,  970, 7088, 3567, 1962, 5184, 6723, 3862,   54,    3,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]),
 'attention_mask1': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 'attention_mask2': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 'label': tensor(2),
 'token_type_ids1': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 'token_type_ids2': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0])}

In [9]:
train_batch_size = 2
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)

In [10]:
next(iter(train_dataloader))

{'input_ids1': tensor([[   2, 4501, 6083, 6538,  517, 6601, 7478, 6983, 4799, 6016, 6538, 2207,
          6273, 7329, 3843, 4073, 3313, 7096, 3454, 7078,  745, 4425, 3006, 6093,
          7748, 4636, 6116, 1802, 4955,    3],
         [   2, 2586, 4303, 5330, 5760, 2369, 3552, 6896, 1773, 2038,  976, 7096,
          2718, 3862,   54,    3,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0]]),
 'input_ids2': tensor([[   2, 4073, 3313, 7086, 1495, 2207, 6273, 7318, 6896, 4799, 6016,  517,
          6601, 7478, 6116, 3838, 3862,    3,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0],
         [   2, 1487, 7096, 1958, 7788, 3873,    3,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0]]),
 'attention_mask1': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1

In [11]:
# SoftmaxLoss(NLI datasets)
class SoftmaxLoss(torch.nn.Module):
    """
    Reference
    https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/losses/SoftmaxLoss.py
    
    This loss was used in our SBERT publication (https://arxiv.org/abs/1908.10084) to train the SentenceTransformer
    model on NLI data. It adds a softmax classifier on top of the output of two transformer networks.

    :param model: pretrained model
    :param sentence_embedding_dimension: Dimension of your sentence embeddings
    :param num_labels: Number of different labels
    :param concatenation_sent_rep: Concatenate vectors u,v for the softmax classifier?
    :param concatenation_sent_difference: Add abs(u-v) for the softmax classifier?
    :param concatenation_sent_multiplication: Add u*v for the softmax classifier?
    
    """
    def __init__(self, model, sentence_embedding_dimension, num_labels, concatenation_sent_rep, concatenation_sent_difference, concatenation_sent_multiplication):
        super(SoftmaxLoss, self).__init__()
        self.model = model
        self.num_labels = num_labels
        self.concatenation_sent_rep = concatenation_sent_rep
        self.concatenation_sent_difference = concatenation_sent_difference
        self.concatenation_sent_multiplication = concatenation_sent_multiplication
        
        num_vectors_concatenated = 0
        if concatenation_sent_rep:
            num_vectors_concatenated += 2
        if concatenation_sent_difference:
            num_vectors_concatenated += 1
        if concatenation_sent_multiplication:
            num_vectors_concatenated += 1
        logging.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated))
        self.classifier = torch.nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels)
    
    def mean_pooling(self, token_embeddings, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
        
    def forward(self, x):
        # input_ids1, attention_mask1, token_type_ids1, input_ids2, attention_mask2, token_type_ids2, labels
        output1 = self.model(input_ids=x['input_ids1'], attention_mask=x['attention_mask1'], token_type_ids=x['token_type_ids1'])
        output2 = self.model(input_ids=x['input_ids2'], attention_mask=x['attention_mask2'], token_type_ids=x['token_type_ids2'])
        
        rep_a = self.mean_pooling(output1[0], x['attention_mask1'])
        rep_b = self.mean_pooling(output2[0], x['attention_mask2'])
        
        vectors_concat = []
        if self.concatenation_sent_rep:
            vectors_concat.append(rep_a)
            vectors_concat.append(rep_b)

        if self.concatenation_sent_difference:
            vectors_concat.append(torch.abs(rep_a - rep_b))

        if self.concatenation_sent_multiplication:
            vectors_concat.append(rep_a * rep_b)

        features = torch.cat(vectors_concat, 1)
        
        output = self.classifier(features)
        loss_fct = torch.nn.CrossEntropyLoss()
        
        if labels is not None:
            loss = loss_fct(output, x['label'].view(-1))
            return loss
        else:
            return reps, output

# NLI 학습 및 검증

In [13]:
# NLI dataset load
logging.info("Read AllNLI train dataset")

label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = []

train_df = pd.read_csv('../data/KorNLUDatasets/KorNLI/snli_1.0_train.ko.tsv', sep='\t', encoding="utf-8")

for s1, s2, labels in zip(train_df['sentence1'], train_df['sentence2'], train_df['gold_label']):
    label = label2int[labels.strip()]
    train_samples.append(InputExample(texts=[s1, s2], label=label))


batch_size = 16
max_sequence_length = 64

# train dataset Load
train_dataset = SentencesDataset(train_samples, sp, vocab, max_sequence_length)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)

In [14]:
# validation dataset load
logging.info("Read STSbenchmark dev dataset")
dev_samples = []

with open('../data/KorNLUDatasets/KorSTS/tune_dev.tsv', 'rt', encoding='utf-8') as fIn:
    lines = fIn.readlines()
    for line in lines:
        s1, s2, score = line.split('\t')
        score = score.strip()
        score = float(score) / 5.0
        dev_samples.append(InputExample(texts= [s1,s2], label=score))

In [15]:
dev_dataset = SentencesDataset(dev_samples, sp, vocab, max_sequence_length)
dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)

# 학습

In [16]:
batch_size = 16
num_epochs = 1
weight_decay = 0.01

num_training_steps = int(len(train_dataloader) * num_epochs)

import math
# model 선언
SBERT_model = SoftmaxLoss(model, 768, 3, True, True, False)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

param_optimizer = list(SBERT_model.named_parameters())
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]


# optimizer 선언
from transformers import AdamW

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-6, correct_bias=False)
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, warmup_steps, num_training_steps=num_training_steps)


# NLI dataset 학습
from tqdm import tqdm
from tqdm.autonotebook import trange

# 기록을 위해 tensorboard 사용
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(f'runs/{num_epochs}_{batch_size}')

global_steps = 0
running_loss = 0
for epoch in trange(num_epochs, desc='Epoch', disable=not True):
    training_steps = 0
    
    for batch in tqdm(train_dataloader, desc='Iteration', disable=False):
        SBERT_model.zero_grad()
        SBERT_model.train()
        SBERT_model.to(device)
        
        batch = {key: batch[key].to(device) for key in batch.keys()}
        
        loss_value = SBERT_model(batch)
        loss_value.backward()
        torch.nn.utils.clip_grad_norm(SBERT_model.parameters(), 1)
        optimizer.step()
        scheduler.step()
        
        running_loss += loss_value.item()
        
        if training_steps % 1000 == 999:
            print(f'training_loss : {running_loss / 1000:.4f}, epochs: {epoch}, training_steps: {training_steps}')
            writer.add_scalar('training_loss', running_loss / 1000, training_steps + epoch * len(train_dataloader))
            torch.save(SBERT_model, f'./trained_model/{epoch}_{training_steps}_{running_loss/1000:.4f}.pt')
            running_loss = 0
        
        training_steps += 1
    global_steps += 1

HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

  torch.nn.utils.clip_grad_norm(SBERT_model.parameters(), 1)
Iteration:   0%|          | 998/275076 [01:30<6:40:36, 11.40it/s]

training_loss : 1.0999, epochs: 0, training_steps: 999


Iteration:   1%|          | 1999/275076 [03:11<8:23:20,  9.04it/s] 

training_loss : 1.0912, epochs: 0, training_steps: 1999


Iteration:   1%|          | 2999/275076 [04:52<8:24:41,  8.98it/s] 

training_loss : 1.0596, epochs: 0, training_steps: 2999


Iteration:   1%|▏         | 3999/275076 [06:40<9:47:27,  7.69it/s] 

training_loss : 1.0278, epochs: 0, training_steps: 3999


Iteration:   2%|▏         | 4999/275076 [08:28<8:05:25,  9.27it/s] 

training_loss : 0.9953, epochs: 0, training_steps: 4999


Iteration:   2%|▏         | 5999/275076 [09:59<8:07:15,  9.20it/s] 

training_loss : 0.9535, epochs: 0, training_steps: 5999


Iteration:   3%|▎         | 6998/275076 [11:29<6:38:38, 11.21it/s] 

training_loss : 0.9746, epochs: 0, training_steps: 6999


Iteration:   3%|▎         | 7998/275076 [12:58<6:33:37, 11.31it/s] 

training_loss : 0.9439, epochs: 0, training_steps: 7999


Iteration:   3%|▎         | 8999/275076 [14:48<8:09:44,  9.05it/s] 

training_loss : 0.9500, epochs: 0, training_steps: 8999


Iteration:   4%|▎         | 9998/275076 [16:17<6:30:36, 11.31it/s] 

training_loss : 0.9255, epochs: 0, training_steps: 9999


Iteration:   4%|▍         | 10998/275076 [17:46<6:29:08, 11.31it/s] 

training_loss : 0.9039, epochs: 0, training_steps: 10999


Iteration:   4%|▍         | 11999/275076 [19:16<6:25:50, 11.36it/s] 

training_loss : 0.9045, epochs: 0, training_steps: 11999


Iteration:   5%|▍         | 12999/275076 [20:51<7:57:52,  9.14it/s] 

training_loss : 0.9094, epochs: 0, training_steps: 12999


Iteration:   5%|▌         | 13998/275076 [22:20<6:24:49, 11.31it/s] 

training_loss : 0.9272, epochs: 0, training_steps: 13999


Iteration:   5%|▌         | 14999/275076 [24:05<7:57:24,  9.08it/s] 

training_loss : 0.9158, epochs: 0, training_steps: 14999


Iteration:   6%|▌         | 15998/275076 [25:35<6:22:52, 11.28it/s] 

training_loss : 0.9374, epochs: 0, training_steps: 15999


Iteration:   6%|▌         | 16998/275076 [27:04<6:21:51, 11.26it/s] 

training_loss : 0.9024, epochs: 0, training_steps: 16999


Iteration:   7%|▋         | 17998/275076 [28:34<6:22:23, 11.20it/s] 

training_loss : 0.9536, epochs: 0, training_steps: 17999


Iteration:   7%|▋         | 18999/275076 [30:16<9:08:18,  7.78it/s] 

training_loss : 0.8780, epochs: 0, training_steps: 18999


Iteration:   7%|▋         | 19999/275076 [32:03<7:44:02,  9.16it/s] 

training_loss : 0.9483, epochs: 0, training_steps: 19999


Iteration:   8%|▊         | 20999/275076 [33:52<7:48:43,  9.03it/s] 

training_loss : 0.9128, epochs: 0, training_steps: 20999


Iteration:   8%|▊         | 21999/275076 [35:22<6:13:40, 11.29it/s] 

training_loss : 0.9449, epochs: 0, training_steps: 21999


Iteration:   8%|▊         | 22999/275076 [37:09<7:40:29,  9.12it/s] 

training_loss : 0.8920, epochs: 0, training_steps: 22999


Iteration:   9%|▊         | 23999/275076 [38:41<7:39:09,  9.11it/s] 

training_loss : 0.9144, epochs: 0, training_steps: 23999


Iteration:   9%|▉         | 24998/275076 [40:10<6:13:56, 11.15it/s] 

training_loss : 0.9070, epochs: 0, training_steps: 24999


Iteration:   9%|▉         | 25999/275076 [41:55<7:36:32,  9.09it/s] 

training_loss : 0.9151, epochs: 0, training_steps: 25999


Iteration:  10%|▉         | 26999/275076 [43:30<7:27:29,  9.24it/s] 

training_loss : 0.9389, epochs: 0, training_steps: 26999


Iteration:  10%|█         | 27999/275076 [45:00<8:27:46,  8.11it/s] 

training_loss : 0.9612, epochs: 0, training_steps: 27999


Iteration:  11%|█         | 28999/275076 [46:47<7:26:40,  9.18it/s] 

training_loss : 0.9468, epochs: 0, training_steps: 28999


Iteration:  11%|█         | 29998/275076 [48:17<6:04:20, 11.21it/s] 

training_loss : 0.9496, epochs: 0, training_steps: 29999


Iteration:  11%|█▏        | 30999/275076 [49:55<8:08:59,  8.32it/s] 

training_loss : 0.9310, epochs: 0, training_steps: 30999


Iteration:  12%|█▏        | 31999/275076 [51:35<6:40:09, 10.12it/s] 

training_loss : 0.9108, epochs: 0, training_steps: 31999


Iteration:  12%|█▏        | 32998/275076 [53:15<6:35:28, 10.20it/s] 

training_loss : 0.9187, epochs: 0, training_steps: 32999


Iteration:  12%|█▏        | 33998/275076 [54:49<5:52:46, 11.39it/s] 

training_loss : 0.9352, epochs: 0, training_steps: 33999


Iteration:  13%|█▎        | 34998/275076 [56:19<5:56:37, 11.22it/s] 

training_loss : 0.9120, epochs: 0, training_steps: 34999


Iteration:  13%|█▎        | 35998/275076 [57:48<5:54:19, 11.25it/s] 

training_loss : 0.9221, epochs: 0, training_steps: 35999


Iteration:  13%|█▎        | 36999/275076 [59:22<6:39:46,  9.93it/s] 

training_loss : 0.9276, epochs: 0, training_steps: 36999


Iteration:  14%|█▍        | 37998/275076 [1:01:01<6:31:41, 10.09it/s]

training_loss : 0.9216, epochs: 0, training_steps: 37999


Iteration:  14%|█▍        | 38998/275076 [1:02:33<5:49:46, 11.25it/s] 

training_loss : 0.9001, epochs: 0, training_steps: 38999


Iteration:  15%|█▍        | 39998/275076 [1:04:02<5:45:11, 11.35it/s] 

training_loss : 0.9002, epochs: 0, training_steps: 39999


Iteration:  15%|█▍        | 40998/275076 [1:05:31<5:45:34, 11.29it/s] 

training_loss : 0.9161, epochs: 0, training_steps: 40999


Iteration:  15%|█▌        | 41999/275076 [1:07:10<6:53:03,  9.40it/s] 

training_loss : 0.9026, epochs: 0, training_steps: 41999


Iteration:  16%|█▌        | 42999/275076 [1:08:59<7:09:46,  9.00it/s] 

training_loss : 0.9116, epochs: 0, training_steps: 42999


Iteration:  16%|█▌        | 43998/275076 [1:10:28<5:42:50, 11.23it/s] 

training_loss : 0.9363, epochs: 0, training_steps: 43999


Iteration:  16%|█▋        | 44998/275076 [1:11:58<5:37:39, 11.36it/s] 

training_loss : 0.9124, epochs: 0, training_steps: 44999


Iteration:  17%|█▋        | 45998/275076 [1:13:27<5:42:51, 11.14it/s] 

training_loss : 0.9291, epochs: 0, training_steps: 45999


Iteration:  17%|█▋        | 46998/275076 [1:14:57<5:36:04, 11.31it/s] 

training_loss : 0.8914, epochs: 0, training_steps: 46999


Iteration:  17%|█▋        | 47999/275076 [1:16:26<5:33:22, 11.35it/s] 

training_loss : 0.8941, epochs: 0, training_steps: 47999


Iteration:  18%|█▊        | 48999/275076 [1:17:57<5:37:23, 11.17it/s] 

training_loss : 0.8757, epochs: 0, training_steps: 48999


Iteration:  18%|█▊        | 49999/275076 [1:19:27<5:38:43, 11.07it/s] 

training_loss : 0.8928, epochs: 0, training_steps: 49999


Iteration:  19%|█▊        | 50999/275076 [1:20:56<5:30:01, 11.32it/s] 

training_loss : 0.8810, epochs: 0, training_steps: 50999


Iteration:  19%|█▉        | 51999/275076 [1:22:25<5:31:42, 11.21it/s] 

training_loss : 0.9318, epochs: 0, training_steps: 51999


Iteration:  19%|█▉        | 52999/275076 [1:23:55<5:25:41, 11.36it/s] 

training_loss : 0.9271, epochs: 0, training_steps: 52999


Iteration:  20%|█▉        | 53999/275076 [1:25:24<5:25:41, 11.31it/s] 

training_loss : 0.9125, epochs: 0, training_steps: 53999


Iteration:  20%|█▉        | 54999/275076 [1:26:54<5:28:19, 11.17it/s] 

training_loss : 0.9251, epochs: 0, training_steps: 54999


Iteration:  20%|██        | 55999/275076 [1:28:24<6:38:58,  9.15it/s] 

training_loss : 0.9792, epochs: 0, training_steps: 55999


Iteration:  21%|██        | 56999/275076 [1:30:18<6:34:32,  9.21it/s] 

training_loss : 0.9169, epochs: 0, training_steps: 56999


Iteration:  21%|██        | 57999/275076 [1:31:54<6:34:54,  9.16it/s] 

training_loss : 0.9149, epochs: 0, training_steps: 57999


Iteration:  21%|██▏       | 58999/275076 [1:33:43<6:32:44,  9.17it/s] 

training_loss : 0.9183, epochs: 0, training_steps: 58999


Iteration:  22%|██▏       | 59999/275076 [1:35:17<6:36:26,  9.04it/s] 

training_loss : 0.9104, epochs: 0, training_steps: 59999


Iteration:  22%|██▏       | 60998/275076 [1:36:46<5:16:39, 11.27it/s] 

training_loss : 0.9132, epochs: 0, training_steps: 60999


Iteration:  23%|██▎       | 61999/275076 [1:38:32<7:08:18,  8.29it/s] 

training_loss : 0.8951, epochs: 0, training_steps: 61999


Iteration:  23%|██▎       | 62999/275076 [1:40:23<6:57:05,  8.47it/s] 

training_loss : 0.9041, epochs: 0, training_steps: 62999


Iteration:  23%|██▎       | 63999/275076 [1:42:06<6:25:04,  9.14it/s] 

training_loss : 0.8815, epochs: 0, training_steps: 63999


Iteration:  24%|██▎       | 64999/275076 [1:43:37<6:20:39,  9.20it/s] 

training_loss : 0.9289, epochs: 0, training_steps: 64999


Iteration:  24%|██▍       | 65999/275076 [1:45:18<6:24:46,  9.06it/s] 

training_loss : 0.9204, epochs: 0, training_steps: 65999


Iteration:  24%|██▍       | 66998/275076 [1:46:47<5:08:40, 11.24it/s] 

training_loss : 0.9084, epochs: 0, training_steps: 66999


Iteration:  25%|██▍       | 67998/275076 [1:48:17<5:18:15, 10.84it/s] 

training_loss : 0.9063, epochs: 0, training_steps: 67999


Iteration:  25%|██▌       | 68999/275076 [1:50:13<6:51:50,  8.34it/s] 

training_loss : 0.9097, epochs: 0, training_steps: 68999


Iteration:  25%|██▌       | 69999/275076 [1:52:02<6:15:12,  9.11it/s] 

training_loss : 0.8782, epochs: 0, training_steps: 69999


Iteration:  26%|██▌       | 70998/275076 [1:53:31<5:00:41, 11.31it/s] 

training_loss : 0.9096, epochs: 0, training_steps: 70999


Iteration:  26%|██▌       | 71999/275076 [1:55:07<6:12:57,  9.08it/s] 

training_loss : 0.8794, epochs: 0, training_steps: 71999


Iteration:  27%|██▋       | 72999/275076 [1:56:56<6:02:36,  9.29it/s] 

training_loss : 0.8892, epochs: 0, training_steps: 72999


Iteration:  27%|██▋       | 73999/275076 [1:58:38<6:03:33,  9.22it/s] 

training_loss : 0.8608, epochs: 0, training_steps: 73999


Iteration:  27%|██▋       | 74999/275076 [2:00:23<6:02:03,  9.21it/s] 

training_loss : 0.8838, epochs: 0, training_steps: 74999


Iteration:  28%|██▊       | 75999/275076 [2:02:05<5:59:42,  9.22it/s] 

training_loss : 0.9102, epochs: 0, training_steps: 75999


Iteration:  28%|██▊       | 76321/275076 [2:02:36<5:19:18, 10.37it/s] 







TypeError: not a string

# Test model

In [None]:
def mean_pooling(token_embeddings, attention_mask):
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

In [None]:
logging.info('STS dataset ')

In [None]:
next(iter(dev_dataloader))

In [None]:
len(train_dataloader)