In [15]:
import numpy as np
import pandas as pd
import re
from collections import OrderedDict

In [5]:
text = (
       'Hello, how are you? I am Romeo.n'
       'Hello, Romeo My name is Juliet. Nice to meet you.n'
       'Nice meet you too. How are you today?n'
       'Great. My baseball team won the competition.n'
       'Oh Congratulations, Julietn'
       'Thanks you Romeo'
   )
text

'Hello, how are you? I am Romeo.nHello, Romeo My name is Juliet. Nice to meet you.nNice meet you too. How are you today?nGreat. My baseball team won the competition.nOh Congratulations, JulietnThanks you Romeo'

In [16]:
sentences = re.sub("[.,!?-]", '', text.lower()).split('n')  # filter '.', ',', '?', '!'
print(sentences)
print()
word_list = list(OrderedDict.fromkeys(" ".join(sentences).split()))
print(word_list)

['hello how are you i am romeo', 'hello romeo my ', 'ame is juliet ', 'ice to meet you', '', 'ice meet you too how are you today', 'great my baseball team wo', ' the competitio', '', 'oh co', 'gratulatio', 's juliet', 'tha', 'ks you romeo']

['hello', 'how', 'are', 'you', 'i', 'am', 'romeo', 'my', 'ame', 'is', 'juliet', 'ice', 'to', 'meet', 'too', 'today', 'great', 'baseball', 'team', 'wo', 'the', 'competitio', 'oh', 'co', 'gratulatio', 's', 'tha', 'ks']


In [36]:
from transformers import AutoTokenizer
def token_embedding(sentences):
    # INPUT_IDS: are the indices corresponding to each token in the sentence.
    # ATTENTION_MASK: indicates whether a token should be attended to or not.
    # TOKEN_TYPE_IDS: identifies which sequence a token belongs to when there is more than one sequence.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    # print(encoded_input)
    return encoded_input

batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
token_embedding(batch_sentences)
# tokenizer.decode(encoded_input["input_ids"][0])

{'input_ids': tensor([[  101,  1252,  1184,  1164,  1248,  6462,   136,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1790,   112,   189,  1341,  1119,  3520,  1164,  1248,  6462,
           117, 21902,  1643,   119,   102],
        [  101,  1327,  1164,  5450, 23434,   136,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


'[CLS] But what about second breakfast? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [None]:
#Load the pretrained model 
bert = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def make_batch():
   batch = []
   positive = negative = 0
   while positive != batch_size/2 or negative != batch_size/2:
       tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences))

       tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]

       input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
       segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

       # MASK LM
       n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence
       cand_maked_pos = [i for i, token in enumerate(input_ids)
                         if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
       shuffle(cand_maked_pos)
       masked_tokens, masked_pos = [], []
       for pos in cand_maked_pos[:n_pred]:
           masked_pos.append(pos)
           masked_tokens.append(input_ids[pos])
           if random() < 0.8:  # 80%
               input_ids[pos] = word_dict['[MASK]'] # make mask
           elif random() < 0.5:  # 10%
               index = randint(0, vocab_size - 1) # random index in vocabulary
               input_ids[pos] = word_dict[number_dict[index]] # replace

       # Zero Paddings
       n_pad = maxlen - len(input_ids)
       input_ids.extend([0] * n_pad)
       segment_ids.extend([0] * n_pad)

       # Zero Padding (100% - 15%) tokens
       if max_pred > n_pred:
           n_pad = max_pred - n_pred
           masked_tokens.extend([0] * n_pad)
           masked_pos.extend([0] * n_pad)

       if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
           batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
           positive += 1
       elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
           batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
           negative += 1
   return batch