In [219]:
!pip install transformers
!git lfs install

Updated Git hooks.
Git LFS initialized.


In [220]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
model = BertForMaskedLM.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

In [221]:
with open('training.txt', 'r') as fp:
    text = fp.read().split('\n')

In [222]:
with open('labels.txt', 'r') as lp:
    text_labels = lp.read().split('\n')

In [223]:
text_labels[:5]

['El presidente contó en la entrevista que cuando era joven, lo quisieron aprehender a su padre, diciéndole «"No fue de la nada, señor, me voy a casar con una buena mujer"». En la entrevista con la cadena BBC, habló de su familia desde muy niño.En marzo de 2009, anunció su divorcio de sus dos hermanas',
 'El presidente contó en la entrevista que cuando era joven, lo quisieron aprehender por su belleza. En su lugar, el presidente quiso a su hija que fuera una de las mejores candidatas a la Casa Blanca. No se sabe si este interés terminó y terminó siendo un tema durante la elección del presidente estadounidense, sino por el éxito de un avión',
 'El presidente contó en la entrevista que cuando era joven, lo quisieron aprehender en un café y en ese momento cuando le dice a su esposa que la quiere para tener sexo con él, ella se lo interpone, dice que no le iba a darle dinero y que ella quería lo último, él le dice que lo había prometido',
 'El presidente contó en la entrevista que cuando e

In [224]:
text[:5]

['El presidente contó en la entrevista que cuando era joven, lo quisieron [MASK] a su padre, diciéndole «"No fue de la nada, señor, me voy a casar con una buena mujer"». En la entrevista con la cadena BBC, habló de su familia desde muy niño.En marzo de 2009, anunció su divorcio de sus dos hermanas',
 'El presidente contó en la entrevista que cuando era joven, lo quisieron [MASK] por su belleza. En su lugar, el presidente quiso a su hija que fuera una de las mejores candidatas a la Casa Blanca. No se sabe si este interés terminó y terminó siendo un tema durante la elección del presidente estadounidense, sino por el éxito de un avión',
 'El presidente contó en la entrevista que cuando era joven, lo quisieron [MASK] en un café y en ese momento cuando le dice a su esposa que la quiere para tener sexo con él, ella se lo interpone, dice que no le iba a darle dinero y que ella quería lo último, él le dice que lo había prometido',
 'El presidente contó en la entrevista que cuando era joven, lo

In [225]:
# Let's increase the vocabulary of Bert model and tokenizer
new_tokens = ["aprehender", "deshecho"]
num_added_toks = tokenizer.add_tokens(new_tokens)
print('We have added', num_added_toks, 'tokens')# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))

We have added 2 tokens


Embedding(31004, 768)

In [226]:
inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [227]:
inputs.input_ids[0]

tensor([    4,  1162,  3599,  9406,  1036,  1030,  9200,  1038,  1486,  1538,
         3762,  1017,  1114, 11733,  1637,     0,  1013,  1079,  2210,  1017,
        29615,     3,  1117,  1125,  1341,  1008,  1030,  1671,  1017,  1985,
         1017,  1129,  2113,  1013, 14474,  1051,  1108,  2667,  1729,  1117,
            3,  1009,  1187,  1030,  9200,  1051,  1030,  7343, 20658,  1017,
        10470,  1008,  1079,  2268,  1668,  1456,  3330,  1009,  1187,  3008,
         1008,  3293,  1017,  8838,  1079, 13379,  1008,  1287,  1471, 11145,
            5,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

In [228]:
inputs_labels = tokenizer(text_labels, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [229]:
inputs_labels.input_ids[0]

tensor([    4,  1162,  3599,  9406,  1036,  1030,  9200,  1038,  1486,  1538,
         3762,  1017,  1114, 11733,  1637, 31002,  1013,  1079,  2210,  1017,
        29615,     3,  1117,  1125,  1341,  1008,  1030,  1671,  1017,  1985,
         1017,  1129,  2113,  1013, 14474,  1051,  1108,  2667,  1729,  1117,
            3,  1009,  1187,  1030,  9200,  1051,  1030,  7343, 20658,  1017,
        10470,  1008,  1079,  2268,  1668,  1456,  3330,  1009,  1187,  3008,
         1008,  3293,  1017,  8838,  1079, 13379,  1008,  1287,  1471, 11145,
            5,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

In [232]:
inputs

{'input_ids': tensor([[    4,  1162,  3599,  ...,     1,     1,     1],
        [    4,  1162,  3599,  ...,     1,     1,     1],
        [    4,  1162,  3599,  ...,     1,     1,     1],
        ...,
        [    4,  1162, 11476,  ...,     1,     1,     1],
        [    4,  1162, 11476,  ...,     1,     1,     1],
        [    4,  1162, 11476,  ...,     1,     1,     1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [233]:
inputs['labels'] = inputs_labels.input_ids

In [234]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [235]:
rand = torch.rand(inputs.input_ids.shape)
# create mask array

mask_arr = (rand) * (inputs.input_ids != 4) * \
           (inputs.input_ids != 5) * (inputs.input_ids == 0)

In [236]:
rand.shape

torch.Size([559, 512])

In [237]:
mask_arr = mask_arr > 0

In [238]:
mask_arr[1].nonzero()

tensor([[15]])

In [239]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [240]:
selection

[[15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11, 27],
 [11],
 [11],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [13],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [7],
 [7],
 [7],
 [7, 52],
 [7],
 [7],
 [7],
 [7],
 [7],
 [7],
 [7, 15],
 [7],
 [7],
 [7],
 [7],
 [7],
 [7],
 [7],
 [7],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [3],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [15],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [11],
 [1

In [241]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 0

In [242]:
inputs.input_ids[0]

tensor([    4,  1162,  3599,  9406,  1036,  1030,  9200,  1038,  1486,  1538,
         3762,  1017,  1114, 11733,  1637,     0,  1013,  1079,  2210,  1017,
        29615,     3,  1117,  1125,  1341,  1008,  1030,  1671,  1017,  1985,
         1017,  1129,  2113,  1013, 14474,  1051,  1108,  2667,  1729,  1117,
            3,  1009,  1187,  1030,  9200,  1051,  1030,  7343, 20658,  1017,
        10470,  1008,  1079,  2268,  1668,  1456,  3330,  1009,  1187,  3008,
         1008,  3293,  1017,  8838,  1079, 13379,  1008,  1287,  1471, 11145,
            5,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

In [243]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [244]:


dataset = MeditationsDataset(inputs)



In [245]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [246]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31004, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
           

In [247]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-5)



In [248]:
from tqdm import tqdm  # for our progress bar

epochs = 100

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 35/35 [1:30:32<00:00, 155.21s/it, loss=0.0575]
Epoch 1: 100%|██████████| 35/35 [1:30:20<00:00, 154.88s/it, loss=0.0103] 
Epoch 2: 100%|██████████| 35/35 [1:30:12<00:00, 154.64s/it, loss=0.00115]
Epoch 3: 100%|██████████| 35/35 [1:30:01<00:00, 154.32s/it, loss=0.00266] 
Epoch 4: 100%|██████████| 35/35 [1:29:57<00:00, 154.21s/it, loss=0.00235] 
Epoch 5: 100%|██████████| 35/35 [1:30:04<00:00, 154.41s/it, loss=0.000925]
Epoch 6:   6%|▌         | 2/35 [05:39<1:33:18, 169.66s/it, loss=0.000364]


KeyboardInterrupt: 

In [21]:
from huggingface_hub import notebook_login
# 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
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [205]:
!git config --global user.email "doms1369@gmail.com"
!git config --global user.name "Diego Mejia"

In [206]:

%env PATH=/Users/doms/opt/anaconda3/bin:/Users/doms/opt/anaconda3/condabin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin

env: PATH=/Users/doms/opt/anaconda3/bin:/Users/doms/opt/anaconda3/condabin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin


In [207]:
!echo $PATH 

/Users/doms/opt/anaconda3/bin:/Users/doms/opt/anaconda3/condabin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin


In [214]:
!git init
!git remote add "https://huggingface.co/Diegomejia/bert-ucb-v2"

Reinitialized existing Git repository in /Users/doms/.git/
usage: git remote add [<options>] <name> <url>

    -f, --fetch           fetch the remote branches
    --tags                import all tags and associated objects when fetching
                          or do not fetch any tag at all (--no-tags)
    -t, --track <branch>  branch(es) to track
    -m, --master <branch>
                          master branch
    --mirror[=(push|fetch)]
                          set up remote as a mirror to push to or fetch from



In [217]:
model.push_to_hub("Diegomejia/bert-ucb-v2")

Cloning https://huggingface.co/Diegomejia/bert-ucb-v1 into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/419M [00:00<?, ?B/s]

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/Diegomejia/bert-ucb-v1
   3bcb72b..07e0105  main -> main



'https://huggingface.co/Diegomejia/bert-ucb-v1/commit/07e0105209279bc20edacdb0133c222944bcd501'

In [218]:
tokenizer.push_to_hub("Diegomejia/bert-ucb-v2")

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/Diegomejia/bert-ucb-v1
   07e0105..7b098c7  main -> main



'https://huggingface.co/Diegomejia/bert-ucb-v1/commit/7b098c72af132e0a7eb51b893f1d5383246817f8'