In [1]:
from ipywidgets import FloatProgress

In [2]:
!wandb login --relogin b1dd9c8326832dc2f1eb1b635a44f5428cd40b85

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
import wandb

wandb.init(project="lentera", entity="kata-research")

[34m[1mwandb[0m: Currently logged in as: [33mkata-research[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Helper Functions
import copy
from collections import OrderedDict

def updateModel(model, optimizer, freeze=False, mode="double"):
  """
    model     : a Pytorch Model
    optimizer : a Pytorch optimizer
    freeze    : Should the model freeze all of the previous encoder layers? Default = False
    mode      : How should layers be added?
                1. "double" (Default) --> Double the encoder layer
                2. "mean_double"      --> Double the encoder layer, but each added layers are the mean of the previous layers
                3. "mean_single"      --> Add a single layer, with it weights being the mean of the previous layers
  """

  # Copy bert encoder layers
  unfrozen = copy.deepcopy(model.bert.encoder.layer)

  # What is the mode?
  if ("mean" in mode.split('_')):
    unfrozen = single_layer(unfrozen, "mean")
    
  # Freeze layers that have been trained
  if (freeze):
    for params in model.bert.encoder.layer.parameters():
        params.requires_grad = False
  

  # Add layers of encoder to the model
  if (mode == "double"):
    for i in range(len(model.bert.encoder.layer)):
      model.bert.encoder.layer.append(unfrozen[i])
  elif ("single" in mode):
    model.bert.encoder.layer.append(unfrozen)
  elif ("double" in mode):
    for i in range(len(model.bert.encoder.layer)):
      model.bert.encoder.layer.append(unfrozen)
  
  new_optimizer = AdamW(model.parameters(), lr=5e-5)
  return model, new_optimizer

def single_layer(layer, mode):
  sdUnfrozen = unfrozen.state_dict()

  sum_layer = OrderedDict()
  done_queries = []
  final_layer = OrderedDict()
  mode = 'mean'
  for key in sdUnfrozen:
    q = '.'.join(key.split('.')[1:])
    if (q not in done_queries):
      done_queries.append(q)
      lay_num = 0
      for key in sdUnfrozen:
        if ('.'.join(key.split('.')[1:]) == q):
          if (lay_num == 0):
            sum_layer[q] = copy.deepcopy(sdUnfrozen[key])
          else:
            sum_layer[q] += sdUnfrozen[key]
          lay_num += 1
      if (mode == 'mean'):
        final_layer[q] = copy.deepcopy(sum_layer[q])/(lay_num)

  singular_unfrozen = unfrozen[0]
  singular_unfrozen.load_state_dict(final_layer)

  for params in singular_unfrozen.parameters():
    params.requires_grad = True
  
  return singular_unfrozen


In [14]:
# Encoding Helpers
def encode_with_truncation(examples, max_length = 512):
    """Mapping function to tokenize the sentences passed with truncation"""
    return tokenizer(examples["text"], truncation=True, return_special_tokens_mask=True)

def encode_without_truncation(examples):
    """Mapping function to tokenize the sentences passed without truncation"""
    return tokenizer(examples["text"], return_special_tokens_mask=True)

In [17]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from tqdm.auto import tqdm

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

# Load Dataset
DATA_PATH = "data/first100.txt"
text_dataset = load_dataset("text", data_files=DATA_PATH)['train']

# Get Models and Tokenizer
CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model_conf = BertForMaskedLM.from_pretrained(CHECKPOINT).config
model = BertForMaskedLM(model_conf)

# Set Model to 3 Encoder Layer
for i in range(11, 2, -1):
  del model.bert.encoder.layer[i]

# Tokenize Dataset
tokenized_dataset = text_dataset.map(encode_with_truncation, batched=True)

# Set Devices
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Set Up Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Set up DataLoader
train_batch_size = 1
train_dataset = tokenized_dataset.remove_columns(['text'])
train_dataloader = DataLoader(train_dataset, batch_size = train_batch_size, collate_fn = data_collator, shuffle=True)

# Preparing Tensors
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set Up Constants (Insert scheduler here)
num_epoch = 1
accum_iter = 8
num_training_steps = int(num_epoch / accum_iter * len(train_dataset))

# Set Up When To Update Model
update_steps = [
    int(num_training_steps * 0.1),
    int(num_training_steps * 0.25)
]
print(f"Model will be updated at {update_steps}")

# Main Training Loop
print("Training: ")
progress_bar = tqdm(range(num_training_steps))
step_num = 0
model.train()
# Main Training Loop
print("Training: ")
print(f"Train Total Batch Size = {accum_iter * train_batch_size}")
print(f"Number of steps to be taken = {num_training_steps}")
progress_bar = tqdm(range(num_training_steps))
step_num = 0
actual_step = 0
sum_train_loss = 0
model.train()
for epoch in range(num_epoch):
    for (i, batch) in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        sum_train_loss += loss.item()

        # Gradient Accumulation
        if (((i % accum_iter) == 0) or (i + 1 == len(train_dataloader))):
            optimizer.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            step_num += 1
            
            if (step_num in update_steps):
                print("Updating Model:")
                print(f"Previously at {len(model.bert.encoder.layer)}")
                model, optimizer = updateModel(model, optimizer, freeze=False, mode="double")
                print(f"Currently at {len(model.bert.encoder.layer)}")
            

        wandb.log({"loss": loss})
        
model_name = "progressive-test"
SAVE_PATH = f"model/{model_name}"
model.save_pretrained(SAVE_PATH)


Using custom data configuration default-1fdf1ccf73c8b82a
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-1fdf1ccf73c8b82a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-1fdf1ccf73c8b82a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-d724efc30355e89d.arrow


Model will be updated at [1, 3]
Training: 


  0%|          | 0/12 [00:00<?, ?it/s]

14it [00:45,  3.28s/it]

Training: 
Train Total Batch Size = 8
Number of steps to be taken = 12





  0%|          | 0/12 [00:00<?, ?it/s]

Updating Model:
Previously at 3
Currently at 6
Updating Model:
Previously at 6
Currently at 12
