In [2]:
cd /content/drive/MyDrive/Code_BabyLM

/content/drive/MyDrive/Code_BabyLM


In [None]:
pip install -r requirements.txt

In [3]:
from transformers import RobertaConfig

In [4]:
# Per-phase tokenizer cache. get_tokenizer() stores the tokenizer it builds for
# phase 1 / 2 / 3 in the matching variable and returns it on later calls.
# None means "not built yet".
tokenizer_p1 = tokenizer_p2 = tokenizer_p3 = None

In [20]:
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaTokenizer, AutoModelForMaskedLM

def get_tokenizer(phase):
  """Return the RobertaTokenizer for a training phase, building it on first use.

  On the first call for a phase, a byte-level BPE tokenizer is trained on the
  ten BabyLM 10M training files with a phase-specific vocabulary size
  (5334 / 15334 / 30334 for phases 1 / 2 / 3). Its files are saved under
  ``tokenizers/customTokenizers/forP<phase>``, reloaded as a
  ``RobertaTokenizer`` and saved again under
  ``tokenizers/finalTokenizers/forP<phase>``. The result is cached in the
  module-level ``tokenizer_p<phase>`` variable; later calls for the same phase
  return the cached object without retraining.

  All paths are relative to the current working directory.

  Args:
    phase: Training phase, one of 1, 2 or 3.

  Returns:
    The ``RobertaTokenizer`` for ``phase``.

  Raises:
    AssertionError: If ``phase`` is not 1, 2 or 3. Raised before any training
      or file I/O happens.
  """
  import os

  global tokenizer_p1, tokenizer_p2, tokenizer_p3

  vocab_sizes = {1: 5334, 2: 15334, 3: 30334}

  # Reject an unknown phase up front. Previously the check only ran after the
  # tokenizer had already been trained with vocab_size=None.
  if phase not in vocab_sizes:
    raise AssertionError("Phase not provided")

  cached = {1: tokenizer_p1, 2: tokenizer_p2, 3: tokenizer_p3}[phase]
  if cached is not None:
    return cached

  ########### Paths for all the training files that will be used during training ###########
  ##########################################################################################
  path = "babylm_data/babylm_10M/"
  train_files = [path+"1-aochildes.train", path+"2-qed.train", path+"3-open_subtitles.train",
           path+"4-switchboard.train", path+"5-cbt.train", path+"6-children_stories.train",
           path+"7-gutenberg.train", path+"8-simple_wikipedia.train",
           path+"9-wikipedia.train", path+"99-bnc_spoken.train"]

  tokenizer = ByteLevelBPETokenizer()
  tokenizer.train(
    files=train_files,
    vocab_size=vocab_sizes[phase],
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ]
  )

  ##### Path to save the tokenizer that will be used to create a RobertaTokenizer #####
  #####################################################################################
  custom_dir = f"tokenizers/customTokenizers/forP{str(phase)}"
  # Make sure the target directory exists so a fresh checkout can save into it.
  os.makedirs(custom_dir, exist_ok=True)
  tokenizer.save_model(custom_dir)

  ########## The switch of tokenizer due to the compatibility issues ##########
  #############################################################################
  tokenizer = RobertaTokenizer.from_pretrained(custom_dir, max_len=512)
  tokenizer.save_pretrained(f"tokenizers/finalTokenizers/forP{str(phase)}")

  # Remember the tokenizer so later calls for this phase skip the training.
  if phase == 1:
    tokenizer_p1 = tokenizer
  elif phase == 2:
    tokenizer_p2 = tokenizer
  else:
    tokenizer_p3 = tokenizer

  return tokenizer

In [6]:
def init_model_and_tokenizer(phase):
  """Create a freshly initialised masked-LM model and its tokenizer for a phase.

  The architecture grows with the phase:

    phase 1: hidden 192,  intermediate 768,  3 heads,  3 layers
    phase 2: hidden 384,  intermediate 1536, 6 heads,  6 layers
    phase 3: hidden 768,  intermediate 3072, 12 heads, 12 layers

  The vocabulary size is taken from the phase tokenizer returned by
  ``get_tokenizer``.

  Args:
    phase: Training phase, one of 1, 2 or 3.

  Returns:
    A ``(model, tokenizer)`` tuple, where ``model`` comes from
    ``AutoModelForMaskedLM.from_config``.

  Raises:
    AssertionError: If ``phase`` is not 1, 2 or 3.
  """
  # The RobertaConfig keyword is `num_attention_heads`. The previous spelling
  # `num_of_attention_heads` is not a config argument, so the requested head
  # count never reached the model.
  architectures = {
      1: dict(hidden_size=192, intermediate_size=768, num_attention_heads=3, num_hidden_layers=3),
      2: dict(hidden_size=384, intermediate_size=1536, num_attention_heads=6, num_hidden_layers=6),
      3: dict(hidden_size=768, intermediate_size=3072, num_attention_heads=12, num_hidden_layers=12),
  }

  # Validate before touching the tokenizer so a bad phase fails immediately.
  if phase not in architectures:
    raise AssertionError("Phase not provided")

  tokenizer = get_tokenizer(int(phase))
  config = RobertaConfig(vocab_size=len(tokenizer.get_vocab()), **architectures[phase])

  return (AutoModelForMaskedLM.from_config(config), tokenizer)

In [7]:
import torch

In [8]:
def merge_type1(t_from, t_to):
  """Grow a 1-D tensor: all of ``t_from`` followed by the tail of ``t_to``.

  The result starts with every element of ``t_from``; the remaining positions
  are taken from ``t_to`` starting at index ``len(t_from)``.

  Args:
    t_from: 1-D tensor whose values are kept.
    t_to: 1-D tensor supplying the values past the end of ``t_from``.

  Returns:
    A new 1-D tensor built with ``torch.cat``.

  Raises:
    AssertionError: If the tensors differ in rank or are not 1-D.
  """
  assert t_from.dim() == t_to.dim(), "Dimensions, from-to don't hold"
  assert t_from.dim() == 1, f"Expected 1D tensor, received {t_from.dim()}"
  kept = t_from.shape[-1]
  tail = t_to[kept:]
  return torch.cat([t_from, tail])

In [9]:
def merge_type2(t_from, t_to):
  """Grow a 2-D tensor in both dimensions, keeping ``t_from`` in the top-left.

  With ``t_from`` of shape ``(r, c)``, the result is::

      [ t_from        | t_to[:r, c:] ]
      [ t_to[r:, :]                  ]

  Args:
    t_from: 2-D tensor whose values are kept.
    t_to: 2-D tensor supplying every entry outside the ``t_from`` corner.

  Returns:
    A new 2-D tensor built with ``torch.cat``.

  Raises:
    AssertionError: If the tensors differ in rank or are not 2-D.
  """
  assert t_from.dim() == t_to.dim(), "Dimensions, from-to don't hold"
  # The message used to say "1D" although this check requires a 2-D tensor.
  assert t_from.dim() == 2, f"Expected 2D tensor, received {t_from.dim()}"
  rows, cols = t_from.shape
  # Widen the kept rows with the extra columns of t_to ...
  top = torch.cat((t_from, t_to[:rows, cols:]), dim=-1)
  # ... then append the rows of t_to that t_from does not cover.
  return torch.cat((top, t_to[rows:]))

In [10]:
def merge_type3(t_from, t_to):
  """Grow a 2-D tensor along its last dimension only.

  Every row of ``t_from`` is kept and extended with the columns of ``t_to``
  from index ``t_from.size(-1)`` onwards. Both tensors therefore need the same
  number of rows (``torch.cat`` fails otherwise).

  Args:
    t_from: 2-D tensor whose values are kept.
    t_to: 2-D tensor supplying the additional columns.

  Returns:
    A new 2-D tensor built with ``torch.cat``.

  Raises:
    AssertionError: If the tensors differ in rank or are not 2-D.
  """
  assert t_from.dim() == t_to.dim(), "Dimensions, from-to don't hold"
  # The message used to say "1D" although this check requires a 2-D tensor.
  assert t_from.dim() == 2, f"Expected 2D tensor, received {t_from.dim()}"
  extra_columns = t_to[:, t_from.size()[-1]:]
  return torch.cat((t_from, extra_columns), dim=-1)

In [11]:
def merge_type4(e_t_from, e_t_to, o_t_from, o_t_to, old_tokenizer, new_tokenizer, merge_out):
  """Grow the word-embedding and decoder matrices and seed the new BPE tokens.

  Both matrices are first grown with ``merge_type2``. Then, for every merge
  rule of ``new_tokenizer`` whose position is at or beyond the number of merge
  rules in ``old_tokenizer``, the row of the merged token is overwritten with
  the mean of the rows of its two parts. The decoder matrix gets the same
  treatment only when ``merge_out`` is true.

  NOTE(review): this assumes the first ``len(old_tokenizer.bpe_ranks)`` merges
  of the new tokenizer are the old tokenizer's merges - confirm.

  Args:
    e_t_from: 2-D word-embedding weights of the old model.
    e_t_to: 2-D word-embedding weights of the new model.
    o_t_from: Decoder weights of the old model.
    o_t_to: Decoder weights of the new model.
    old_tokenizer: Tokenizer exposing ``bpe_ranks`` for the old model.
    new_tokenizer: Tokenizer exposing ``bpe_ranks`` and
      ``convert_tokens_to_ids`` for the new model.
    merge_out: Whether to also seed the decoder rows of the new tokens.

  Returns:
    A ``(embedding_weights, decoder_weights)`` tuple of new tensors.

  Raises:
    AssertionError: If the embedding tensors differ in rank or are not 2-D.
  """
  assert e_t_from.dim() == e_t_to.dim(), "Dimensions, from-to don't hold"
  # The message used to say "1D" although this check requires a 2-D tensor.
  assert e_t_from.dim() == 2, f"Expected 2D tensor, received {e_t_from.dim()}"
  e_to_ret = merge_type2(e_t_from, e_t_to)
  o_to_ret = merge_type2(o_t_from, o_t_to)

  new_merges = list(new_tokenizer.bpe_ranks.items())
  first_new_merge = len(old_tokenizer.bpe_ranks)

  # Only the merge rules the old tokenizer did not have produce new tokens.
  for (t1, t2), _rank in new_merges[first_new_merge:]:
    i1, i2 = new_tokenizer.convert_tokens_to_ids([t1, t2])
    token_id = new_tokenizer.convert_tokens_to_ids(t1 + t2)

    # Rows are updated in merge order, so a part that is itself a new token
    # has already been seeded by the time it is averaged here.
    e_to_ret[token_id] = torch.mean(torch.stack([e_to_ret[i1], e_to_ret[i2]]), dim=0)
    if merge_out:
      o_to_ret[token_id] = torch.mean(torch.stack([o_to_ret[i1], o_to_ret[i2]]), dim=0)

  return (e_to_ret, o_to_ret)

In [12]:
from torch.nn.parameter import Parameter
def prepare_embedding_and_out_layer(old_model, new_model, old_tokenizer, new_tokenizer, merge_embed, merge_out):
  """Carry the embedding and LM-head weights of ``old_model`` into ``new_model``.

  ``new_model`` is modified in place: each affected ``.weight`` is replaced by
  a new ``Parameter`` that holds the old values and keeps the new model's own
  values for the part the old tensor does not cover. Only ``.weight`` tensors
  are touched here.

  NOTE(review): word embeddings and decoder receive two separate Parameters;
  if the model ties these weights, the tie may no longer hold afterwards -
  confirm.

  Args:
    old_model: Trained model of the previous phase.
    new_model: Larger model of the next phase; updated in place.
    old_tokenizer: Tokenizer of the previous phase (used when ``merge_embed``).
    new_tokenizer: Tokenizer of the next phase (used when ``merge_embed``).
    merge_embed: If true, build word-embedding/decoder weights with
      ``merge_type4``; otherwise grow both with ``merge_type2``.
    merge_out: Forwarded to ``merge_type4``.

  Returns:
    ``new_model``.
  """
  src_embed = old_model.roberta.embeddings
  dst_embed = new_model.roberta.embeddings
  src_head = old_model.lm_head
  dst_head = new_model.lm_head

  # --- embedding block -------------------------------------------------------
  dst_embed.position_embeddings.weight = Parameter(merge_type3(
      src_embed.position_embeddings.weight.data,
      dst_embed.position_embeddings.weight.data))
  dst_embed.token_type_embeddings.weight = Parameter(merge_type3(
      src_embed.token_type_embeddings.weight.data,
      dst_embed.token_type_embeddings.weight.data))
  dst_embed.LayerNorm.weight = Parameter(merge_type1(
      src_embed.LayerNorm.weight.data,
      dst_embed.LayerNorm.weight.data))

  # --- LM head ---------------------------------------------------------------
  dst_head.dense.weight = Parameter(merge_type2(
      src_head.dense.weight.data,
      dst_head.dense.weight.data))
  dst_head.layer_norm.weight = Parameter(merge_type1(
      src_head.layer_norm.weight.data,
      dst_head.layer_norm.weight.data))

  # --- vocabulary-sized matrices ---------------------------------------------
  src_words = src_embed.word_embeddings.weight.data
  dst_words = dst_embed.word_embeddings.weight.data
  src_decoder = src_head.decoder.weight.data
  dst_decoder = dst_head.decoder.weight.data

  if not merge_embed:
    word_weights = merge_type2(src_words, dst_words)
    decoder_weights = merge_type2(src_decoder, dst_decoder)
  else:
    word_weights, decoder_weights = merge_type4(
        src_words, dst_words,
        src_decoder, dst_decoder,
        old_tokenizer, new_tokenizer,
        merge_out)

  dst_embed.word_embeddings.weight = Parameter(word_weights)
  dst_head.decoder.weight = Parameter(decoder_weights)

  return new_model


In [13]:
def prepare_encoder_layer(old_model, new_model):
  """Carry the encoder weights of ``old_model`` into ``new_model``.

  For every encoder layer of ``old_model``, the layer at the same index in
  ``new_model`` gets new weight Parameters: the linear projections (query, key,
  value, attention output, intermediate, output) are grown with
  ``merge_type2`` and the two LayerNorm weights with ``merge_type1``. Only
  ``.weight`` tensors are touched. ``new_model`` is modified in place.

  Args:
    old_model: Trained model of the previous phase.
    new_model: Larger model of the next phase; updated in place.

  Returns:
    ``new_model``.
  """
  new_layers = new_model.roberta.encoder.layer

  def _carry_over(old_module, new_module, merge_fn):
    # Replace new_module.weight with the merge of the old and new weights.
    new_module.weight = Parameter(merge_fn(old_module.weight.data, new_module.weight.data))

  for idx, old_layer in enumerate(old_model.roberta.encoder.layer):
    new_layer = new_layers[idx]

    # self-attention projections
    _carry_over(old_layer.attention.self.query, new_layer.attention.self.query, merge_type2)
    _carry_over(old_layer.attention.self.key, new_layer.attention.self.key, merge_type2)
    _carry_over(old_layer.attention.self.value, new_layer.attention.self.value, merge_type2)
    # attention output
    _carry_over(old_layer.attention.output.dense, new_layer.attention.output.dense, merge_type2)
    _carry_over(old_layer.attention.output.LayerNorm, new_layer.attention.output.LayerNorm, merge_type1)
    # feed-forward
    _carry_over(old_layer.intermediate.dense, new_layer.intermediate.dense, merge_type2)
    _carry_over(old_layer.output.dense, new_layer.output.dense, merge_type2)
    _carry_over(old_layer.output.LayerNorm, new_layer.output.LayerNorm, merge_type1)

  return new_model



In [14]:
def prepare_model_for_new_phase(old_model, new_model, old_tokenizer, new_tokenizer, merge_embed, merge_out):
  """Initialise ``new_model`` from ``old_model`` before the next training phase.

  Both models are moved to the CPU, then the embedding / LM-head weights and
  the encoder weights of ``old_model`` are merged into ``new_model``.

  Args:
    old_model: Trained model of the previous phase.
    new_model: Model of the next phase; updated in place.
    old_tokenizer: Tokenizer of the previous phase.
    new_tokenizer: Tokenizer of the next phase.
    merge_embed: Forwarded to ``prepare_embedding_and_out_layer``.
    merge_out: Forwarded to ``prepare_embedding_and_out_layer``.

  Returns:
    The model returned by ``prepare_encoder_layer``.
  """
  for network in (old_model, new_model):
    network.to(torch.device('cpu'))

  with_embeddings = prepare_embedding_and_out_layer(
      old_model, new_model, old_tokenizer, new_tokenizer, merge_embed, merge_out)
  return prepare_encoder_layer(old_model, with_embeddings)

In [15]:
def preprocess_file(file_path, tokenizer, max_length = 506):
  """Write a copy of a text file in which over-long lines are split up.

  Blank lines are dropped. A line with at most ``max_length`` tokens
  (according to ``tokenizer.tokenize``) is copied unchanged. A longer line is
  split on single spaces into consecutive word chunks, each written on its own
  line. Every word of the original line ends up in exactly one chunk. Chunks
  that are empty after stripping are not written.

  The output is written next to the input as ``file_path + "v2"``.

  Args:
    file_path: Path of the UTF-8 text file to read.
    tokenizer: Object with a ``tokenize(str)`` method returning a sequence.
    max_length: Token count above which a line is split.

  Returns:
    The path of the written file (``file_path + "v2"``).
  """
  output_file_path = file_path + "v2"
  # Guard the divisor below against a non-positive max_length.
  tokens_per_piece = max(1, max_length)

  # Context managers close both files even if tokenisation raises.
  with open(file_path, 'r', encoding='utf-8') as source_file:
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
      for line in source_file:
        if line.strip() == "":
          continue

        token_count = len(tokenizer.tokenize(line))
        if token_count <= max_length:
          output_file.write(line)
          continue

        # Aim for a couple more pieces than strictly needed, so the pieces
        # stay below the limit even when tokens are unevenly spread.
        split_num = (token_count // tokens_per_piece) + 2
        split_line = line.split(" ")
        # At least one word per piece; a zero step used to discard the line.
        split_len = max(1, len(split_line) // split_num)

        # Walk the whole word list so no trailing words are dropped.
        for start in range(0, len(split_line), split_len):
          subline = " ".join(split_line[start:start + split_len]).strip()
          if subline:
            output_file.write(subline + '\n')

  return output_file_path



In [17]:
from transformers import RobertaForMaskedLM, AdamW, get_linear_schedule_with_warmup, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import torch

def train_model(model, tokenizer, file_paths, batch_size, num_of_epochs, save_path=None, max_length=512, accumulation_steps=4):
  """Train ``model`` with masked-language modelling on a list of text files.

  Each file is first rewritten by ``preprocess_file`` and tokenised once
  (truncated / padded to ``max_length``). Every epoch then walks the files in
  the given order, without shuffling, accumulating gradients over
  ``accumulation_steps`` batches per optimizer update. A single linear
  warmup/decay schedule (10% warmup) spans all updates of the whole run. The
  model is saved to ``save_path`` after every epoch.

  Args:
    model: Masked-LM model; moved to CUDA when available and left in train mode.
    tokenizer: Tokenizer used for preprocessing, encoding and the MLM collator.
    file_paths: Paths of the raw training text files.
    batch_size: Number of examples per batch.
    num_of_epochs: Number of passes over all files.
    save_path: Directory for ``model.save_pretrained``; ``None`` disables saving.
    max_length: Sequence length used for truncation and padding.
    accumulation_steps: Number of batches accumulated per optimizer update.

  Returns:
    None. ``model`` is trained in place.
  """
  new_file_paths = [preprocess_file(file_path, tokenizer) for file_path in file_paths]

  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
  model.to(device)
  model.train()
  eos = tokenizer.eos_token_id

  def encode_examples(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length, return_attention_mask=True)
    labels = np.copy(tokenized['input_ids'])
    mask = tokenized['attention_mask']
    # A sequence that fills max_length ends in EOS: hide that last position
    # from attention. `mask` is the same object as tokenized['attention_mask'],
    # so the change is visible in the returned attention mask.
    if labels[-1] == eos:
      print("entered eos")
      labels[-1] = tokenizer.pad_token_id
      labels[-2] = tokenizer.eos_token_id
      mask[-1] = 0

    # Compare as an array: with a plain list, `mask == 0` is just False and
    # the assignment silently changed nothing.
    # NOTE(review): the MLM data collator appears to regenerate `labels` when
    # it masks tokens, so these values may be overwritten - confirm.
    labels[np.asarray(mask) == 0] = -100
    return {"input_ids": tokenized['input_ids'], "labels": labels, "attention_mask": tokenized['attention_mask']}

  optimizer = AdamW(model.parameters(), lr=5e-5)
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

  # Tokenise every file once, instead of once per epoch, and keep the loaders.
  dataloaders = []
  for file_path in new_file_paths:
    dataset = load_dataset('text', data_files=file_path)
    tokenized_dataset = dataset.map(encode_examples)
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels', 'attention_mask'])
    dataloader = DataLoader(tokenized_dataset['train'], batch_size=batch_size, shuffle=False, collate_fn=data_collator)
    dataloaders.append((file_path, dataloader))

  # One schedule for the whole run. It used to be rebuilt for every file in
  # every epoch, which restarted the warmup each time and never let the decay
  # finish. Updates per pass are rounded up because the last, possibly
  # partial, accumulation window also triggers an update.
  updates_per_epoch = sum(
      (len(dataloader) + accumulation_steps - 1) // accumulation_steps
      for _, dataloader in dataloaders)
  total_updates = updates_per_epoch * num_of_epochs
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=int(0.1 * total_updates),
      num_training_steps=total_updates)

  for epoch in range(num_of_epochs):
    print(f"In epoch: {epoch+1}")

    for file_path, dataloader in dataloaders:
      print(f"Training on file: {file_path}")

      # tqdm advances on its own while being iterated; no manual update().
      progress_bar = tqdm(dataloader, position=0, leave=True)
      num_batches = len(dataloader)

      optimizer.zero_grad()

      total_loss = 0.0
      seen_batches = 0

      for idx, batch in enumerate(progress_bar):
        inputs = {key: val.to(device) for key, val in batch.items()}

        outputs = model(**inputs)

        # Scale so the accumulated gradient is an average over the window.
        loss = outputs.loss / accumulation_steps
        loss.backward()

        total_loss += loss.item() * accumulation_steps
        seen_batches += 1

        # Update on a full window, and on the last batch so the gradients of
        # a trailing partial window are not thrown away.
        if (idx + 1) % accumulation_steps == 0 or (idx + 1) == num_batches:
          optimizer.step()
          scheduler.step()
          optimizer.zero_grad()

        progress_bar.set_description(f"Loss: {total_loss / seen_batches}")

    # Checkpoint after every epoch.
    if save_path is not None:
      model.save_pretrained(save_path)



In [18]:
# Training files per phase; each phase trains on its own slice of the BabyLM
# 10M corpus. Paths are relative to the notebook's working directory.
phase1_data = [
    "babylm_data/babylm_10M/1-aochildes.train",
    "babylm_data/babylm_10M/2-qed.train",
]
phase2_data = [
    "babylm_data/babylm_10M/3-open_subtitles.train",
    "babylm_data/babylm_10M/4-switchboard.train",
    "babylm_data/babylm_10M/5-cbt.train",
]
phase3_data = [
    "babylm_data/babylm_10M/6-children_stories.train",
    "babylm_data/babylm_10M/7-gutenberg.train",
    "babylm_data/babylm_10M/8-simple_wikipedia.train",
    "babylm_data/babylm_10M/9-wikipedia.train",
    "babylm_data/babylm_10M/99-bnc_spoken.train",
]
# All ten files, in phase order.
all_data = [*phase1_data, *phase2_data, *phase3_data]

In [None]:
# Phase 1: smallest model, trained from scratch on the phase-1 files.
m1, t1 = init_model_and_tokenizer(1)
train_model(
    m1,
    t1,
    phase1_data,
    batch_size=32,
    num_of_epochs=10,
    save_path="/content/drive/MyDrive/Code_BabyLM/models/modelP1",  # where the first model is saved
)

In [None]:
# Phase 2: larger model, initialised in place from the phase-1 model, then
# trained on the phase-2 files.
m2, t2 = init_model_and_tokenizer(2)
prepare_model_for_new_phase(m1, m2, t1, t2, merge_embed=True, merge_out=True)
train_model(
    m2,
    t2,
    phase2_data,
    batch_size=64,
    num_of_epochs=5,
    save_path="/content/drive/MyDrive/Code_BabyLM/models/modelP2",  # where the second model is saved
)

In [None]:
# Phase 3: full-size model, initialised in place from the phase-2 model, then
# trained on the phase-3 files.
m3, t3 = init_model_and_tokenizer(3)
prepare_model_for_new_phase(m2, m3, t2, t3, merge_embed=True, merge_out=True)
train_model(
    m3,
    t3,
    phase3_data,
    batch_size=32,
    num_of_epochs=3,
    save_path="/content/drive/MyDrive/Code_BabyLM/models/modelP3",  # where the third model is saved
)