In [3]:
# Helper Functions
import copy
from collections import OrderedDict

def updateModel(model, optimizer, freeze=False, mode="double"):
  """
  Grow a BERT-style model by stacking extra encoder layers, then build a fresh
  optimizer that covers the enlarged parameter set.

    model     : a Pytorch Model exposing its encoder blocks as `model.bert.encoder.layer`
    optimizer : the Pytorch optimizer used so far. Only its learning rate is reused
                (first param group); 5e-5 is used when it cannot be read.
    freeze    : Should the model freeze all of the previous encoder layers? Default = False
    mode      : How should layers be added?
                1. "double" (Default) --> Double the encoder layer
                2. "mean_double"      --> Double the encoder layer, but each added layers are the mean of the previous layers
                3. "mean_single"      --> Add a single layer, with it weights being the mean of the previous layers

    Returns   : (model, new_optimizer). `model` is the same object, modified in place.
    Raises    : ValueError if `mode` is not one of the three supported strings.
  """
  supported_modes = ("double", "mean_double", "mean_single")
  if mode not in supported_modes:
    # Anything else used to fall through silently (or append a whole ModuleList as a "layer").
    raise ValueError(f"mode must be one of {supported_modes}, got {mode!r}")

  encoder_layers = model.bert.encoder.layer
  previous_depth = len(encoder_layers)

  # Snapshot the current stack *before* freezing so the copies start from the trained weights.
  template = copy.deepcopy(encoder_layers)
  if mode != "double":
    # Both "mean_*" modes stack copies of one layer holding the averaged weights.
    template = single_layer(template, "mean")

  # Freeze layers that have been trained
  if freeze:
    for params in encoder_layers.parameters():
      params.requires_grad = False

  # Build the layers to add
  if mode == "double":
    new_layers = [copy.deepcopy(template[i]) for i in range(previous_depth)]
  elif mode == "mean_single":
    new_layers = [copy.deepcopy(template)]
  else:  # "mean_double"
    new_layers = [copy.deepcopy(template) for _ in range(previous_depth)]

  for new_layer in new_layers:
    # A copy of a layer frozen by an earlier call would otherwise stay frozen forever.
    for params in new_layer.parameters():
      params.requires_grad = True
    encoder_layers.append(new_layer)

  # The old optimizer does not know about the new parameters, so rebuild it,
  # keeping the learning rate the caller was training with.
  try:
    learning_rate = optimizer.param_groups[0]["lr"]
  except (AttributeError, IndexError, KeyError, TypeError):
    learning_rate = 5e-5
  new_optimizer = AdamW(model.parameters(), lr=learning_rate)
  return model, new_optimizer

def single_layer(layer, mode):
  """
  Collapse a stack of identically structured layers into one layer whose
  weights are the element-wise mean over the stack.

    layer : indexable container of modules (e.g. a ModuleList) whose state_dict
            keys have the form "<index>.<name>"; entries sharing the same
            <name> are averaged together
    mode  : aggregation rule; only "mean" is implemented

    Returns : a new module (a copy of `layer[0]`) loaded with the averaged
              weights and with every parameter trainable. `layer` itself is
              left untouched.
    Raises  : ValueError if `mode` is not "mean".
  """
  if mode != 'mean':
    raise ValueError(f"Unsupported mode {mode!r}; only 'mean' is implemented")

  stacked_state = layer.state_dict()

  # One pass: accumulate a running sum and a count per name (index prefix stripped).
  totals = OrderedDict()
  counts = OrderedDict()
  for key, tensor in stacked_state.items():
    name = '.'.join(key.split('.')[1:])
    if name in totals:
      totals[name] += tensor
      counts[name] += 1
    else:
      # clone() so the in-place additions never touch the source weights
      totals[name] = tensor.clone()
      counts[name] = 1

  averaged_state = OrderedDict(
    (name, totals[name] / counts[name]) for name in totals
  )

  # Work on a copy so the caller's first layer keeps its own weights.
  merged_layer = copy.deepcopy(layer[0])
  merged_layer.load_state_dict(averaged_state)

  for params in merged_layer.parameters():
    params.requires_grad = True

  return merged_layer


In [4]:
# Criterias
import math
import string
import syllables

def criteria_length(texts: "str | list[str]") -> "dict[str, list[int]]":
  """
  Length Criteria:
    Number of Words in a corpus
    Higher Value = Harder Corpus

  texts   : a batch (list) of strings, one score is produced per string.
            A bare string is treated as a batch of one.
  Returns : {'dif': [word count of each text]}
  """
  if isinstance(texts, str):
    # Iterating a bare string would score it character by character.
    texts = [texts]
  return {'dif': [len(text.split()) for text in texts]}
###############################################################################
def criteria_fre(texts: "str | list[str]", precision: int = 2) -> "dict[str, list[float]]":
  """
  FRE Criteria:
    Higher Value = Easier Corpus
    Speed = Slow (Cause -> syllable estimation)

  texts     : a batch (list) of strings, one score is produced per string.
              A bare string is treated as a batch of one.
  precision : number of decimals each score is rounded to
  Returns   : {'dif': [score of each text]}
  """
  if isinstance(texts, str):
    # Iterating a bare string would score it character by character.
    texts = [texts]

  scores = []
  for text in texts:
    # Sentences are approximated by '.'-separated chunks; a trailing '.'
    # produces one empty chunk that must not be counted.
    sentence_count = len(text.split('.'))
    if text.endswith('.'):
      sentence_count -= 1
    # split() ignores repeated/leading/trailing whitespace (split(' ') counted
    # the resulting empty strings as words); blank texts count as one word so
    # the ratios below stay defined.
    word_count = max(len(text.split()), 1)
    syllable_count = syllables.estimate(text)
    fre = 206.835 - 1.015*(word_count/sentence_count) - 84.6*(syllable_count/word_count)

    scores.append(round(fre, precision))

  return {'dif': scores}
###############################################################################
def criteria_fkg(texts: "str | list[str]", precision: int = 2) -> "dict[str, list[float]]":
  """
  FKG Criteria:
    Higher Value = Harder Corpus
    Speed = Slow (Cause -> syllable estimation)

  texts     : a batch (list) of strings, one score is produced per string.
              A bare string is treated as a batch of one.
  precision : number of decimals each score is rounded to
  Returns   : {'dif': [score of each text]}
  """
  if isinstance(texts, str):
    # Iterating a bare string would score it character by character.
    texts = [texts]

  scores = []
  for text in texts:
    # Sentences are approximated by '.'-separated chunks; a trailing '.'
    # produces one empty chunk that must not be counted.
    sentence_count = len(text.split('.'))
    if text.endswith('.'):
      sentence_count -= 1
    # split() ignores repeated/leading/trailing whitespace; blank texts count
    # as one word so the ratios below stay defined.
    word_count = max(len(text.split()), 1)
    # Same syllable estimator as criteria_fre (no `count_syllable` helper is
    # defined or imported in this notebook).
    syllable_count = syllables.estimate(text)
    fkg = 0.39 * (word_count/sentence_count) + 11.8 * (syllable_count/word_count) - 15.59

    scores.append(round(fkg, precision))

  return {'dif': scores}
###############################################################################
def criteria_cli(texts: "str | list[str]", precision: int = 2) -> "dict[str, list[float]]":
  """
  CLI Criteria:
    Higher Value = Harder Corpus

  texts     : a batch (list) of strings, one score is produced per string.
              A bare string is treated as a batch of one.
  precision : number of decimals each score is rounded to
  Returns   : {'dif': [score of each text]}
  """
  if isinstance(texts, str):
    # Iterating a bare string would score it character by character.
    texts = [texts]

  scores = []
  for text in texts:
    # Sentences are approximated by '.'-separated chunks; a trailing '.'
    # produces one empty chunk that must not be counted.
    sentence_count = len(text.split('.'))
    if text.endswith('.'):
      sentence_count -= 1
    # split() ignores repeated/leading/trailing whitespace; blank texts count
    # as one word so the ratios below stay defined.
    word_count = max(len(text.split()), 1)
    letter_count = sum(c.isalpha() for c in text)
    letters_per_100_words = letter_count / word_count * 100
    sentences_per_100_words = sentence_count / word_count * 100
    cli = 0.0588*letters_per_100_words - 0.296*sentences_per_100_words - 15.8

    scores.append(round(cli, precision))
  return {'dif': scores}
###############################################################################
def criteria_smog(texts: "str | list[str]", precision: int = 2) -> "dict[str, list[float]]":
  """
  SMOG Criteria:
    Higher Value = Harder Corpus
    Corpus must have at least 30 sentences

  texts     : a batch (list) of strings, one score is produced per string.
              A bare string is treated as a batch of one.
  precision : number of decimals each score is rounded to
  Returns   : {'dif': [score of each text]}
  Raises    : AssertionError if a text has fewer than 30 '.'-separated sentences
  """
  if isinstance(texts, str):
    # Iterating a bare string would score it character by character.
    texts = [texts]

  scores = []
  for text in texts:
    # Sentences are approximated by '.'-separated chunks; a trailing '.'
    # produces one empty chunk that must not be counted.
    sentence_count = len(text.split('.'))
    if text.endswith('.'):
      sentence_count -= 1

    assert sentence_count >= 30, "Can only be used if the corpus contain at least 30 sentences"

    # Per-word syllable estimates, using the same estimator as criteria_fre
    # (no `count_syllable` helper is defined or imported in this notebook).
    # A word with more than two syllables counts as a polysyllable.
    polysyllables = sum(1 for word in text.split() if syllables.estimate(word) > 2)

    smog = 1.043 * math.sqrt(polysyllables * (30 / sentence_count)) + 3.1291

    scores.append(round(smog, precision))

  return {'dif': scores}

###############################################################################
def criteria_ttr(texts: "str | list[str]", precision: int = 2) -> "dict[str, list[float]]":
  """
  TTR Criteria:
    Higher Value = Harder Corpus
    Range = [0, 1]

  texts     : a batch (list) of strings, one score is produced per string.
              A bare string is treated as a batch of one.
  precision : number of decimals each score is rounded to
  Returns   : {'dif': [unique words / total words of each text]}; a text
              without any word scores 0.0
  """
  if isinstance(texts, str):
    # Iterating a bare string would score it character by character.
    texts = [texts]

  strip_punctuation = str.maketrans('', '', string.punctuation)

  scores = []
  for text in texts:
    # split() (not split(' ')): removing punctuation leaves runs of spaces
    # behind, and the empty strings between them are not words.
    words = text.translate(strip_punctuation).lower().split()
    # max(..., 1) keeps texts without words from dividing by zero.
    ratio = len(set(words)) / max(len(words), 1)
    scores.append(round(ratio, precision))
  return {'dif': scores}

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from tqdm.auto import tqdm

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

import time


def train(model_name, output_dir, data_path,
          checkpoint, progressive_stacking, curriculum_learning,
          curriculum, curriculum_criteria,
          train_batch_size, num_epoch, accum_iter,
          update_steps, stacking_strategy):
    """
    Train a BERT masked-language model on a text file and save it. The model is
    built from the checkpoint's config only (the checkpoint's weights are not
    kept). Curriculum learning and progressive stacking are optional.

    Input:
        model_name : string = What to save the model as
        output_dir : string = Folder location to save the model
                              (the model is written to "<output_dir>/<model_name>")
        data_path  : string = File location to the dataset that should be used
        checkpoint : string = Location to a folder with a pytorch_model.bin and config.json OR
                              String with hugging-face checkpoint
        progressive_stacking : bool = Do you want to use progressive_stacking?
                                      If True the model starts with 3 encoder layers.
        curriculum_learning  : bool = Do you want to use curriculum_learning?
                                      If True the data is sorted by difficulty and NOT shuffled.
        curriculum : string = ['easy_first', 'hard_first'] (Choose one), used if curriculum_learning = True
        curriculum_criteria : function = Look above for list of criteria, choose one.
        train_batch_size : int = Size of batch_size
        num_epoch        : int = Number of passes over the dataset
        accum_iter       : int = Number of batches whose gradients are accumulated
                                 before each optimizer step (1 = no accumulation)
        update_steps     : list of float = List of percentage, tells the model when to update
                                           Used if progressive_stacking = True
                                           Example = [0.1, 0.25]
        stacking_strategy : string = One of:
                                     "double"   --> Duplicate and stack
                                     "mean_double" --> Find mean of all previous layer and stack
                                     "mean_single" --> Find mean of all previous layer, add single layer

    Returns nothing; the trained model is saved with `save_pretrained`.

    NOTE(review): the loop logs through `wandb`, which is not imported in the
    cells shown here -- it must already be imported and initialised in the kernel.
    """
    # Encoding Helpers
    def encode_with_truncation(examples, max_length = 512):
        """Mapping function to tokenize the sentences passed with truncation"""
        # NOTE(review): `max_length` is accepted but not forwarded; the tokenizer
        # decides the truncation length itself.
        return tokenizer(examples["text"], truncation=True, return_special_tokens_mask=True)

    def encode_without_truncation(examples):
        """Mapping function to tokenize the sentences passed without truncation"""
        return tokenizer(examples["text"], return_special_tokens_mask=True)


    def curriculumLearning_pipeline(data, criteria_func, curriculum = 'easy_first', verbose = False):
        """Score every text with `criteria_func`, sort by that score, drop the score column."""
        assert curriculum in ['easy_first', 'hard_first'], f"curriculum must be 'easy_first' or 'hard_first'"

        if (verbose):
            print(f'Automatically rating corpus difficulty...')
        data = data.map(lambda x : criteria_func(x['text']), batched=True)

        if (verbose):
            print(f'Sorting corpus based on difficulty with rule of ', end ='')
            if (curriculum == 'easy_first'):
                print('easy to hard')
            else:
                print('hard to easy')

        # criteria_fre is the only criterion where a higher score means easier,
        # so its sort direction is inverted relative to the others.
        if (curriculum == 'easy_first' and criteria_func.__name__ == 'criteria_fre'):
            data = data.sort('dif', reverse=True)
        elif (curriculum == 'hard_first' and criteria_func.__name__ == 'criteria_fre'):
            data = data.sort('dif')
        elif (curriculum == 'easy_first'):
            data = data.sort('dif')
        elif (curriculum == 'hard_first'):
            data = data.sort('dif', reverse=True)

        data = data.remove_columns(['dif'])
        return data

    # Load dataset
    text_dataset = load_dataset("text", data_files=data_path)['train']
    print(f"Text dataset: {text_dataset}")

    # Get Models and Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # Only the architecture is taken from the checkpoint; the model below is built from the config alone.
    model_conf = BertForMaskedLM.from_pretrained(checkpoint).config
    model = BertForMaskedLM(model_conf)

    if (progressive_stacking):
        # Set Model to 3 Encoder Layer (slice delete works for any starting depth)
        del model.bert.encoder.layer[3:]

    if (curriculum_learning):
        text_dataset = curriculumLearning_pipeline(text_dataset, curriculum_criteria, curriculum = curriculum, verbose=True)

    # Tokenize Dataset
    tokenized_dataset = text_dataset.map(encode_with_truncation, batched=True)

    # Set Devices
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Set Up Data Collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # Set up DataLoader
    # Shuffling would throw away the difficulty ordering built by the curriculum.
    shuffle = not curriculum_learning
    train_dataset = tokenized_dataset.remove_columns(['text'])
    train_dataloader = DataLoader(train_dataset, batch_size = train_batch_size, collate_fn = data_collator, shuffle=shuffle)
    print(f"Len of train_dataloader = {len(train_dataloader)}")

    # Preparing Tensors
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Set Up Constants (Insert scheduler here)
    # Every epoch ends with an optimizer step even when the last accumulation
    # window is incomplete, hence the ceiling division.
    num_batches = len(train_dataloader)
    steps_per_epoch = (num_batches + accum_iter - 1) // accum_iter
    num_training_steps = num_epoch * steps_per_epoch

    # Update steps
    if (progressive_stacking):
        update_steps = [int(e * num_training_steps) for e in update_steps]
        print(f"Model will be updated at {update_steps}")
    else:
        update_steps = [-1]

    # Main Training Loop
    print("Training: ")
    print(f"Train Total Batch Size = {accum_iter * train_batch_size}")
    print(f"Number of steps to be taken = {num_training_steps}")
    progress_bar = tqdm(range(num_training_steps))
    model.train()
    step = 0
    start_time = time.time()
    sum_elapsed_time = 0
    for epoch in range(num_epoch):
        for (i, batch) in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            # Scale so the accumulated gradient is the mean over the window, not the sum.
            (loss / accum_iter).backward()

            # Gradient Accumulation: step once `accum_iter` batches have been
            # accumulated (i is 0-based, so test i + 1), or at the end of the epoch.
            if (((i + 1) % accum_iter == 0) or (i + 1 == num_batches)):
                optimizer.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                # Log the unscaled loss of the last micro-batch as a plain float.
                wandb.log({"loss": loss.item()})
                elapsed_time = time.time() - start_time
                sum_elapsed_time += elapsed_time
                wandb.log({"time(s)" : elapsed_time})
                wandb.log({"total_time(s)" : sum_elapsed_time})
                step += 1

                if (step in update_steps):
                    print("Updating Model:")
                    print(f"Previously at {len(model.bert.encoder.layer)}")
                    # updateModel returns a new optimizer covering the added layers.
                    model, optimizer = updateModel(model, optimizer, freeze=False, mode=stacking_strategy)
                    print(f"Currently at {len(model.bert.encoder.layer)}")

                # Restart the timer after the (possibly slow) model update so it is not billed to the next step.
                start_time = time.time()

    # Keep the saved config in sync with the real depth, otherwise reloading the
    # model would build a different number of encoder layers than were trained.
    model.config.num_hidden_layers = len(model.bert.encoder.layer)

    SAVE_PATH = f"{output_dir}/{model_name}"
    model.save_pretrained(SAVE_PATH)

In [6]:
# Stacking schedules: fractions of the total number of optimizer steps at
# which the encoder gets new layers.
double_update_steps = [0.1, 0.25]
single_update_steps = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]

# Run: progressive stacking that adds one averaged layer per update, on a
# corpus ordered easy-to-hard by type-token ratio.
train(**{
    "model_name": "5p_mix_easy-first_single-mean",
    "output_dir": "model",
    "data_path": "data/en-5.txt",
    "checkpoint": "bert-base-uncased",
    "progressive_stacking": True,
    "curriculum_learning": True,
    "curriculum": "easy_first",
    "curriculum_criteria": criteria_ttr,
    "train_batch_size": 32,
    "num_epoch": 1,
    "accum_iter": 8,
    "update_steps": single_update_steps,
    "stacking_strategy": "mean_single",
})

Using custom data configuration default-b215a6c16c929d49
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-b215a6c16c929d49/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


  0%|          | 0/1 [00:00<?, ?it/s]

Text dataset: Dataset({
    features: ['text'],
    num_rows: 10306022
})


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-b215a6c16c929d49/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-36800f01a61b2d90.arrow
Loading cached sorted indices for dataset at /root/.cache/huggingface/datasets/text/default-b215a6c16c929d49/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655d

Automatically rating corpus difficulty...
Sorting corpus based on difficulty with rule of easy to hard


Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-b215a6c16c929d49/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-e14c27a178f2a397.arrow


Len of train_dataloader = 322064
Model will be updated at [2012, 4025, 6038, 8051, 10064, 12077, 14090, 16103, 18116]
Training: 
Train Total Batch Size = 256
Number of steps to be taken = 40258


  0%|          | 0/40258 [00:00<?, ?it/s]

Updating Model:
Previously at 3
Currently at 4
Updating Model:
Previously at 4
Currently at 5
Updating Model:
Previously at 5
Currently at 6
Updating Model:
Previously at 6
Currently at 7
Updating Model:
Previously at 7
Currently at 8
Updating Model:
Previously at 8
Currently at 9
Updating Model:
Previously at 9
Currently at 10
Updating Model:
Previously at 10
Currently at 11
Updating Model:
Previously at 11
Currently at 12
