# Difficulty Criteria

In [1]:
# !python -m spacy download en_core_web_sm

In [2]:
# Helper Functions
import spacy
from spacy_syllables import SpacySyllables
def count_syllable(text: str, ret: str = 'sum'):
  nlp = spacy.load('en_core_web_sm')
  syllables = SpacySyllables(nlp)
  nlp.add_pipe('syllables', after='tagger')
  doc = nlp(text)

  data = [
      (token.text, token._.syllables, token._.syllables_count)
      for token in doc
  ]
  count_data = [data[i][2] for i in range(len(data)) if data[i][2] != None]
  if (ret == 'sum'):
    return sum(count_data)
  else: # ret == 'data'
    return count_data

In [3]:
# Warning!
# count_syllable function used to count the amount of syllable in a corpus
# is not 100% accurate. It is generally good enough, but its error can be
# extremely detrimental to corpus difficulty testing on small sentences

# sentence['dif'] = len(sentence['text'])
# return sentence
import math
import string
def criteria_length(texts: str) -> float:
  """
  Length Criteria:
    Number of Words in a corpus
    Higher Value = Harder Corpus
  """
  ret = {}
  ret['dif'] = []
  for text in texts: 
    ret['dif'].append(len(text.split()))
  return ret
###############################################################################
def criteria_fre(texts: str, precision: int = 2) -> float:
  """
  FRE Criteria:
    Higher Value = Easier Corpus
    Speed = Slow (Cause -> count_syllable)
  """
  ret = {}
  ret['dif'] = []
  for text in texts: 
    sentence_count = len(text.split('.'))
    if (text.endswith('.')):
      sentence_count -= 1
    word_count = len(text.split(' '))
    syllable_count = count_syllable(text)
    fre = 206.835 - 1.015*(word_count/sentence_count) - 84.6*(syllable_count/word_count)

    ret['dif'].append(round(fre, precision))

  return ret
###############################################################################
def criteria_fkg(texts: str, precision: int = 2) -> float:
  """
  FKG Criteria:
    Higher Value = Harder Corpus
    Speed = Slow (Cause -> count_syllable)
  """
  ret = {}
  ret['dif'] = []
  for text in texts: 
    sentence_count = len(text.split('.'))
    if (text.endswith('.')):
      sentence_count -= 1
    word_count = len(text.split(' '))
    syllable_count = count_syllable(text)
    fkg = 0.39 * (word_count/sentence_count) + 11.8 * (syllable_count/word_count) - 15.59

    ret['dif'].append(round(fkg, precision))

  return ret 
###############################################################################
def criteria_cli(texts: str, precision: int = 2) -> float:
  """
  CLI Criteria:
    Higher Value = Harder Corpus
  """
  ret = {}
  ret['dif'] = []
  for text in texts: 
    sentence_count = len(text.split('.'))
    if (text.endswith('.')):
      sentence_count -= 1
    word_count = len(text.split(' '))
    letters = sum(c.isalpha() for c in text)
    L = letters / word_count * 100
    S = sentence_count / word_count * 100
    cli = 0.0588*L - 0.296*S - 15.8

    ret['dif'].append(round(cli, precision))
  return ret
###############################################################################
def criteria_smog(texts: str, precision: int = 2) -> float:
  """
  SMOG Criteria:
    Higher Value = Harder Corpus
    Corpus must have at least 30 sentences
  """

  ret = {}
  ret['dif'] = []
  for text in texts: 
    sentence_count = len(text.split('.'))
    if (text.endswith('.')):
      sentence_count -= 1
    
    assert sentence_count >= 30, f"Can only be used if the corpus contain at least 30 sentences"
    
    polysyllables = count_syllable(text, ret='data')
    polysyllables = [1 if e > 2 else 0 for e in polysyllables]
    polysyllables = sum(polysyllables)

    smog = 1.043 * math.sqrt(polysyllables * (30 / sentence_count)) + 3.1291

    ret['dif'].append(round(smog, precision))

  return ret
###############################################################################
def criteria_ttr(texts: str, precision: int = 2) -> float:
  """
  TTR Criteria:
    Higher Value = Harder Corpus
    Range = [0, 1]
  """
  ret = {}
  ret['dif'] = []
  for text in texts:  
    words = text.translate(str.maketrans('', '', string.punctuation))
    words = words.lower()
    words = words.split(' ')
    unique_words = set(words)
    ret['dif'].append(round(len(unique_words)/len(words), precision)) 
  return ret

# Pipeline Version

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
import torch
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

import os
import html
import json
import torch.nn.functional as F

def singleGpu_training(model_name, output_dir, data_path, curriculum, model_checkpoint, tokenizer_checkpoint, truncation,
                       truncation_len, mlm_probability, num_epochs, train_batch_size, do_eval, test_size, 
                       eval_batch_size, criteria_func, log_strategy, log_steps, log_epochs):
    """
    Usage:
        Train MLM Models single GPU
    Input:
        model_name (string) = Model Identifier
        output_dir (string) = FOLDER to save model at each checkpoint (log_steps mark) and other information
        data_path (string)  = Where is the data located?
        curriculum (string) = "no_cl" -> No Curriculum, "easy_first" -> Curriculum Learning, "hard_first" -> Anti Curriculum
                               no_cl also Shuffles the dataset
        model_checkpoint (string) = What model to use? Path to model if Local, Huggingface model name otherwise
        tokenizer_checkpoint (string) = What tokenizer to use? Path to tokenizer if Local, Huggingface tokenizer name otherwise
        truncation (boolean) = Truncate tokenized input to truncation_len if True
        truncation_len (integer) = Ignore if truncation = False.
        mlm_probability (float) = range of [0, 1], used for DataCollatorForLanguageModeling
        num_epochs (integer) = How many epoch the model should do
        train_batch_size (integer) = batch size for training
        do_eval (boolean) = Do evaluation if True
        test_size (float) = range of [0, 1], used for splitting dataset
        eval_batch_size (integer) = batch size for evaluation. Ignored if do_eval is False
        criteria_func (function) = function used to check if dataset is difficult or not
        log_strategy (string) = "steps" or "epochs"
        log_steps (integer) = How many steps taken before logging the information. Ignored if log_strategy = "epochs"
        log_epochs (integer) = How many epochs taken before logging the information. Ignored if log_strategy = "steps" 
    Output:
        Models and Important data at output_dir
    """
    # Data Cleaning Helpers
    def clean_xa0(text):
        ret = {}
        ret['text'] = []
        for x in text:
            ret['text'].append(x.replace(u'\xa0', u' '))
        return ret

    def clean_slashApos(text):
        ret = {}
        ret['text'] = []
        for x in text:
            ret['text'].append(x.replace(u"\'", u""))
        return ret

    def clean_u200e(text):
        ret = {}
        ret['text'] = []
        for x in text:
            ret['text'].append(x.replace(u"\u200e", u""))
        return ret

    def get_len(text):
        ret = {}
        ret['len'] = []
        for x in text:
            ret['len'].append(len(x.split()))
        return ret

    # Encoding Helpers
    def encode_with_truncation(examples, max_length = 512):
        """Mapping function to tokenize the sentences passed with truncation"""
        return tokenizer(examples["text"], truncation=True, return_special_tokens_mask=True)

    def encode_without_truncation(examples):
        """Mapping function to tokenize the sentences passed without truncation"""
        return tokenizer(examples["text"], return_special_tokens_mask=True)

    # Sub Pipelines
    cleaner_functions = [clean_xa0, clean_slashApos, clean_u200e]
    def dataCleaning_pipeline(data, cleaner_functions : list = cleaner_functions, len_cutoff : int = 2, verbose: bool = False):
        for cleaner in cleaner_functions:
            if (verbose):
                print(f'Cleaning using function {cleaner.__name__}')
            data = data.map(lambda x : cleaner(x['text']), batched=True)

        if (verbose):
            print(f'Cleaning escapes ("\\")')
        data = data.filter(lambda x : '\\' not in x['text'])

        if (verbose):
            print(f'Removing entry with less than {len_cutoff} word(s)')
        data = data.map(lambda x : get_len(x['text']), batched=True)
        data = data.filter(lambda x : x['len'] > len_cutoff)
        data = data.remove_columns(['len'])

        if (verbose):
            print(f'Unescaping html symbols')
        data = data.map(lambda x: {'text' : html.unescape(x['text'])}, batched=True)

        return data

    def curriculumLearning_pipeline(data, criteria_func, curriculum = 'easy_first', verbose = False):
        assert curriculum in ['easy_first', 'hard_first'], f"curriculum must be 'easy_first' or 'hard_first'"

        if (verbose):
            print(f'Automatically rating corpus difficulty...')
        data = data.map(lambda x : criteria_func(x['text']), batched=True)

        if (verbose):
            print(f'Sorting corpus based on difficulty with rule of ', end ='')
            if (curriculum == 'easy_first'):
                print('easy to hard')
            else:
                print('hard to easy')

        if (curriculum == 'easy_first' and criteria_func.__name__ == 'criteria_fre'):
            data = data.sort('dif', reverse=True)
        elif (curriculum == 'hard_first' and criteria_func.__name__ == 'criteria_fre'):
            data = data.sort('dif')
        elif (curriculum == 'easy_first'):
            data = data.sort('dif')
        elif (curriculum == 'hard_first'):
            data = data.sort('dif', reverse=True)

        data = data.remove_columns(['dif'])
        return data

    # Main Function
    
    # Create Dictionary for Logging
    trainer_state = {}
    trainer_state['model_name']  = model_name
    trainer_state['log_history'] = []

    # Load Dataset + Splitting
    text_dataset = load_dataset("text", data_files=data_path)
    data = text_dataset['train'].filter(lambda x: x['text'] != '')
    
    if (do_eval):
        data = data.train_test_split(test_size = test_size, seed = 0)
    
    # Automated Cleaning + Curriculum Learning Set Up
    data = dataCleaning_pipeline(data, len_cutoff = 5, verbose=True)
    assert curriculum in ['no_cl', 'easy_first', 'hard_first'], f"curriculum must be 'no_cl' or easy_first' or 'hard_first'"
    if (curriculum != 'no_cl'):
        data = curriculumLearning_pipeline(data, criteria_func, curriculum = curriculum, verbose=True)
    
    # Get Models and Tokenizer
    checkpoint = 'bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
    model_conf = BertForMaskedLM.from_pretrained(model_checkpoint).config
    model = BertForMaskedLM(model_conf)

    # Tokenize Dataset
    tokenized_dataset = data.map(encode_with_truncation, batched=True)

    # Create Folder Path if doesn't exist
    if (os.path.isdir(output_dir) == False):
        os_path = output_dir
        os.makedirs(output_dir)
    
    # Set Devices
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Set Up Data Collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=mlm_probability
    )

    if (curriculum == "no_cl"):
        shuffle=True
    else:
        shuffle=False
    if (do_eval):
        train_dataset = tokenized_dataset['train']
        train_dataset = train_dataset.remove_columns(['text'])
        train_dataloader = DataLoader(train_dataset, batch_size = train_batch_size, collate_fn = data_collator, shuffle=shuffle)

        eval_dataset = tokenized_dataset['test']
        eval_dataset = eval_dataset.remove_columns(['text'])
        eval_dataloader = DataLoader(eval_dataset, batch_size = eval_batch_size, collate_fn = data_collator, shuffle=shuffle)
    else:
        train_dataset = tokenized_dataset.remove_columns(['text'])
        train_dataloader = DataLoader(train_dataset, batch_size = train_batch_size, collate_fn = data_collator, shuffle=shuffle)

    # Preparing Tensors
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Set Up Scheduler
    num_epoch = 1
    num_training_steps = num_epoch * len(train_dataset)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    # Main Training Loop
    print("Training: ")
    progress_bar = tqdm(range(len(train_dataloader)))
    step_num = 0
    sum_train_loss = 0
    model.train()
    for epoch in range(num_epoch):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            sum_train_loss += loss.item()

            if (log_strategy == "steps"):
                step_num = step_num + 1
                if (step_num % log_steps == 0):
                    
                    #Location to Save Files
                    log_path = os.path.join(output_dir, model_name)
                    log_path = os.path.join(log_path, f"{model_name}-checkpoint{step_num}")
                    if (os.path.isdir(log_path) == False):
                        os.makedirs(log_path)
                    
                    # Create Log Dictionary
                    log_history = {}
                    log_history['epoch'] = step_num/len(train_dataloader)
                    log_history['loss'] = sum_train_loss/log_steps
                    log_history['steps'] = step_num
                    trainer_state['log_history'].append(log_history)
                    
                    # Save Model at Checkpoint
                    model.save_pretrained(log_path)
                    
                    # Save Training Data
                    with open(f"{log_path}/trainer_state.json", "w") as outfile:
                        json.dump(trainer_state, outfile, indent=4)
                    
                    sum_train_loss = 0
            elif (log_strategy == "epochs"):
                step_num = step_num + 1
                if (step_num % (num_training_steps * log_epochs) == 0):
                    
                    #Location to Save Files
                    log_path = os.path.join(output_dir, model_name)
                    log_path = os.path.join(log_path, f"{model_name}-checkpoint{step_num}")
                    if (os.path.isdir(log_path) == False):
                        os.makedirs(log_path)
                    
                    # Create Log Dictionary
                    log_history = {}
                    log_history['epoch'] = step_num/len(train_dataloader)
                    log_history['loss'] = sum_train_loss/log_steps
                    log_history['steps'] = step_num
                    trainer_state['log_history'].append(log_history)
                    
                    # Save Model at Checkpoint
                    model.save_pretrained(log_path)
                    
                    # Save Training Data
                    with open(f"{log_path}/trainer_state.json", "w") as outfile:
                        json.dump(trainer_state, outfile, indent=4)
                    
                    sum_train_loss = 0

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    if (do_eval):
        print("Evaluating: ")
        eval_sum_loss = 0
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
                loss = outputs.loss
                eval_sum_loss += loss.item()
        print(eval_sum_loss/len(eval_dataloader))

In [6]:
from ipywidgets import FloatProgress
singleGpu_training("bert-base-uncased-1p-criteria_cli", "models", "data/en-1.txt", "easy_first", "bert-base-uncased", "bert-base-uncased", True,
                       512, 0.15, 1, 32, True, 0.2, 
                       32, criteria_cli, "steps", 5300, -1)

Using custom data configuration default-441aa952fcf4cd9a
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-441aa952fcf4cd9a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-441aa952fcf4cd9a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-81f2b73037851983.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/text/default-441aa952fcf4cd9a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-7539a87f8009539a.arrow and /root/.cache/huggingface/datasets/text/default-441aa952fcf4cd9a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-0c952285a724c80c.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-441aa952fcf4cd9a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-dca43d01b39f4788.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-441aa952fcf4cd9a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-740a4692d47f7a22.arrow
Loading cached processed

Cleaning using function clean_xa0
Cleaning using function clean_slashApos
Cleaning using function clean_u200e
Cleaning escapes ("\")
Removing entry with less than 5 word(s)


Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-441aa952fcf4cd9a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-d9467c2b4b9e9cf8.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-441aa952fcf4cd9a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-0da440006ee90e05.arrow


Unescaping html symbols
Automatically rating corpus difficulty...


  0%|          | 0/1707 [00:00<?, ?ba/s]

  0%|          | 0/427 [00:00<?, ?ba/s]

Sorting corpus based on difficulty with rule of easy to hard


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1707 [00:00<?, ?ba/s]

  0%|          | 0/427 [00:00<?, ?ba/s]

Training: 


  0%|          | 0/53313 [00:00<?, ?it/s]

Evaluating: 
6.608452570145013


In [None]:
torch.cuda.empty_cache()