# Causal Language Modeling Task
A series of experiments demonstrating causal language modeling and training performance differences between two pretrained Reformer models. Models, datasets, and examples sourced from Huggingface.


---


## Test Models
**Reformer**
* 6-layer
* 256-hidden
* 2-heads
* 3M parameters
* Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.

**Reformer**
* 12-layer
* 1024-hidden
* 8-heads  
* 149M parameters
* Trained on English Wikipedia data - enwik8.








In [1]:
!pip install datasets transformers sentencepiece

Collecting datasets
  Downloading datasets-1.12.1-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 4.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 51.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 60.0 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 73.7 MB/s 
[?25hCollecting huggingface-hub<0.1.0,>=0.0.14
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 2.1 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 54.1 MB/s 
Collecting fsspec[http]>=2021.05.0

In [2]:
# Imports
import math, random, torch
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

In [3]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

Using cuda device


In [10]:
def tokenize_function_ptb(examples):
    return tokenizer(examples["sentence"])

def tokenize_function_wt2_enwik8(examples):
    return tokenizer(examples["text"])
    
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

def prep_enwik8(path):
    file = path + 'enwik8'
    
    # Read file
    with open(file) as f:
        lines = f.readlines()
        
    random.shuffle(lines)

    # Calculate splits: 80/10/10 - train/val/test
    train_split = math.floor(len(lines)*.8)
    test_val_split = math.floor(len(lines)*.1)
    
    with open(path+'enwik8_train.txt', 'w') as train:
        with open(path+'enwik8_validation.txt', 'w') as val:
            with open(path+'enwik8_test.txt', 'w') as test:
                for i, line in enumerate(lines):
                    if i < train_split:
                        train.write(line)
                    elif i < train_split + test_val_split:
                        val.write(line)
                    else:
                        test.write(line)

In [9]:
# Hyperparameters
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.01
NUM_EPOCHS = 30
BATCH_SIZE = 16
block_size = 2048
PUSH_HUB = False
AXIAL_POS = False

# Dataset selection
DATASET_SELECT = 2  # 0 = wikitext-2, 1 = penn treebank, 2 = enwik8
PATH_TO_ENWIK8 = '/content/data/' 

In [6]:
model_id = 'google/reformer-crime-and-punishment'

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, padding=True, truncation=True)

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316k [00:00<?, ?B/s]

In [11]:
if DATASET_SELECT == 0:
  datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
  tokenized_datasets = datasets.map(tokenize_function_wt2_enwik8, batched=True, num_proc=4, remove_columns=["text"])
elif DATASET_SELECT == 1:
  datasets = load_dataset("ptb_text_only")
  tokenized_datasets = datasets.map(tokenize_function_ptb, batched=True, num_proc=4, remove_columns=["sentence"])
elif DATASET_SELECT == 2:
  prep_enwik8(PATH_TO_ENWIK8)
  datasets = load_dataset('text', data_files={'train': PATH_TO_ENWIK8+'enwik8_train.txt','validation': PATH_TO_ENWIK8+'enwik8_validation.txt','test': PATH_TO_ENWIK8+'enwik8_test.txt'})
  tokenized_datasets = datasets.map(tokenize_function_wt2_enwik8, batched=True, num_proc=4, remove_columns=["text"])

Using custom data configuration default-9e1513e0da452a42


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-9e1513e0da452a42/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-9e1513e0da452a42/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#tokenized_datasets["train"][1]

In [12]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [None]:
#tokenizer.decode(lm_datasets["train"][1]["input_ids"])

In [13]:
model = AutoModelForCausalLM.from_pretrained(model_id, axial_pos_embds=AXIAL_POS).to(device)

Downloading:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/reformer-crime-and-punishment were not used when initializing ReformerModelWithLMHead: ['reformer.embeddings.position_embeddings.weights.0', 'reformer.embeddings.position_embeddings.weights.1']
- This IS expected if you are initializing ReformerModelWithLMHead from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ReformerModelWithLMHead from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ReformerModelWithLMHead were not initialized from the model checkpoint at google/reformer-crime-and-punishment and are newly initialized: ['reformer.encoder.layers.5.attention.self_attention.self_mask_value_float16', 'reformer.encoder.layers.0.attention.self_att

In [14]:
model_name = model_id.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-wikitext2",
    evaluation_strategy = "epoch",
    adafactor=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=PUSH_HUB,
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
train_results = trainer.train()

***** Running training *****
  Num examples = 21364
  Num Epochs = 30
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 40080
  query_key_dots = torch.where(mask, query_key_dots, mask_value)


Epoch,Training Loss,Validation Loss


Saving model checkpoint to reformer-crime-and-punishment-finetuned-wikitext2/checkpoint-500
Configuration saved in reformer-crime-and-punishment-finetuned-wikitext2/checkpoint-500/config.json
Model weights saved in reformer-crime-and-punishment-finetuned-wikitext2/checkpoint-500/pytorch_model.bin


In [None]:
trainer.save_model()

In [None]:
# Print Perplexity
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 92
  Batch size = 16


Perplexity: 24.54


In [None]:
# Display Metrics
metrics = train_results.metrics
trainer.log_metrics("train", metrics)

***** train metrics *****
  epoch                    =       30.0
  total_flos               =   984740GF
  train_loss               =     3.3416
  train_runtime            = 0:13:34.12
  train_samples_per_second =     43.372
  train_steps_per_second   =      2.727


In [None]:
trainer.push_to_hub()