Based on:
- https://huggingface.co/docs/transformers/en/training
- https://huggingface.co/docs/transformers/en/peft
- https://huggingface.co/docs/peft/quicktour
- https://jaotheboss.medium.com/peft-with-bert-8763d8b8a4ca
- https://huggingface.co/learn/nlp-course/en/chapter7/3
- https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling
- https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice

In [1]:
import pandas as pd
import spacy
import argparse
nlp = spacy.load('en_core_web_sm')

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_from_disk
from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, AutoConfig, AutoModelForMultipleChoice, AutoTokenizer, PreTrainedTokenizerBase, SchedulerType, default_data_collator, get_scheduler, AutoModel, XLMRobertaTokenizer, XLMRobertaXLModel, AutoModelForMaskedLM, XLMRobertaXLConfig, XLMRobertaXLForMultipleChoice)
from torch.utils.data import DataLoader
import torch
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForLanguageModeling

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
LANG_CODE = 'yo'
DSETSIZE = 10000
SCORER = 'bm25'
NUM_EXAMPLES = 100
dataset = load_from_disk(f"../culturaldataset/select_datasets/{LANG_CODE}/{SCORER}-{DSETSIZE}")

In [4]:
# toy dataset
datasets = DatasetDict({
    'train': Dataset.from_dict({'score': [0.2, 0.1, 0.05]*10, 'example': ['The cat said meow', "This is text", 'Tokenizers are so confusing']*10}),
    'val': Dataset.from_dict({'score': [0.2]*20, 'example': ['checkpoint of a model trained on another task']*20})
})

In [5]:
datasets['train'][0]

{'score': 0.2, 'example': 'The cat said meow'}

In [6]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# model = AutoModel.from_pretrained("FacebookAI/xlm-roberta-base")
# model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base")
model = AutoModelForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base")
# model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

model = model.to('cpu')

# PEFT?
# model = get_peft_model(model, LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["query", "value"],
#     # target_modules=["q_lin", "v_lin"],
#     lora_dropout=0.1,
#     task_type=TaskType.FEATURE_EXTRACTION,
# ))
# print(f'trainable: {model.print_trainable_parameters()}')

# tokenizer = tokenizer.to('cpu')
print(f'num param: {model.num_parameters()}')
print(f'the model:')
model

Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


num param: 278295186
the model:


XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
  

In [7]:
print(tokenizer.mask_token_id, tokenizer.decode(tokenizer.mask_token_id))

250001 <mask>


In [8]:
toy_text = f"Il y a toujours des {tokenizer.decode(tokenizer.mask_token_id)}."
toy_input = tokenizer(toy_text, return_tensors="pt")
toy_input

{'input_ids': tensor([[     0,    891,    113,     10,  11259,    224, 250001,      6,      5,
              2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
model(**toy_input).keys()

odict_keys(['logits'])

In [10]:
token_logits = model(**toy_input).logits
mask_token_index = torch.where(toy_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
    print(f"filled: {toy_text.replace(tokenizer.mask_token, f'_{tokenizer.decode([token])}_')}")

filled: Il y a toujours des _moments_.
filled: Il y a toujours des _solutions_.
filled: Il y a toujours des _limites_.
filled: Il y a toujours des _jours_.
filled: Il y a toujours des _raisons_.


In [11]:
def tokenize_function(examples):
    # return tokenizer(examples["example"], padding="max_length", truncation=True, return_tensors="pt")
    result = tokenizer(examples["example"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

# tokenized_dataset = dataset.select(range(NUM_EXAMPLES)).map(tokenize_function, batched=True)
tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=["example", "score"])
tokenized_datasets

Map: 100%|██████████| 30/30 [00:00<00:00, 9293.83 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 7178.95 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 30
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 20
    })
})

In [12]:
for i in range(len(tokenized_datasets['train'])):
    print(tokenized_datasets['train'][i])
    if i == 2: break

{'input_ids': [0, 581, 7515, 2804, 163, 8770, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'word_ids': [None, 0, 1, 2, 3, 3, None]}
{'input_ids': [0, 3293, 83, 7986, 2], 'attention_mask': [1, 1, 1, 1, 1], 'word_ids': [None, 0, 1, 2, None]}
{'input_ids': [0, 717, 1098, 52825, 7, 621, 221, 55681, 6953, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'word_ids': [None, 0, 0, 0, 0, 1, 2, 3, 3, None]}


In [13]:
tokenizer.model_max_length

512

In [14]:
chunk_size = 128

In [15]:
for idx, sample in enumerate(tokenized_datasets['train'][:3]["input_ids"]):
    print(f"example {idx} has len {len(sample)}")

example 0 has len 7
example 1 has len 5
example 2 has len 10


In [16]:
tokenized_samples = tokenized_datasets["train"][:3]

concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 22'


In [17]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 22'


In [18]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} # Concatenate all texts
    total_length = len(concatenated_examples[list(examples.keys())[0]]) # Compute length of concatenated texts
    total_length = (total_length // chunk_size) * chunk_size # We drop the last chunk if it's smaller than chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] 
        for k, t in concatenated_examples.items()
    } # Split by chunks of max_len
    result["labels"] = result["input_ids"].copy() # Create a new labels column
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map: 100%|██████████| 30/30 [00:00<00:00, 6912.55 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 5583.47 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1
    })
})

In [19]:
# example after processing
print(lm_datasets["train"][0])
# print(tokenizer.decode(lm_datasets["train"][0]['input_ids']))

{'input_ids': [0, 581, 7515, 2804, 163, 8770, 2, 0, 3293, 83, 7986, 2, 0, 717, 1098, 52825, 7, 621, 221, 55681, 6953, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 3293, 83, 7986, 2, 0, 717, 1098, 52825, 7, 621, 221, 55681, 6953, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 3293, 83, 7986, 2, 0, 717, 1098, 52825, 7, 621, 221, 55681, 6953, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 3293, 83, 7986, 2, 0, 717, 1098, 52825, 7, 621, 221, 55681, 6953, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 3293, 83, 7986, 2, 0, 717, 1098, 52825, 7, 621, 221, 55681, 6953, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 3293, 83, 7986, 2, 0, 717, 1098, 52825, 7, 621], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [20]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
samples = [lm_datasets["train"][i] for i in range(1)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"{tokenizer.decode(chunk)}") # masks get added by data collator

<s> The cat said meow</s><s> This is text</s><s> Tokenizers are so<mask>sing</s><s><mask> cat<mask> meow</s><s> This is<mask></s><s> Tokenizers are so confusing</s><s><mask> cat said meow</s><s> This is text</s><s> Token<mask><mask> are so confusing</s><s> The cat said<mask>ow</s><s> This is text</s><s> Tokenizers are so confusing</s><s> The cat said meow</s><s> This is text</s><s> Tokenizers are so confusing</s><s> The cat said meow</s><s> This is text</s><s> To<mask>izers are


### whole word masking thing?

In [22]:
import collections
import numpy as np
from transformers import default_data_collator

wwm_probability = 0.2

# the above collator only mask out tokens. this masks out whole word as a chunk?
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [23]:
samples = [lm_datasets["train"][i] for i in range(1)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s> The cat said<mask><mask></s><s> This<mask> text</s><s> Tokenizers are so<mask><mask></s><s> The cat said meow</s><s> This is text</s><s> Tokenizers<mask> so confusing</s><s> The cat said meow</s><s> This is text</s><s> Tokenizers are so confusing</s><s> The<mask><mask> meow</s><s> This is text</s><s> Tokenizers are so confusing</s><s> The cat said meow</s><s> This is<mask></s><s> Tokenizers are so confusing</s><s> The cat<mask><mask><mask></s><s> This is text</s><s> Tokenizers are'


### training

In [24]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1
    })
})

In [25]:
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=1, test_size=0, seed=42
)
downsampled_dataset

ValueError: test_size=0 should be either positive and smaller than the number of samples 1 or a float in the (0, 1) range

In [26]:
from transformers import TrainingArguments

batch_size = 16

training_args = TrainingArguments(
    output_dir=f"xlmr-finetuned",
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # push_to_hub=True,
    # fp16=True,
    logging_steps=1,
    num_train_epochs=10
)

In [27]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:
trainer.evaluate()

100%|██████████| 1/1 [00:00<00:00,  6.56it/s]


{'eval_loss': 5.526656627655029,
 'eval_runtime': 0.4064,
 'eval_samples_per_second': 2.46,
 'eval_steps_per_second': 2.46}

In [29]:
trainer.train()

 10%|█         | 1/10 [00:04<00:36,  4.09s/it]

{'loss': 23.0165, 'grad_norm': 674.8922119140625, 'learning_rate': 1.8e-05, 'epoch': 1.0}


                                              
 10%|█         | 1/10 [00:05<00:36,  4.09s/it]

{'eval_loss': 3.3105320930480957, 'eval_runtime': 0.9043, 'eval_samples_per_second': 1.106, 'eval_steps_per_second': 1.106, 'epoch': 1.0}


 20%|██        | 2/10 [00:06<00:26,  3.34s/it]

{'loss': 18.063, 'grad_norm': 1135.5369873046875, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0}


                                              
 20%|██        | 2/10 [00:07<00:26,  3.34s/it]

{'eval_loss': 7.034651279449463, 'eval_runtime': 0.2367, 'eval_samples_per_second': 4.225, 'eval_steps_per_second': 4.225, 'epoch': 2.0}


 30%|███       | 3/10 [00:08<00:17,  2.45s/it]

{'loss': 16.4206, 'grad_norm': 386.2488708496094, 'learning_rate': 1.4e-05, 'epoch': 3.0}


                                              
 30%|███       | 3/10 [00:08<00:17,  2.45s/it]

{'eval_loss': 5.43673038482666, 'eval_runtime': 0.2517, 'eval_samples_per_second': 3.973, 'eval_steps_per_second': 3.973, 'epoch': 3.0}


 40%|████      | 4/10 [00:09<00:12,  2.02s/it]

{'loss': 18.6924, 'grad_norm': 346.05450439453125, 'learning_rate': 1.2e-05, 'epoch': 4.0}


                                              
 40%|████      | 4/10 [00:09<00:12,  2.02s/it]

{'eval_loss': 6.2737932205200195, 'eval_runtime': 0.1853, 'eval_samples_per_second': 5.396, 'eval_steps_per_second': 5.396, 'epoch': 4.0}


 50%|█████     | 5/10 [00:11<00:09,  1.83s/it]

{'loss': 13.9201, 'grad_norm': 275.6600341796875, 'learning_rate': 1e-05, 'epoch': 5.0}


                                              
 50%|█████     | 5/10 [00:11<00:09,  1.83s/it]

{'eval_loss': 1.813073992729187, 'eval_runtime': 0.3399, 'eval_samples_per_second': 2.942, 'eval_steps_per_second': 2.942, 'epoch': 5.0}


 60%|██████    | 6/10 [00:12<00:07,  1.75s/it]

{'loss': 15.5467, 'grad_norm': 352.4468688964844, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.0}


                                              
 60%|██████    | 6/10 [00:13<00:07,  1.75s/it]

{'eval_loss': 1.9597558975219727, 'eval_runtime': 0.2955, 'eval_samples_per_second': 3.385, 'eval_steps_per_second': 3.385, 'epoch': 6.0}


 70%|███████   | 7/10 [00:14<00:04,  1.66s/it]

{'loss': 12.8491, 'grad_norm': 274.6159362792969, 'learning_rate': 6e-06, 'epoch': 7.0}


                                              
 70%|███████   | 7/10 [00:14<00:04,  1.66s/it]

{'eval_loss': 3.7310588359832764, 'eval_runtime': 0.3013, 'eval_samples_per_second': 3.319, 'eval_steps_per_second': 3.319, 'epoch': 7.0}


 80%|████████  | 8/10 [00:17<00:04,  2.05s/it]

{'loss': 13.9638, 'grad_norm': 339.20892333984375, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}


                                              
 80%|████████  | 8/10 [00:17<00:04,  2.05s/it]

{'eval_loss': 3.528672933578491, 'eval_runtime': 0.3619, 'eval_samples_per_second': 2.763, 'eval_steps_per_second': 2.763, 'epoch': 8.0}


 90%|█████████ | 9/10 [00:20<00:02,  2.42s/it]

{'loss': 12.2228, 'grad_norm': 231.761962890625, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.0}


                                              
 90%|█████████ | 9/10 [00:20<00:02,  2.42s/it]

{'eval_loss': 2.9800865650177, 'eval_runtime': 0.2071, 'eval_samples_per_second': 4.829, 'eval_steps_per_second': 4.829, 'epoch': 9.0}


100%|██████████| 10/10 [00:21<00:00,  2.18s/it]

{'loss': 13.2901, 'grad_norm': 334.228271484375, 'learning_rate': 0.0, 'epoch': 10.0}


                                               
100%|██████████| 10/10 [00:22<00:00,  2.26s/it]

{'eval_loss': 2.034358024597168, 'eval_runtime': 0.5967, 'eval_samples_per_second': 1.676, 'eval_steps_per_second': 1.676, 'epoch': 10.0}
{'train_runtime': 22.5975, 'train_samples_per_second': 0.443, 'train_steps_per_second': 0.443, 'train_loss': 15.798519134521484, 'epoch': 10.0}





TrainOutput(global_step=10, training_loss=15.798519134521484, metrics={'train_runtime': 22.5975, 'train_samples_per_second': 0.443, 'train_steps_per_second': 0.443, 'train_loss': 15.798519134521484, 'epoch': 10.0})

In [30]:
trainer.evaluate()

100%|██████████| 1/1 [00:00<00:00,  9.54it/s]


{'eval_loss': 3.789834499359131,
 'eval_runtime': 0.1824,
 'eval_samples_per_second': 5.483,
 'eval_steps_per_second': 5.483,
 'epoch': 10.0}

---

In [54]:
print(model)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [55]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    # target_modules=["q_lin", "v_lin"],
    lora_dropout=0.1,
    task_type=TaskType.FEATURE_EXTRACTION,
)

In [56]:
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters() # see % trainable parameters

trainable params: 589,824 || all params: 278,633,472 || trainable%: 0.21168454592562375


In [57]:
print(lora_model)

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=

In [58]:
tokenized_datasets = tokenized_datasets.rename_column("example", "text")
tokenized_datasets

Dataset({
    features: ['score', 'text', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [59]:
toy_input = tokenizer(tokenized_datasets['text'][0][:100], return_tensors="pt")
toy_input = toy_input.to('cpu')
lora_model = lora_model.to('cpu')

In [60]:
lora_model(**toy_input)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0873,  0.1106,  0.0666,  ..., -0.0501,  0.0721, -0.0197],
         [-0.0081, -0.1164,  0.0249,  ..., -0.0507,  0.0636,  0.0125],
         [-0.0304,  0.1153,  0.0092,  ..., -0.0534, -0.0404,  0.1008],
         ...,
         [ 0.0829,  0.0443,  0.0210,  ..., -0.1380, -0.0108,  0.1600],
         [ 0.0543,  0.0591,  0.0068,  ..., -0.1866,  0.0065, -0.0482],
         [ 0.0727,  0.1022,  0.0085,  ..., -0.1361,  0.0007,  0.0193]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-3.0522e-02,  2.7087e-01,  1.1834e-01,  5.1281e-01,  4.8539e-03,
          3.6024e-01,  4.2014e-01, -4.4383e-01,  1.6263e-01, -1.5847e-01,
          1.3781e-01,  9.1416e-02,  3.8526e-01,  3.3253e-01, -2.0363e-01,
         -1.7617e-01,  1.8961e-01,  4.2476e-01, -7.3580e-02, -2.2573e-01,
         -2.4010e-01,  3.4361e-01, -6.7378e-01, -5.5283e-01, -2.1458e-01,
          5.7239e-01,  1.3006e-01, -3.1152e-01, -1.3167e-01,  6.425

In [47]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [48]:
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="test_trainer", 
        evaluation_strategy="epoch",
        num_train_epochs=1,
        per_device_train_batch_size=16,
    ),
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/7 [06:36<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

TypeError: XLMRobertaModel.forward() got an unexpected keyword argument 'labels'

## copy paste

In [2]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [3]:
toy_text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
import torch

toy_input = tokenizer(toy_text, return_tensors="pt")
token_logits = model(**toy_input).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(toy_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {toy_text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [6]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Downloading readme: 100%|██████████| 7.81k/7.81k [00:00<00:00, 15.7MB/s]
Downloading data: 100%|██████████| 21.0M/21.0M [00:01<00:00, 18.4MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 25.3MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:01<00:00, 32.1MB/s]
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 325430.46 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 386076.48 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 383714.98 examples/s]
Map:   0%|          | 0/25000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 25000/25000 [00:05<00:00, 4874.58 examples/s]
Map: 100%|██████████| 25000/25000 [00:05<00:00, 4939.38 examples/s]
Map: 100%|██████████| 50000/50000 [00:13<00:00, 3753.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [9]:
chunk_size = 128
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map: 100%|██████████| 25000/25000 [00:41<00:00, 601.71 examples/s]
Map: 100%|██████████| 25000/25000 [00:40<00:00, 612.41 examples/s]
Map: 100%|██████████| 50000/50000 [01:24<00:00, 589.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [11]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented [MASK] am curious - yellow from my video store because of all the [MASK] that surrounded it when it was first released in 1967. i also heard that at first it was seized by [MASK]. s. customs if [MASK] ever [MASK] [MASK] enter this country, therefore [MASK] a fan of films [MASK] "ree [MASK] i really had to see this for myself. < br / > < br / [MASK] the [MASK] is centered around a young swedish [MASK] student named lena [MASK] [MASK] to learn everything she can about life. in particular she wants to [MASK] her attentions to making some sort of documentary on what the [MASK] sw [MASK] thought about certain political [MASK] such'

'>>> as the vietnam war and race issues in the united states. in [MASK] asking [MASK] and ordinary denizens of stockholm about [MASK] opinions on politics, she [MASK] sex president her drama teacher, classmates, and married men. < br / > < [MASK] / > what kills me about i am curious - yellow is [MASK] 40orth ago [MASK] this was [MASK] pornog