Based on:
- https://huggingface.co/docs/transformers/en/training
- https://huggingface.co/docs/transformers/en/peft
- https://huggingface.co/docs/peft/quicktour
- https://jaotheboss.medium.com/peft-with-bert-8763d8b8a4ca
- https://huggingface.co/learn/nlp-course/en/chapter7/3
- https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling
- https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice

In [2]:
from datasets import load_dataset
import spacy
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_from_disk
from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, AutoConfig, AutoModelForMultipleChoice, AutoTokenizer, PreTrainedTokenizerBase, SchedulerType, default_data_collator, get_scheduler, AutoModel, XLMRobertaTokenizer, XLMRobertaXLModel, AutoModelForMaskedLM, XLMRobertaXLConfig, XLMRobertaXLForMultipleChoice)
from peft import AutoPeftModel
from torch.utils.data import DataLoader
import torch
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForLanguageModeling

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# # will use later
# LANG_CODE = 'yo'
# DSETSIZE = 10000
# SCORER = 'bm25'
# NUM_EXAMPLES = 100
# dataset = load_from_disk(f"../culturaldataset/select_datasets/{LANG_CODE}/{SCORER}-{DSETSIZE}")
corpus = load_dataset("chaosarium/c4-cultural-extract", revision='su-bm25-50000')

Downloading readme: 100%|██████████| 319/319 [00:00<00:00, 1.12MB/s]
Downloading data: 100%|██████████| 145M/145M [00:04<00:00, 31.6MB/s] 
Generating train split: 100%|██████████| 50000/50000 [00:00<00:00, 150552.67 examples/s]


DatasetDict({
    train: Dataset({
        features: ['score', 'example'],
        num_rows: 50000
    })
})

# MLM Objective

In [35]:
# toy dataset
datasets = DatasetDict({
    'train': Dataset.from_dict({'score': [0.2, 0.1, 0.05]*10, 'example': ['The cat said meow', "Cats say meow", 'Tokenizers are so meow']*10}),
    'val': Dataset.from_dict({'score': [0.2]*20, 'example': ['A sound cats like to make is meow']*20})
})

In [36]:
datasets['train'][0]

{'score': 0.2, 'example': 'The cat said meow'}

In [40]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# model = AutoModel.from_pretrained("FacebookAI/xlm-roberta-base")
# model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base")
# model = AutoModelForMultipleChoice.from_pretrained("FacebookAI/xlm-roberta-base")
model = AutoModelForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base")
# model = AutoPeftModel.from_pretrained("FacebookAI/xlm-roberta-base")
# model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

model = model.to('cpu')

# PEFT?
# model = get_peft_model(model, LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["query", "value"],
#     # target_modules=["q_lin", "v_lin"],
#     lora_dropout=0.1,
#     # task_type=TaskType.FEATURE_EXTRACTION,
# ))
# print(f'trainable: {model.print_trainable_parameters()}')

# tokenizer = tokenizer.to('cpu')
print(f'num param: {model.num_parameters()}')
print(f'the model:')
model

Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


num param: 278295186
the model:


XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
  

In [43]:
print(tokenizer.mask_token_id, tokenizer.decode(tokenizer.mask_token_id))

250001 <mask>


In [44]:
# toy_text = f"Il y a toujours des {tokenizer.decode(tokenizer.mask_token_id)}."
toy_text = f"The cat said {tokenizer.decode(tokenizer.mask_token_id)}."
toy_input = tokenizer(toy_text, return_tensors="pt")
toy_input

{'input_ids': tensor([[     0,    581,   7515,   2804, 250001,      6,      5,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [45]:
model(**toy_input).keys()

odict_keys(['logits'])

In [60]:
model.to('cpu')
token_logits = model(**toy_input).logits
mask_token_index = torch.where(toy_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 10, dim=1).indices[0].tolist()
for token in top_5_tokens:
    print(f"filled: {toy_text.replace(tokenizer.mask_token, f'_{tokenizer.decode([token])}_')}")

filled: The cat said _:_.
filled: The cat said _,_.
filled: The cat said _that_.
filled: The cat said _:_.
filled: The cat said _._.
filled: The cat said _it_.
filled: The cat said _he_.
filled: The cat said _"_.
filled: The cat said _to_.
filled: The cat said _..._.


In [61]:
def tokenize_function(examples):
    # return tokenizer(examples["example"], padding="max_length", truncation=True, return_tensors="pt")
    result = tokenizer(examples["example"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

# tokenized_dataset = dataset.select(range(NUM_EXAMPLES)).map(tokenize_function, batched=True)
tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=["example", "score"])
tokenized_datasets

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map: 100%|██████████| 30/30 [00:00<00:00, 7996.26 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 6451.29 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 30
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 20
    })
})

In [62]:
for i in range(len(tokenized_datasets['train'])):
    print(tokenized_datasets['train'][i])
    if i == 2: break

{'input_ids': [0, 581, 7515, 2804, 163, 8770, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'word_ids': [None, 0, 1, 2, 3, 3, None]}
{'input_ids': [0, 18826, 7, 5154, 163, 8770, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'word_ids': [None, 0, 0, 1, 2, 2, None]}
{'input_ids': [0, 717, 1098, 52825, 7, 621, 221, 163, 8770, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'word_ids': [None, 0, 0, 0, 0, 1, 2, 3, 3, None]}


In [63]:
tokenizer.model_max_length

512

In [64]:
chunk_size = 128

In [65]:
for idx, sample in enumerate(tokenized_datasets['train'][:3]["input_ids"]):
    print(f"example {idx} has len {len(sample)}")

example 0 has len 7
example 1 has len 7
example 2 has len 10


In [66]:
tokenized_samples = tokenized_datasets["train"][:3]

concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 24'


In [67]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 24'


In [68]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} # Concatenate all texts
    total_length = len(concatenated_examples[list(examples.keys())[0]]) # Compute length of concatenated texts
    total_length = (total_length // chunk_size) * chunk_size # We drop the last chunk if it's smaller than chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)] 
        for k, t in concatenated_examples.items()
    } # Split by chunks of max_len
    result["labels"] = result["input_ids"].copy() # Create a new labels column
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map: 100%|██████████| 30/30 [00:00<00:00, 9109.47 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 7366.18 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1
    })
})

In [69]:
# example after processing
print(lm_datasets["train"][0])
# print(tokenizer.decode(lm_datasets["train"][0]['input_ids']))

{'input_ids': [0, 581, 7515, 2804, 163, 8770, 2, 0, 18826, 7, 5154, 163, 8770, 2, 0, 717, 1098, 52825, 7, 621, 221, 163, 8770, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 18826, 7, 5154, 163, 8770, 2, 0, 717, 1098, 52825, 7, 621, 221, 163, 8770, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 18826, 7, 5154, 163, 8770, 2, 0, 717, 1098, 52825, 7, 621, 221, 163, 8770, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 18826, 7, 5154, 163, 8770, 2, 0, 717, 1098, 52825, 7, 621, 221, 163, 8770, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0, 18826, 7, 5154, 163, 8770, 2, 0, 717, 1098, 52825, 7, 621, 221, 163, 8770, 2, 0, 581, 7515, 2804, 163, 8770, 2, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [70]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [71]:
samples = [lm_datasets["train"][i] for i in range(1)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"{tokenizer.decode(chunk)}") # masks get added by data collator

<s> The cat said<mask>ow</s><s> Cats<mask> meow</s><s> Tokenizers are so meow</s><s><mask><mask> said meow</s><s> Cats sayຊົງow</s><s> Tokenizers areଷ୍<mask>ow</s><s> The cat said me<mask></s><s> Cat<mask> say<mask>ow</s><s> Tokenizers<mask><mask> meow</s><s> The cat said meow</s><s> Cats say meow</s><s><mask>kenizers are so meow</s><s> The cat said meow</s><s> Cats say me<mask></s><s> Tokenizers are so meow</s><s> The cat said meow</s><s>


### whole word masking thing?

In [72]:
import collections
import numpy as np
from transformers import default_data_collator

wwm_probability = 0.2

# the above collator only mask out tokens. this masks out whole word as a chunk?
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [73]:
samples = [lm_datasets["train"][i] for i in range(1)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s> The cat said<mask><mask></s><s> Cats say meow</s><s> Tokenizers<mask> so<mask><mask></s><s> The cat said<mask><mask></s><s> Cats say meow</s><s> Tokenizers are so<mask><mask></s><s> The cat said meow</s><s> Cats say meow</s><s><mask><mask><mask><mask> are so meow</s><s> The cat said meow</s><s> Cats say meow</s><s> Tokenizers are so meow</s><s> The<mask> said meow</s><s> Cats say meow</s><s> Tokenizers are<mask><mask><mask></s><s> The<mask><mask> meow</s><s>'


### training

In [74]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1
    })
})

In [75]:
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=1, test_size=0, seed=42
)
downsampled_dataset

ValueError: test_size=0 should be either positive and smaller than the number of samples 1 or a float in the (0, 1) range

In [82]:
from transformers import TrainingArguments

batch_size = 16

training_args = TrainingArguments(
    output_dir=f"xlmr-finetuned",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # push_to_hub=True,
    # fp16=True,
    logging_steps=1,
    num_train_epochs=10
)

In [83]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [84]:
trainer.evaluate()

100%|██████████| 1/1 [00:00<00:00,  1.65it/s]


{'eval_loss': 7.7649102210998535,
 'eval_runtime': 0.7255,
 'eval_samples_per_second': 1.378,
 'eval_steps_per_second': 1.378}

In [85]:
trainer.train()

 10%|█         | 1/10 [00:01<00:12,  1.43s/it]

{'loss': 6.7622, 'grad_norm': 88.73775482177734, 'learning_rate': 0.00045000000000000004, 'epoch': 1.0}



 10%|█         | 1/10 [00:01<00:12,  1.43s/it]

{'eval_loss': 5.512753009796143, 'eval_runtime': 0.2335, 'eval_samples_per_second': 4.283, 'eval_steps_per_second': 4.283, 'epoch': 1.0}


 20%|██        | 2/10 [00:02<00:11,  1.39s/it]

{'loss': 8.2999, 'grad_norm': 645.5491943359375, 'learning_rate': 0.0004, 'epoch': 2.0}



 20%|██        | 2/10 [00:02<00:11,  1.39s/it]

{'eval_loss': 19.468137741088867, 'eval_runtime': 0.143, 'eval_samples_per_second': 6.992, 'eval_steps_per_second': 6.992, 'epoch': 2.0}


 30%|███       | 3/10 [00:04<00:09,  1.32s/it]

{'loss': 16.947, 'grad_norm': 228.4398193359375, 'learning_rate': 0.00035, 'epoch': 3.0}



 30%|███       | 3/10 [00:04<00:09,  1.32s/it]

{'eval_loss': 14.739225387573242, 'eval_runtime': 0.172, 'eval_samples_per_second': 5.815, 'eval_steps_per_second': 5.815, 'epoch': 3.0}


 40%|████      | 4/10 [00:05<00:07,  1.28s/it]

{'loss': 10.9675, 'grad_norm': 220.02590942382812, 'learning_rate': 0.0003, 'epoch': 4.0}



 40%|████      | 4/10 [00:05<00:07,  1.28s/it]

{'eval_loss': 12.752181053161621, 'eval_runtime': 0.1465, 'eval_samples_per_second': 6.826, 'eval_steps_per_second': 6.826, 'epoch': 4.0}


 50%|█████     | 5/10 [00:06<00:06,  1.30s/it]

{'loss': 11.6536, 'grad_norm': 91.523193359375, 'learning_rate': 0.00025, 'epoch': 5.0}



 50%|█████     | 5/10 [00:06<00:06,  1.30s/it]

{'eval_loss': 10.374452590942383, 'eval_runtime': 0.1405, 'eval_samples_per_second': 7.117, 'eval_steps_per_second': 7.117, 'epoch': 5.0}


 60%|██████    | 6/10 [00:07<00:04,  1.25s/it]

{'loss': 4.3194, 'grad_norm': 87.94600677490234, 'learning_rate': 0.0002, 'epoch': 6.0}



 60%|██████    | 6/10 [00:07<00:04,  1.25s/it]

{'eval_loss': 7.055202484130859, 'eval_runtime': 0.1322, 'eval_samples_per_second': 7.566, 'eval_steps_per_second': 7.566, 'epoch': 6.0}


 70%|███████   | 7/10 [00:08<00:03,  1.22s/it]

{'loss': 6.6497, 'grad_norm': 37.982662200927734, 'learning_rate': 0.00015, 'epoch': 7.0}



 70%|███████   | 7/10 [00:09<00:03,  1.22s/it]

{'eval_loss': 6.146355628967285, 'eval_runtime': 0.1315, 'eval_samples_per_second': 7.606, 'eval_steps_per_second': 7.606, 'epoch': 7.0}


 80%|████████  | 8/10 [00:10<00:02,  1.19s/it]

{'loss': 4.8709, 'grad_norm': 18.854534149169922, 'learning_rate': 0.0001, 'epoch': 8.0}



 80%|████████  | 8/10 [00:10<00:02,  1.19s/it]

{'eval_loss': 5.589357376098633, 'eval_runtime': 0.1298, 'eval_samples_per_second': 7.704, 'eval_steps_per_second': 7.704, 'epoch': 8.0}


 90%|█████████ | 9/10 [00:11<00:01,  1.18s/it]

{'loss': 4.4352, 'grad_norm': 13.940343856811523, 'learning_rate': 5e-05, 'epoch': 9.0}



 90%|█████████ | 9/10 [00:11<00:01,  1.18s/it]

{'eval_loss': 5.70285177230835, 'eval_runtime': 0.1455, 'eval_samples_per_second': 6.874, 'eval_steps_per_second': 6.874, 'epoch': 9.0}


100%|██████████| 10/10 [00:12<00:00,  1.20s/it]

{'loss': 3.1515, 'grad_norm': 10.947586059570312, 'learning_rate': 0.0, 'epoch': 10.0}



100%|██████████| 10/10 [00:12<00:00,  1.26s/it]

{'eval_loss': 6.405172348022461, 'eval_runtime': 0.1328, 'eval_samples_per_second': 7.532, 'eval_steps_per_second': 7.532, 'epoch': 10.0}
{'train_runtime': 12.5697, 'train_samples_per_second': 0.796, 'train_steps_per_second': 0.796, 'train_loss': 7.80570170879364, 'epoch': 10.0}





TrainOutput(global_step=10, training_loss=7.80570170879364, metrics={'train_runtime': 12.5697, 'train_samples_per_second': 0.796, 'train_steps_per_second': 0.796, 'train_loss': 7.80570170879364, 'epoch': 10.0})

In [86]:
trainer.evaluate()

100%|██████████| 1/1 [00:00<00:00,  6.00it/s]


{'eval_loss': 4.599571228027344,
 'eval_runtime': 0.2719,
 'eval_samples_per_second': 3.678,
 'eval_steps_per_second': 3.678,
 'epoch': 10.0}

In [87]:
model.to('cpu')
token_logits = model(**toy_input).logits
mask_token_index = torch.where(toy_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 10, dim=1).indices[0].tolist()
for token in top_5_tokens:
    print(f"filled: {toy_text.replace(tokenizer.mask_token, f'_{tokenizer.decode([token])}_')}")

filled: The cat said _ow_.
filled: The cat said _me_.
filled: The cat said _say_.
filled: The cat said _s_.
filled: The cat said _ken_.
filled: The cat said _are_.
filled: The cat said _the_.
filled: The cat said _said_.
filled: The cat said _The_.
filled: The cat said _so_.


# Multiple choice objective

- `run_baselines.py`

In [6]:
# toy dataset
datasets = DatasetDict({
    'train': Dataset.from_dict({'label': [1, 1, 0, 0]*10, 'input': ['The cat said meow', "Cats say meow", 'The cat said woof', 'Cats generally bark']*10}),
    'val': Dataset.from_dict({'label': [1, 0]*20, 'input': ['A sound cats like to make is meow', 'A sound cats like to make is woof']*20})
})

In [7]:
config = AutoConfig.from_pretrained('FacebookAI/xlm-roberta-base')
config.output_hidden_states = True
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
model = AutoModelForMultipleChoice.from_pretrained("FacebookAI/xlm-roberta-base", config=config)
# model = AutoModelForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base")
model = model.to('cpu')
print(f'num param: {model.num_parameters()}')
print(f'the model:')
model

Some weights of XLMRobertaForMultipleChoice were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


num param: 278044417
the model:


XLMRobertaForMultipleChoice(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tr

In [8]:
from accelerate import Accelerator
accelerator = Accelerator()

In [9]:
data_files = {
    'train': '../langdata/en_train.csv',
    'validation': '../langdata/en_dev.csv',
    'test': '../langdata/su.csv',
}
raw_datasets = load_dataset('csv', data_files=data_files)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['startphrase', 'ending1', 'ending2', 'labels'],
        num_rows: 1458
    })
    validation: Dataset({
        features: ['startphrase', 'ending1', 'ending2', 'labels'],
        num_rows: 1094
    })
    test: Dataset({
        features: ['startphrase', 'ending1', 'ending2', 'labels'],
        num_rows: 600
    })
})

In [10]:
from itertools import chain
def preprocess_function(examples):
    column_names = ['startphrase', 'ending1', 'ending2', 'labels']
    ending_names = [f"ending{i}" for i in [1, 2]]
    context_name = "startphrase"
    label_column_name = "label" if "label" in column_names else "labels"

    first_sentences = [[context] * 2 for context in examples[context_name]]
    second_sentences = [[examples[end][i] for end in ending_names] for i in range(len(examples[context_name]))]
    labels = examples[label_column_name]

    # Flatten out
    first_sentences = list(chain(*first_sentences))
    second_sentences = list(chain(*second_sentences))

    # Tokenize
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        max_length=128,
        padding=False,
        truncation=True,
    )

    # Save the decoded sentences if storing embeddings
    # if args["do_predict"] and args["save_embeddings"]:
    #     sentence_fp = os.path.join(args["embedding_output_dir"], "sentences.tsv")
    #     with open(sentence_fp, "a") as f:
    #         for i in range(len(tokenized_examples["input_ids"])):
    #             f.write(tokenizer.decode(tokenized_examples["input_ids"][i]) + "\n")

    # Un-flatten
    tokenized_inputs = {k: [v[i : i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}
    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [11]:
processed_datasets = raw_datasets.map(
    preprocess_function, batched=True, remove_columns=raw_datasets['train'].column_names
)
processed_datasets

Map: 100%|██████████| 1094/1094 [00:00<00:00, 15298.80 examples/s]


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1458
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1094
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 600
    })
})

In [12]:
print(processed_datasets['train'][0])

{'labels': 0, 'input_ids': [[0, 1840, 2565, 1902, 70, 90254, 111, 140147, 6664, 5, 2, 2, 1840, 103036, 7, 831, 186, 18822, 71, 5, 2], [0, 1840, 2565, 1902, 70, 90254, 111, 140147, 6664, 5, 2, 2, 1840, 103036, 7, 53418, 186, 63207, 297, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [13]:
from dataclasses import dataclass
from transformers.utils import PaddingStrategy, get_full_repo_name

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence
              if provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
              lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    tokenizer: PreTrainedTokenizerBase
    padding = True # Union[bool, str, PaddingStrategy]
    max_length = None # Optional[int]
    pad_to_multiple_of = None # Optional[int]

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = list(chain(*flattened_features))

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

data_collator = DataCollatorForMultipleChoice(
    tokenizer, 
    # pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
)

In [14]:
metric = evaluate.load("accuracy")

### training

In [15]:
import random
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=data_collator, batch_size=64
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=64)


Sample 1323 of the training set: {'labels': 1, 'input_ids': [[0, 1529, 1556, 70, 129271, 111, 10, 165082, 29367, 109270, 142584, 2, 2, 1529, 1556, 4127, 129271, 2], [0, 1529, 1556, 70, 129271, 111, 10, 165082, 29367, 109270, 142584, 2, 2, 18763, 129271, 83, 4552, 70425, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}.
Sample 1289 of the training set: {'labels': 1, 'input_ids': [[0, 581, 56409, 5161, 17721, 70, 6626, 509, 10, 24814, 47589, 111, 124111, 2, 2, 581, 5161, 509, 6183, 56409, 2], [0, 581, 56409, 5161, 17721, 70, 6626, 509, 10, 24814, 47589, 111, 124111, 2, 2, 581, 5161, 509, 959, 4552, 56409, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}.
Sample 298 of the training set: {'labels': 0, 'input_ids': [[0, 106320, 13, 1916, 70, 5368, 18244, 95486, 674, 509, 1884, 117906, 214, 1912

In [16]:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.00,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5)
lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=3,
    num_training_steps=30,
)


In [17]:
from tqdm import tqdm
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
print("***** Running training *****")
print(f"  Num examples = {len(train_dataset)}")
print(f"  Num Epochs = {30}")
print(f"  Instantaneous batch size per device = {64}")
# print(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
# print(f"  Gradient Accumulation steps = {args['gradient_accumulation_steps']}")
# print(f"  Total optimization steps = {args['max_train_steps']}")
# Only show the progress bar once on each machine.
# progress_bar = tqdm(range(args["max_train_steps"]), disable=not accelerator.is_local_main_process)
starting_epoch = 0


***** Running training *****
  Num examples = 1458
  Num Epochs = 30
  Instantaneous batch size per device = 64


In [20]:
import os, json
def train_model(train_dataloader, model, accelerator, optimizer, lr_scheduler, args, completed_steps, checkpointing_steps, progress_bar, eval_dataloader=None):
    model.train()
    if args["with_tracking"]:
        total_loss = 0
    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        # We need to skip steps until we reach the resumed step
        # if args["resume_from_checkpoint"] and epoch == starting_epoch:
        #     if resume_step is not None and step < resume_step:
        #         completed_steps += 1
        #         continue
        outputs = model(input_ids=batch["input_ids"], 
                        attention_mask=batch["attention_mask"],
                        labels=batch["labels"])
        loss = outputs.loss
        # We keep track of the loss at each epoch
        if args["with_tracking"]:
            total_loss += loss.detach().float()
        loss = loss / args["gradient_accumulation_steps"]
        accelerator.backward(loss)
        
        if step % args["gradient_accumulation_steps"] == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            if not args["silent"] and progress_bar is not None:
                progress_bar.update(1)
            completed_steps += 1

        if isinstance(checkpointing_steps, int):
            if completed_steps % checkpointing_steps == 0:
                output_dir = f"step_{completed_steps }"
                if args["output_dir"] is not None:
                    output_dir = os.path.join(args["output_dir"], output_dir)
                accelerator.save_state(output_dir)

        if completed_steps >= args["max_train_steps"]:
            break
        
        print(loss)

    return model, loss, completed_steps

def eval_model(model, eval_dataloader, metric, accelerator, epoch, args):
    model.eval()
    samples_seen = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = accelerator.gather((predictions, batch["labels"]))
        # If we are in a multiprocess environment, the last batch has duplicates
        if accelerator.num_processes > 1:
            if step == len(eval_dataloader) - 1:
                predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
                references = references[: len(eval_dataloader.dataset) - samples_seen]
            else:
                samples_seen += references.shape[0]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute()
    accelerator.print(f"epoch {epoch}: {eval_metric}")

    return eval_metric


def main_train_loop(train_dataloader, eval_dataloader, model, tokenizer, metric, accelerator, optimizer, lr_scheduler, num_train_epochs, args, starting_epoch=0, checkpointing_steps=None, progress_bar=None):
    completed_steps = 0

    for epoch in range(starting_epoch, num_train_epochs):
        model.train()
        model, total_loss, completed_steps = train_model(train_dataloader, model, accelerator, optimizer, lr_scheduler, args, completed_steps, checkpointing_steps, progress_bar)

        eval_metric = eval_model(model, eval_dataloader, metric, accelerator, epoch, args)
    
        if args["with_tracking"]:
            accelerator.log(
                {
                    "accuracy": eval_metric,
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                    "step": completed_steps,
                },
                step=completed_steps,
            )

        if args["push_to_hub"] and epoch < args["num_train_epochs"] - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args["output_dir"], is_main_process=accelerator.is_main_process, save_function=accelerator.save
            )
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args["output_dir"])
                # repo.push_to_hub(
                #     commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                # )

        if args["checkpointing_steps"] == "epoch":
            output_dir = f"epoch_{epoch}"
            if args["output_dir"] is not None:
                output_dir = os.path.join(args["output_dir"], output_dir)
            accelerator.save_state(output_dir)

    if args["output_dir"] is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args["output_dir"], is_main_process=accelerator.is_main_process, save_function=accelerator.save
        )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args["output_dir"])
            # if args["push_to_hub"]:
            #     repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
        with open(os.path.join(args["output_dir"], "all_results.json"), "w") as f:
            json.dump({"eval_accuracy": eval_metric["accuracy"]}, f)

    return eval_metric["accuracy"]


In [21]:
args = {
    'num_train_epochs': 30,
    'max_train_steps': 300,
    'with_tracking': False,
    'gradient_accumulation_steps': 1,
    'silent': False,
    'resume_from_checkpoint': False,
    'output_dir': 'mc-train-out',
    'checkpointing_steps': 1000,
    'push_to_hub': False,
}
main_train_loop(train_dataloader, eval_dataloader, model, tokenizer, metric, accelerator, optimizer, lr_scheduler, args["num_train_epochs"], args, starting_epoch=0, checkpointing_steps=100)

  4%|▍         | 1/23 [00:18<06:55, 18.90s/it]

tensor(0.7035, device='mps:0', grad_fn=<DivBackward0>)


  9%|▊         | 2/23 [00:37<06:35, 18.82s/it]

tensor(0.6777, device='mps:0', grad_fn=<DivBackward0>)


  9%|▊         | 2/23 [00:47<08:19, 23.76s/it]


RuntimeError: MPS backend out of memory (MPS allocated: 12.65 GB, other allocations: 5.04 GB, max allowed: 18.13 GB). Tried to allocate 732.43 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

# Multitask

In [None]:
# from claude
import torch
from transformers import XLMRobertaForMaskedLM, XLMRobertaForMultipleChoice
from transformers import AdamW, get_linear_schedule_with_warmup

# Load the pre-trained XLM-RoBERTa base model
model_mlm = XLMRobertaForMaskedLM.from_pretrained("facebook/xlm-roberta-base")
model_mc = XLMRobertaForMultipleChoice.from_pretrained("facebook/xlm-roberta-base")

# Define the multitask model
class MultiTaskModel(torch.nn.Module):
    def __init__(self, model_mlm, model_mc):
        super().__init__()
        self.base_model = model_mlm.base_model
        self.mlm_head = model_mlm.lm_head
        self.mc_head = model_mc.classifier

    def forward(self, input_ids, attention_mask, masked_lm_labels, multiple_choice_labels):
        # Pass the input through the base model
        output = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        sequence_output = output.last_hidden_state

        # Compute the Masked Language Modeling (MLM) loss
        mlm_output = self.mlm_head(sequence_output)
        mlm_loss = torch.nn.functional.cross_entropy(mlm_output.view(-1, mlm_output.size(-1)), masked_lm_labels.view(-1))

        # Compute the Multiple Choice loss
        mc_output = self.mc_head(sequence_output[:, 0, :])  # Use the [CLS] token for multiple choice
        mc_loss = torch.nn.functional.cross_entropy(mc_output, multiple_choice_labels)

        # Combine the losses
        total_loss = mlm_loss + mc_loss
        return total_loss

# Instantiate the multitask model
model = MultiTaskModel(model_mlm, model_mc)

# Define the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
)

# Training loop
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        masked_lm_labels = batch['masked_lm_labels']
        multiple_choice_labels = batch['multiple_choice_labels']

        optimizer.zero_grad()
        loss = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            masked_lm_labels=masked_lm_labels,
            multiple_choice_labels=multiple_choice_labels
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

In [None]:
# from gpt 3.5
import torch
from transformers import XLMRobertaForMaskedLM, XLMRobertaForMultipleChoice, XLMRobertaTokenizer, AdamW
from torch.utils.data import DataLoader, Dataset

# Load tokenizer and base model
tokenizer = XLMRobertaTokenizer.from_pretrained('facebook/xlm-roberta-base')
model = XLMRobertaForMaskedLM.from_pretrained('facebook/xlm-roberta-base')

# Clone MLM head and assign it to multiple-choice head
mlm_head = model.cls  # Clone the MLM head
mc_head = torch.nn.Linear(model.config.hidden_size, num_choices)  # num_choices is the number of answer choices

# Define optimizer
optimizer = AdamW([
    {'params': model.parameters()},
    {'params': mc_head.parameters()}
], lr=5e-5)  # Adjust learning rate as needed

# Training loop
for epoch in range(num_epochs):
    for batch in dataloader:  # dataloader for a combined dataset with MLM and multiple-choice samples
        inputs, mlm_labels, mc_labels = batch
        
        # Forward pass for MLM
        mlm_outputs = model(**inputs)
        mlm_logits = mlm_outputs.logits
        
        # Calculate MLM loss
        mlm_loss = torch.nn.CrossEntropyLoss()(mlm_logits.view(-1, tokenizer.vocab_size), mlm_labels.view(-1))
        
        # Forward pass for multiple-choice
        mc_outputs = model.roberta(**inputs)  # Exclude the MLM head
        pooled_output = mc_outputs.pooler_output
        mc_logits = mc_head(pooled_output)
        
        # Calculate multiple-choice loss
        mc_loss = torch.nn.CrossEntropyLoss()(mc_logits, mc_labels)
        
        # Combined loss
        combined_loss = mlm_loss + mc_loss
        
        # Backward pass and optimization
        combined_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Evaluation and validation similar to the previous approach


---

# graveyard

In [54]:
print(model)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [55]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    # target_modules=["q_lin", "v_lin"],
    lora_dropout=0.1,
    task_type=TaskType.FEATURE_EXTRACTION,
)

In [56]:
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters() # see % trainable parameters

trainable params: 589,824 || all params: 278,633,472 || trainable%: 0.21168454592562375


In [57]:
print(lora_model)

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=

In [58]:
tokenized_datasets = tokenized_datasets.rename_column("example", "text")
tokenized_datasets

Dataset({
    features: ['score', 'text', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [59]:
toy_input = tokenizer(tokenized_datasets['text'][0][:100], return_tensors="pt")
toy_input = toy_input.to('cpu')
lora_model = lora_model.to('cpu')

In [60]:
lora_model(**toy_input)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0873,  0.1106,  0.0666,  ..., -0.0501,  0.0721, -0.0197],
         [-0.0081, -0.1164,  0.0249,  ..., -0.0507,  0.0636,  0.0125],
         [-0.0304,  0.1153,  0.0092,  ..., -0.0534, -0.0404,  0.1008],
         ...,
         [ 0.0829,  0.0443,  0.0210,  ..., -0.1380, -0.0108,  0.1600],
         [ 0.0543,  0.0591,  0.0068,  ..., -0.1866,  0.0065, -0.0482],
         [ 0.0727,  0.1022,  0.0085,  ..., -0.1361,  0.0007,  0.0193]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-3.0522e-02,  2.7087e-01,  1.1834e-01,  5.1281e-01,  4.8539e-03,
          3.6024e-01,  4.2014e-01, -4.4383e-01,  1.6263e-01, -1.5847e-01,
          1.3781e-01,  9.1416e-02,  3.8526e-01,  3.3253e-01, -2.0363e-01,
         -1.7617e-01,  1.8961e-01,  4.2476e-01, -7.3580e-02, -2.2573e-01,
         -2.4010e-01,  3.4361e-01, -6.7378e-01, -5.5283e-01, -2.1458e-01,
          5.7239e-01,  1.3006e-01, -3.1152e-01, -1.3167e-01,  6.425

In [47]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [48]:
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="test_trainer", 
        evaluation_strategy="epoch",
        num_train_epochs=1,
        per_device_train_batch_size=16,
    ),
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/7 [06:36<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

TypeError: XLMRobertaModel.forward() got an unexpected keyword argument 'labels'

## copy paste

In [2]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [3]:
toy_text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
import torch

toy_input = tokenizer(toy_text, return_tensors="pt")
token_logits = model(**toy_input).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(toy_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {toy_text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [6]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Downloading readme: 100%|██████████| 7.81k/7.81k [00:00<00:00, 15.7MB/s]
Downloading data: 100%|██████████| 21.0M/21.0M [00:01<00:00, 18.4MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 25.3MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:01<00:00, 32.1MB/s]
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 325430.46 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 386076.48 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 383714.98 examples/s]
Map:   0%|          | 0/25000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 25000/25000 [00:05<00:00, 4874.58 examples/s]
Map: 100%|██████████| 25000/25000 [00:05<00:00, 4939.38 examples/s]
Map: 100%|██████████| 50000/50000 [00:13<00:00, 3753.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [9]:
chunk_size = 128
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map: 100%|██████████| 25000/25000 [00:41<00:00, 601.71 examples/s]
Map: 100%|██████████| 25000/25000 [00:40<00:00, 612.41 examples/s]
Map: 100%|██████████| 50000/50000 [01:24<00:00, 589.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [11]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented [MASK] am curious - yellow from my video store because of all the [MASK] that surrounded it when it was first released in 1967. i also heard that at first it was seized by [MASK]. s. customs if [MASK] ever [MASK] [MASK] enter this country, therefore [MASK] a fan of films [MASK] "ree [MASK] i really had to see this for myself. < br / > < br / [MASK] the [MASK] is centered around a young swedish [MASK] student named lena [MASK] [MASK] to learn everything she can about life. in particular she wants to [MASK] her attentions to making some sort of documentary on what the [MASK] sw [MASK] thought about certain political [MASK] such'

'>>> as the vietnam war and race issues in the united states. in [MASK] asking [MASK] and ordinary denizens of stockholm about [MASK] opinions on politics, she [MASK] sex president her drama teacher, classmates, and married men. < br / > < [MASK] / > what kills me about i am curious - yellow is [MASK] 40orth ago [MASK] this was [MASK] pornog