- https://huggingface.co/docs/transformers/en/training
- https://huggingface.co/docs/transformers/en/peft
- https://huggingface.co/docs/peft/quicktour
- https://jaotheboss.medium.com/peft-with-bert-8763d8b8a4ca
- https://huggingface.co/learn/nlp-course/en/chapter7/3

In [46]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_from_disk
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    PreTrainedTokenizerBase,
    SchedulerType,
    default_data_collator,
    get_scheduler,
    AutoModel,
    XLMRobertaTokenizer,
    XLMRobertaXLModel,
    AutoModelForMaskedLM,
    XLMRobertaXLConfig,
    XLMRobertaXLForMultipleChoice
)
from torch.utils.data import DataLoader
import torch
from transformers import DataCollatorForLanguageModeling

In [2]:
LANG_CODE = 'yo'
DSETSIZE = 10000
SCORER = 'bm25'
NUM_EXAMPLES = 100
dataset = load_from_disk(f"../culturaldataset/select_datasets/{LANG_CODE}/{SCORER}-{DSETSIZE}")

In [3]:
dataset[0]

{'score': 0.781022583199985,
 'example': 'AfonrereYORUBA Gbode\nE kaabo si Afonrere YORUBA\nIwe Irohin fun imugbooro, ilosiwaju ati idagbasoke Yoruba\nÌwé Ìròhìn fún Àmúgbòòrò, Ìlọsíwájú àti Ìdàgbàsókè Yorùbá\nẸ káábọ̀, ẹ̀yin alárá wa. Ẹ̀yà Yorùbá wà káàkiri gbogbo àgbáyé. Wọ́n pọ̀, wọ́n gbọ́n, wọ́n ní òye, àsà, ọ̀làjú, ètò bí a se ń se Ìlú àti àkóso. Bẹ́ẹ̀ni wọ́n sì jáfáfá. A kò se iyèméjì pé bí ẹ̀yà Yorùbá bá rí ọwọ́ mú lágbàńlá-ayé, Aláwọ̀dúdú rí ọwọ́ mú nìyan. Ìdà kejì ọ̀rọ̀ yìi rí bẹ́ẹ̀, àmọ́ a ò gbàdúrà rẹ̀.\nÀdúrà nìkan kò tó sá. A gbọdọ̀ sisé tọ̀ ọ́ ni. Èyí ló gbún wa ní kẹ́sẹ́ láti dá Ìwé Ìròhìn Afọnrere YORÙBÁ ỳií sílẹ̀. Èdè Yorùbá ni a ó maa fi kọ ọ́. A ó ma tu díẹ̀ nínù Ìròhìn wa sí èdè Gẹ̀ẹ́sì, èdè Faransé àti Śpáníisì ní Ìdákọ̀ọ̀kan.\nKí ẹ máa bá wa kálọ.\nẸ fi Ìwé yín sọwọ́ sí wa ní afonrereyoruba@yahoo.com\nJÀRE ÀJÀYÍ\n

In [37]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
model = AutoModel.from_pretrained("FacebookAI/xlm-roberta-base")
# tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
# model = AutoModel.from_pretrained("distilbert/distilbert-base-uncased")
model.to('cpu')
tokenizer.to('cpu')

AttributeError: 'XLMRobertaTokenizerFast' object has no attribute 'to'

In [53]:
def tokenize_function(examples):
    return tokenizer(examples["example"], padding="max_length", truncation=True, return_tensors="pt")
tokenized_dataset = dataset.select(range(NUM_EXAMPLES)).map(tokenize_function, batched=True)
tokenized_dataset


[A
Map: 100%|██████████| 100/100 [00:00<00:00, 402.35 examples/s]


Dataset({
    features: ['score', 'example', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [54]:
print(model)

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [55]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    # target_modules=["q_lin", "v_lin"],
    lora_dropout=0.1,
    task_type=TaskType.FEATURE_EXTRACTION,
)

In [56]:
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters() # see % trainable parameters

trainable params: 589,824 || all params: 278,633,472 || trainable%: 0.21168454592562375


In [57]:
print(lora_model)

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=

In [58]:
tokenized_dataset = tokenized_dataset.rename_column("example", "text")
tokenized_dataset

Dataset({
    features: ['score', 'text', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [59]:
inputs = tokenizer(tokenized_dataset['text'][0][:100], return_tensors="pt")
inputs = inputs.to('cpu')
lora_model = lora_model.to('cpu')

In [60]:
lora_model(**inputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0873,  0.1106,  0.0666,  ..., -0.0501,  0.0721, -0.0197],
         [-0.0081, -0.1164,  0.0249,  ..., -0.0507,  0.0636,  0.0125],
         [-0.0304,  0.1153,  0.0092,  ..., -0.0534, -0.0404,  0.1008],
         ...,
         [ 0.0829,  0.0443,  0.0210,  ..., -0.1380, -0.0108,  0.1600],
         [ 0.0543,  0.0591,  0.0068,  ..., -0.1866,  0.0065, -0.0482],
         [ 0.0727,  0.1022,  0.0085,  ..., -0.1361,  0.0007,  0.0193]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-3.0522e-02,  2.7087e-01,  1.1834e-01,  5.1281e-01,  4.8539e-03,
          3.6024e-01,  4.2014e-01, -4.4383e-01,  1.6263e-01, -1.5847e-01,
          1.3781e-01,  9.1416e-02,  3.8526e-01,  3.3253e-01, -2.0363e-01,
         -1.7617e-01,  1.8961e-01,  4.2476e-01, -7.3580e-02, -2.2573e-01,
         -2.4010e-01,  3.4361e-01, -6.7378e-01, -5.5283e-01, -2.1458e-01,
          5.7239e-01,  1.3006e-01, -3.1152e-01, -1.3167e-01,  6.425

In [47]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [48]:
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="test_trainer", 
        evaluation_strategy="epoch",
        num_train_epochs=1,
        per_device_train_batch_size=16,
    ),
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/7 [06:36<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

TypeError: XLMRobertaModel.forward() got an unexpected keyword argument 'labels'

## copy paste

In [2]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [3]:
text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [6]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Downloading readme: 100%|██████████| 7.81k/7.81k [00:00<00:00, 15.7MB/s]
Downloading data: 100%|██████████| 21.0M/21.0M [00:01<00:00, 18.4MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 25.3MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:01<00:00, 32.1MB/s]
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 325430.46 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 386076.48 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 383714.98 examples/s]
Map:   0%|          | 0/25000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 25000/25000 [00:05<00:00, 4874.58 examples/s]
Map: 100%|██████████| 25000/25000 [00:05<00:00, 4939.38 examples/s]
Map: 100%|██████████| 50000/50000 [00:13<00:00, 3753.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [9]:
chunk_size = 128
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map: 100%|██████████| 25000/25000 [00:41<00:00, 601.71 examples/s]
Map: 100%|██████████| 25000/25000 [00:40<00:00, 612.41 examples/s]
Map: 100%|██████████| 50000/50000 [01:24<00:00, 589.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [11]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented [MASK] am curious - yellow from my video store because of all the [MASK] that surrounded it when it was first released in 1967. i also heard that at first it was seized by [MASK]. s. customs if [MASK] ever [MASK] [MASK] enter this country, therefore [MASK] a fan of films [MASK] "ree [MASK] i really had to see this for myself. < br / > < br / [MASK] the [MASK] is centered around a young swedish [MASK] student named lena [MASK] [MASK] to learn everything she can about life. in particular she wants to [MASK] her attentions to making some sort of documentary on what the [MASK] sw [MASK] thought about certain political [MASK] such'

'>>> as the vietnam war and race issues in the united states. in [MASK] asking [MASK] and ordinary denizens of stockholm about [MASK] opinions on politics, she [MASK] sex president her drama teacher, classmates, and married men. < br / > < [MASK] / > what kills me about i am curious - yellow is [MASK] 40orth ago [MASK] this was [MASK] pornog