In [1]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [2]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [3]:
text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [6]:
from datasets import load_dataset, DatasetDict

train_dataset = load_dataset('json', data_files='./friends-1-227-Rachel-pair.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='./eval.jsonl', split='train')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
train_dataset_restructured = train_dataset.map(lambda x: {'question': x['0']['dialogue'], 'answer': x['1']['dialogue']})
eval_dataset_restructured = eval_dataset.map(lambda x: {'question': x['0']['dialogue'], 'answer': x['1']['dialogue']})


data = DatasetDict({
    "train": train_dataset_restructured,
    "test": eval_dataset_restructured
})

data = data.remove_columns(['0', '1'])
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 4503
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 237
    })
})

In [8]:
sample = data["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Question: {row['question']}'")
    print(f"\n'>>> Answer: {row['answer']}'")


'>>> Question: And I got them a book on Karma Sutra for the elderly.'

'>>> Answer: Hey, do you guys have any extra ribbon?'

'>>> Question: Hi!'

'>>> Answer: And you know Monica and Ross!'

'>>> Question: Oh yeah!'

'>>> Answer: Tails!'


In [9]:
def tokenize_function(examples):
    result = tokenizer(examples["answer"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = data.map(
    tokenize_function, batched=True, remove_columns=["question", "answer"]
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 4503
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 237
    })
})

In [10]:
tokenizer.model_max_length

512

In [11]:
chunk_size = 128

In [12]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Answer {idx} length: {len(sample)}'")

'>>> Answer 0 length: 13'
'>>> Answer 1 length: 9'
'>>> Answer 2 length: 20'


In [13]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated answers length: {total_length}'")

'>>> Concatenated answers length: 42'


In [14]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 42'


In [15]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [16]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 375
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 19
    })
})

In [17]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

"[SEP] [CLS] a lot. [SEP] [CLS] so, got any advice? y'know, as someone who's recently been - dumped? [SEP] [CLS] remember when we were in high school together? [SEP] [CLS] oh! man, i never thought i'd be here.. [SEP] [CLS] are you sure? [SEP] [CLS] i'm - uh - i'm okay... you look great! [SEP] [CLS] oh, not much. i - i got a job. [SEP] [CLS] why are - why are you so tanned? [SEP] [CLS] oh no. you went on our honeymoon alone? [SEP] [CLS] mindy?! my maid of honour, mind"

In [18]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

"[SEP] [CLS] a lot. [SEP] [CLS] so, got any advice? y'know, as someone who's recently been - dumped? [SEP] [CLS] remember when we were in high school together? [SEP] [CLS] oh! man, i never thought i'd be here.. [SEP] [CLS] are you sure? [SEP] [CLS] i'm - uh - i'm okay... you look great! [SEP] [CLS] oh, not much. i - i got a job. [SEP] [CLS] why are - why are you so tanned? [SEP] [CLS] oh no. you went on our honeymoon alone? [SEP] [CLS] mindy?! my maid of honour, mind"

In [19]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [20]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] everything you need to know is in that [MASK] kiss. [SEP] [CLS] has anybody seen my engagement ring? [SEP] [CLS] oh [MASK], [MASK]ttes [MASK] oh god oh god oh明 oh [MASK].... [SEP] [CLS] [MASK]hhhh, don't be mad... [SEP] [CLS] oh, i am sorry [MASK].. [SEP] [CLS] [MASK] [MASK] but look how [MASK] those hovered are [MASK] [SEP] [CLS] so what are [MASK] gonna do? [SEP] [CLS] you're twins? [SEP] [CLS] all right, [MASK] guys [MASK] i kinda gotta [MASK] up now. [SEP] [CLS] i'm [MASK] cleaning up [MASK] [SEP] [CLS] uh [MASK]. okay, sure! [MASK]!'

'>>> [SEP] [CLS] [MASK] lot. [SEP] [CLS] [MASK], got any advice? y'know, [MASK] someone who's [MASK] been [MASK] dumped? [SEP] [CLS] remember when we were in high school together? [SEP] [CLS] oh! man, i never thought i'd be here.. [SEP] [CLS] are [MASK] sure? [SEP] [CLS] i'm - uh - i'm okay.. [MASK] you look pup! [SEP] [CLS] [MASK] [MASK] not much. i - i got [MASK] job. [SEP] [CLS] why are - why are you [MASK] tanned? [SEP] [CLS] [MASK] n

In [21]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [22]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] [MASK] you need to know is in [MASK] [MASK] [MASK]. [SEP] [CLS] has anybody seen my engagement ring [MASK] [SEP] [CLS] oh god, oh god, oh god oh [MASK] oh god oh god... [MASK] [SEP] [CLS] [MASK] [MASK] [MASK], [MASK] [MASK] t be mad... [SEP] [CLS] oh, [MASK] am sorry. [MASK]. [SEP] [CLS] oh, but look [MASK] straight [MASK] [MASK] are! [SEP] [CLS] [MASK] what are you gonna do? [SEP] [CLS] you're twins [MASK] [SEP] [CLS] all [MASK], you guys, i kinda gotta clean up [MASK]. [SEP] [CLS] i'm just cleaning [MASK]. [SEP] [CLS] uh.. [MASK], sure! thanks [MASK]'

'>>> [SEP] [CLS] a lot. [SEP] [CLS] [MASK] [MASK] got any advice? y'[MASK], as [MASK] who [MASK] [MASK] recently been - dumped? [SEP] [CLS] remember [MASK] we were in high school together? [SEP] [CLS] oh [MASK] man [MASK] i never thought i'd be here. [MASK] [SEP] [CLS] are you sure? [SEP] [CLS] i [MASK] m - uh - i'm okay. [MASK] [MASK] [MASK] look great! [SEP] [CLS] oh, not much. i [MASK] i got a job [MASK] [SEP] [CLS] why 

In [23]:
train_size = 300
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 300
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 30
    })
})

In [24]:
from huggingface_hub import interpreter_login

interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (osxkeychain,s

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-rachel-mask",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    # fp16=True,
    logging_steps=logging_steps,
)

In [26]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


In [27]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/1 [00:00<?, ?it/s]

>>> Perplexity: 1.44


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
trainer.train()

  0%|          | 0/15 [00:00<?, ?it/s]

{'loss': 0.3797, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.310154527425766, 'eval_runtime': 0.2896, 'eval_samples_per_second': 103.605, 'eval_steps_per_second': 3.454, 'epoch': 1.0}
{'loss': 0.3423, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.6}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.2973397970199585, 'eval_runtime': 1.5496, 'eval_samples_per_second': 19.359, 'eval_steps_per_second': 0.645, 'epoch': 2.0}
{'loss': 0.3063, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.4}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.28836187720298767, 'eval_runtime': 1.4308, 'eval_samples_per_second': 20.967, 'eval_steps_per_second': 0.699, 'epoch': 3.0}
{'train_runtime': 49.8657, 'train_samples_per_second': 18.048, 'train_steps_per_second': 0.301, 'train_loss': 0.3375093460083008, 'epoch': 3.0}


TrainOutput(global_step=15, training_loss=0.3375093460083008, metrics={'train_runtime': 49.8657, 'train_samples_per_second': 18.048, 'train_steps_per_second': 0.301, 'train_loss': 0.3375093460083008, 'epoch': 3.0})

In [83]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/1 [00:00<?, ?it/s]

>>> Perplexity: 1.32


In [84]:
trainer.push_to_hub()

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.35k [00:00<?, ?B/s]

events.out.tfevents.1709724809.Emirs-MacBook-Pro.local.74809.1:   0%|          | 0.00/354 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1709724731.Emirs-MacBook-Pro.local.74809.0:   0%|          | 0.00/6.16k [00:00<?, ?B/s]

KeyboardInterrupt: 

In [30]:
text = "hi i am [MASK]"

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits

trainer.predict(token_logits)

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [29]:
text = "hi i am [MASK]"

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

RuntimeError: Placeholder storage has not been allocated on MPS device!