In [3]:
from transformers import AutoModelForMaskedLM

model_checkpoint = r"D:\huggingface\distilbert\distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [4]:
distilbert_num_parameters = model.num_parameters() / 1000000
print(f"DistilBERT number of parameters:{round(distilbert_num_parameters)}M")
print(f"BERT number of parameters:110M")

DistilBERT number of parameters:67M
BERT number of parameters:110M


In [7]:
text = "This is a great [MASK]."

from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

inputs = tokenizer(text, return_tensors='pt')
token_logits = model(**inputs).logits
print(inputs['input_ids'].shape)
print(token_logits.shape)


mask_token_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
print(mask_token_logits.shape)


top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
    print(f"{text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

torch.Size([1, 8])
torch.Size([1, 8, 30522])
torch.Size([1, 30522])
This is a great deal.
This is a great success.
This is a great adventure.
This is a great idea.
This is a great feat.


## preparing data

In [8]:
from datasets import load_dataset
from datasets import load_from_disk

dataset_path = r'D:\huggingface\datasets\imdb'
# imdb_dataset = load_dataset("imdb")
imdb_dataset = load_from_disk(dataset_path)
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [9]:
sample = imdb_dataset['train'].shuffle(seed=42).select(range(3))
sample

Dataset({
    features: ['text', 'label'],
    num_rows: 3
})

In [10]:
for row in sample:
    print(f"Review:{row['text']}")
    print(f"Label:{row['label']}")

Review:There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...
Label:1
Review:This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stub your toe on the moon

In [11]:
imdb_dataset['unsupervised'][0]

{'text': 'This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie "Leon" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please "Frankie Starlight", she\'s speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the "Point of no return" and especially the "La femme Nikita" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which "translate" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you\'ll regret later :)',
 'label': -1}

In [12]:
def tokenize_function(examples):
    result = tokenizer(examples['text']) # without trunction=True
    if tokenizer.is_fast:
        result['word_ids'] = [result.word_ids(i) for i in range(len(result['input_ids']))]
    return result

tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=['text', 'label']
)

tokenized_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [13]:
tokenizer.model_max_length

512

In [16]:
chunk_size = 128

tokenized_samples = tokenized_datasets['train'][210:215]
for idx, sample in enumerate(tokenized_samples['input_ids']):
    print(f"Review {idx} length:{len(sample)}")

Review 0 length:107
Review 1 length:150
Review 2 length:112
Review 3 length:640
Review 4 length:235


In [17]:
concatenated_examples = {
    k:sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}

total_length = len(concatenated_examples['input_ids'])
print(f"Concatenated reviews length:{total_length}")

Concatenated reviews length:1244


In [18]:
concatenated_examples

{'input_ids': [101,
  1037,
  11519,
  8297,
  1010,
  2021,
  2515,
  2025,
  5308,
  1996,
  8595,
  1997,
  1996,
  2434,
  1012,
  1037,
  25303,
  11167,
  1006,
  20128,
  5912,
  1007,
  15980,
  2047,
  15702,
  1999,
  2344,
  2000,
  3622,
  2010,
  2219,
  3117,
  6644,
  2011,
  1996,
  2697,
  1012,
  2145,
  18101,
  4288,
  1010,
  2021,
  3185,
  3849,
  2200,
  4416,
  1011,
  1999,
  1011,
  5048,
  1012,
  2151,
  8562,
  2003,
  2025,
  1997,
  1996,
  6057,
  2785,
  1012,
  2561,
  2622,
  3849,
  2000,
  2031,
  1996,
  3737,
  1997,
  1037,
  4248,
  2666,
  1998,
  2012,
  2335,
  5912,
  2003,
  2126,
  2058,
  1996,
  2327,
  1012,
  2023,
  3185,
  2003,
  2055,
  1037,
  5896,
  2108,
  2128,
  15773,
  2077,
  2183,
  2000,
  1996,
  3898,
  1012,
  1012,
  1012,
  2023,
  2323,
  2031,
  3047,
  2000,
  2023,
  5896,
  1012,
  102,
  101,
  2709,
  2000,
  6644,
  2011,
  1996,
  2697,
  2515,
  2025,
  1010,
  1999,
  2151,
  2126,
  1010,
  3233,
  2039

In [19]:
concatenated_examples.keys()

dict_keys(['input_ids', 'attention_mask', 'word_ids'])

In [20]:
chunks = {
    k:[t[i:i+chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks['input_ids']:
    print(f"chunk length:{len(chunk)}")

chunk length:128
chunk length:128
chunk length:128
chunk length:128
chunk length:128
chunk length:128
chunk length:128
chunk length:128
chunk length:128
chunk length:92


In [21]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k:sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # split by chunks of max_len
    result = {
        k:[t[i:i+chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result['labels'] = result['input_ids'].copy()

    return result


lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [22]:
len(lm_datasets['train'][1]['input_ids'])

128

In [25]:
lm_datasets['train'][0]['input_ids']

[101,
 1045,
 12524,
 1045,
 2572,
 8025,
 1011,
 3756,
 2013,
 2026,
 2678,
 3573,
 2138,
 1997,
 2035,
 1996,
 6704,
 2008,
 5129,
 2009,
 2043,
 2009,
 2001,
 2034,
 2207,
 1999,
 3476,
 1012,
 1045,
 2036,
 2657,
 2008,
 2012,
 2034,
 2009,
 2001,
 8243,
 2011,
 1057,
 1012,
 1055,
 1012,
 8205,
 2065,
 2009,
 2412,
 2699,
 2000,
 4607,
 2023,
 2406,
 1010,
 3568,
 2108,
 1037,
 5470,
 1997,
 3152,
 2641,
 1000,
 6801,
 1000,
 1045,
 2428,
 2018,
 2000,
 2156,
 2023,
 2005,
 2870,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 1996,
 5436,
 2003,
 8857,
 2105,
 1037,
 2402,
 4467,
 3689,
 3076,
 2315,
 14229,
 2040,
 4122,
 2000,
 4553,
 2673,
 2016,
 2064,
 2055,
 2166,
 1012,
 1999,
 3327,
 2016,
 4122,
 2000,
 3579,
 2014,
 3086,
 2015,
 2000,
 2437,
 2070,
 4066,
 1997,
 4516,
 2006,
 2054,
 1996,
 2779,
 25430,
 14728,
 2245,
 2055,
 3056,
 2576,
 3314,
 2107]

In [24]:
lm_datasets['train'][1]['input_ids']

[2004,
 1996,
 5148,
 2162,
 1998,
 2679,
 3314,
 1999,
 1996,
 2142,
 2163,
 1012,
 1999,
 2090,
 4851,
 8801,
 1998,
 6623,
 7939,
 4697,
 3619,
 1997,
 8947,
 2055,
 2037,
 10740,
 2006,
 4331,
 1010,
 2016,
 2038,
 3348,
 2007,
 2014,
 3689,
 3836,
 1010,
 19846,
 1010,
 1998,
 2496,
 2273,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 2054,
 8563,
 2033,
 2055,
 1045,
 2572,
 8025,
 1011,
 3756,
 2003,
 2008,
 2871,
 2086,
 3283,
 1010,
 2023,
 2001,
 2641,
 26932,
 1012,
 2428,
 1010,
 1996,
 3348,
 1998,
 16371,
 25469,
 5019,
 2024,
 2261,
 1998,
 2521,
 2090,
 1010,
 2130,
 2059,
 2009,
 1005,
 1055,
 2025,
 2915,
 2066,
 2070,
 10036,
 2135,
 2081,
 22555,
 2080,
 1012,
 2096,
 2026,
 2406,
 3549,
 2568,
 2424,
 2009,
 16880,
 1010,
 1999,
 4507,
 3348,
 1998,
 16371,
 25469,
 2024,
 1037,
 2350,
 18785,
 1999,
 4467,
 5988,
 1012,
 2130,
 13749,
 7849,
 24544,
 1010]

In [23]:
tokenizer.decode(lm_datasets['train'][1]['input_ids'])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [26]:
tokenizer.decode(lm_datasets['train'][1]['labels'])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

## finetuning with trainer API

In [27]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [28]:
samples = [lm_datasets['train'][i] for i in range(2)]
for sample in samples:
    _ = sample.pop('word_ids')

samples

[{'input_ids': [101,
   1045,
   12524,
   1045,
   2572,
   8025,
   1011,
   3756,
   2013,
   2026,
   2678,
   3573,
   2138,
   1997,
   2035,
   1996,
   6704,
   2008,
   5129,
   2009,
   2043,
   2009,
   2001,
   2034,
   2207,
   1999,
   3476,
   1012,
   1045,
   2036,
   2657,
   2008,
   2012,
   2034,
   2009,
   2001,
   8243,
   2011,
   1057,
   1012,
   1055,
   1012,
   8205,
   2065,
   2009,
   2412,
   2699,
   2000,
   4607,
   2023,
   2406,
   1010,
   3568,
   2108,
   1037,
   5470,
   1997,
   3152,
   2641,
   1000,
   6801,
   1000,
   1045,
   2428,
   2018,
   2000,
   2156,
   2023,
   2005,
   2870,
   1012,
   1026,
   7987,
   1013,
   1028,
   1026,
   7987,
   1013,
   1028,
   1996,
   5436,
   2003,
   8857,
   2105,
   1037,
   2402,
   4467,
   3689,
   3076,
   2315,
   14229,
   2040,
   4122,
   2000,
   4553,
   2673,
   2016,
   2064,
   2055,
   2166,
   1012,
   1999,
   3327,
   2016,
   4122,
   2000,
   3579,
   2014,
   3086,
   20

In [29]:
for chunk in data_collator(samples)['input_ids']: # token mask
    print(f"{tokenizer.decode(chunk)}")

[CLS] i rented i am curious - yellow from my video [MASK] because of all the [MASK] that surrounded it when [MASK] was first released in 1967. i also heard that at first it was seized by u. s. customs if it [MASK] [MASK] [MASK] enter this country, therefore being [MASK] fan of films considered " controversial " i really had to see this for myself [MASK] [MASK] br / > [MASK] br / > the plot is centered around a young swedish drama student named lena [MASK] wants to yemen everything she can about life. in particular she wants to focus her attentions to [MASK] some sort stale documentary on what the average sw [MASK] thought about certain political issues such
as the vietnam war and race issues in the united states. in between [MASK] politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama [MASK] [MASK] classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is [MASK] 40 years ago, this was [MASK] pornogra

In [32]:
# 全词掩码 whole word mask
import collections
import numpy as np
from transformers import default_data_collator
wwm_probability = 0.2

def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop('word_ids')

        # create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None: # special tokens
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)


        # randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature['input_ids']
        labels = feature['labels']
        new_labels = [-100] * len(labels) # labels are all -100 except for the ones corresponding to mask words.
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature['labels'] = new_labels

    return default_data_collator(features)





In [33]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i [MASK] i am curious - yellow from my video store because [MASK] all the controversy that surrounded it when it was first released in 1967. i also [MASK] that at first it was [MASK] by [MASK]. s. customs if it ever [MASK] to enter this [MASK], therefore being [MASK] fan of [MASK] considered " controversial " i [MASK] had to see this for [MASK]. < br / > < br / > [MASK] plot is centered around a young swedish drama student named lena [MASK] wants to [MASK] everything she [MASK] [MASK] life. in particular she wants to [MASK] her attentions to [MASK] some sort of documentary on what the [MASK] swede thought about certain political issues such'

'>>> [MASK] the vietnam [MASK] and race issues in the united [MASK]. in [MASK] [MASK] politicians [MASK] ordinary denizens of [MASK] about [MASK] opinions on politics, she has sex with her drama teacher, classmates, and married [MASK]. < br / > < br / [MASK] what kills me about i am curious - [MASK] [MASK] that 40 [MASK] ago, this was 

In [34]:
train_size = 10000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets['train'].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [35]:
from transformers import TrainingArguments, Trainer

batch_size = 64
logging_steps = len(downsampled_dataset['train']) // batch_size
model_name = model_checkpoint.split('/')[-1]

training_args = TrainingArguments(
    output_dir=f'D:\huggingface\{model_name}-finetuned-imdb', # no 中文路径
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset['train'],
    eval_dataset=downsampled_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer
)


import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}") # 21.75






ConnectionError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/repos/create (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207523A7970>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。'))"), '(Request ID: 5ecfd10c-afb0-4cd7-be2d-eb84dc1bc6f9)')

In [None]:
trainer.train()
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}") # 11.32

## finetuning with accelerate

In [None]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}


downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)


from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=default_data_collator)

model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)


from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

from tqdm import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[:len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(loss))
    except OverflowError:
        perplexity = float('inf')

    print(f"epoch {epoch}, perplexity:{perplexity}")

    