In [1]:
import os 

# Set CUDA_VISIBLE_DEVICES before the transformers import below, presumably so
# that only GPU index 3 is visible to the frameworks it loads — TODO confirm
# the device index is valid on the target machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw WikiText-103 corpus via load_dataset; the summary displayed
# below lists its test/train/validation splits, each with a single "text" column.
datasets = load_dataset('wikitext', 'wikitext-103-raw-v1')
datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [3]:
datasets_i = load_dataset('imdb').remove_columns("label")

# Carve the "unsupervised" split into contiguous train/validation/test slices:
# 90% for train, 5% for validation, and whatever remains for test. Rows are
# taken in their stored order; no shuffling happens here.
train_size = int(len(datasets_i["unsupervised"]) * 0.9)
val_size = int(len(datasets_i["unsupervised"]) * 0.05)
test_size = len(datasets_i["unsupervised"]) - train_size - val_size

split_start = 0
for split_name, split_len in (("train", train_size), ("validation", val_size), ("test", test_size)):
    # Each slice begins where the previous one ended.
    split_rows = range(split_start, split_start + split_len)
    datasets_i[split_name] = datasets_i["unsupervised"].select(split_rows)
    split_start += split_len

datasets_i

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2500
    })
    unsupervised: Dataset({
        features: ['text'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2500
    })
})

In [16]:
# Local fine-tuning checkpoint; the model weights are loaded from this path
# further down with AutoModelForCausalLM.from_pretrained(model_checkpoint).
model_checkpoint = "./distilgpt2-finetuned-imdb/checkpoint-13183"
# The tokenizer is loaded from the base "distilgpt2" name rather than from the
# checkpoint directory — presumably the checkpoint folder holds no tokenizer
# files; TODO confirm.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)

In [17]:
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` column of a batch.

    Args:
        examples: Mapping that contains a ``"text"`` entry holding the raw
            strings of the batch.

    Returns:
        Whatever ``tokenizer`` returns for those strings.
    """
    texts = examples["text"]
    return tokenizer(texts)

# Tokenize every WikiText split in batches using 4 worker processes. The raw
# "text" column is removed, so only the tokenizer's output columns remain
# (see the summary displayed below).
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3760
    })
})

In [18]:
# Tokenize every split of the IMDB-derived dataset with the same settings as
# the WikiText call above. This includes the full "unsupervised" split, whose
# rows are the source of the train/validation/test slices, so those rows are
# tokenized twice.
tokenized_datasets_i = datasets_i.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
tokenized_datasets_i

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2500
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2500
    })
})

In [19]:
# Number of tokens in each fixed-length chunk produced by group_texts.
# Alternative left by the author: block_size = tokenizer.model_max_length
block_size = 128

In [8]:
def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name (for instance ``"input_ids"`` and
            ``"attention_mask"``) to a list of per-row sequences. All columns
            are expected to have the same total length once flattened.

    Returns:
        dict: The same columns, each now a list of chunks holding exactly
        ``block_size`` items, plus a ``"labels"`` column that is a shallow
        copy of the ``"input_ids"`` chunk list.

    Notes:
        ``block_size`` is read from the notebook's global namespace and must
        be defined before this function is called. Items at the end of the
        batch that do not fill a whole block are dropped.
    """
    from itertools import chain

    # Flatten each column in linear time. The previous ``sum(rows, [])`` form
    # re-copied the growing accumulator on every addition, which is quadratic
    # in the number of tokens per batch.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}

    # Every column has the same flattened length, so measure the first one.
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Round down to a whole number of blocks; the remainder is discarded.
    total_length = (total_length // block_size) * block_size

    # Slice every column into consecutive block_size-long chunks.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels start out as the same chunks as the inputs.
    result["labels"] = result["input_ids"].copy()
    return result


# Re-chunk the tokenized WikiText splits into fixed-length blocks with
# group_texts, using batches of 1000 rows and 4 worker processes. group_texts
# drops whatever does not fill a whole block at the end of each batch it receives.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

lm_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2209
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 920359
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1931
    })
})

In [20]:
# Apply the same fixed-length chunking to the IMDB-derived splits, in batches
# of 1000 rows spread over 4 worker processes.
lm_datasets_i = tokenized_datasets_i.map(
    group_texts,
    num_proc=4,
    batch_size=1000,
    batched=True,
)

lm_datasets_i

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 105457
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5876
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 117365
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6030
    })
})

In [21]:
# Load the causal language model from the path stored in model_checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [11]:
# Use the final path component of the checkpoint as the run name.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Three epochs, with an evaluation pass and a saved checkpoint at the end of
# each one; nothing is pushed to the Hub.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-wikitext103",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
)

In [24]:
# Build the Trainer around the loaded model; both the training and the
# evaluation data come from the IMDB-derived lm_datasets_i splits.
# NOTE(review): training_args names its output directory
# "...-finetuned-wikitext103", yet the data passed here is lm_datasets_i
# rather than lm_datasets — confirm which corpus is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets_i["train"],
    eval_dataset=lm_datasets_i["validation"],
)

In [25]:
# Baseline: evaluate on the Trainer's eval_dataset before calling train() and
# report perplexity as exp(eval_loss).
eval_results_before = trainer.evaluate()
print(f"Perplexity before training: {math.exp(eval_results_before['eval_loss']):.2f}")

Perplexity before training: 48.09


In [12]:
# Run the training loop configured by training_args.
trainer.train()

# Save the resulting model to its own "_fin" directory, then evaluate again on
# the Trainer's eval_dataset and report perplexity as exp(eval_loss).
trainer.save_model(f"{model_name}-finetuned-wikitext103_fin")
eval_results_after = trainer.evaluate()
print(f"Perplexity after training: {math.exp(eval_results_after['eval_loss']):.2f}")

Epoch,Training Loss,Validation Loss
1,3.4825,3.351699
2,3.4221,3.30482
3,3.3813,3.288763


Perplexity after training: 26.81


In [1]:
# Evaluate on the held-out test split and report perplexity as exp(eval_loss).
# This cell needs `trainer` from the Trainer cell in the same kernel session;
# the recorded run raised NameError because it was executed without it.
# NOTE(review): the message says "wikitext103", but the dataset passed is
# lm_datasets_i["test"], which is derived from the IMDB data — confirm whether
# lm_datasets["test"] or a different message was intended.
eval_results_test = trainer.evaluate(eval_dataset=lm_datasets_i["test"])
print(f"Perplexity on wikitext103 test dataset: {math.exp(eval_results_test['eval_loss']):.2f}")


NameError: name 'trainer' is not defined