### Mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Dependencies

In [11]:
# %pip install evaluate
# %pip install rouge_score

In [4]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, pipeline
import evaluate
import math
import torch


### Load Dataset

In [5]:
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", cache_dir="./data")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [6]:
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

### Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
        return_attention_mask=True
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Small subsets becuse of VRAM problem
train_subset = dataset["train"].select(range(1000))
eval_subset = dataset["validation"].select(range(50))

tokenized_train = train_subset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_subset.map(tokenize_function, batched=True, remove_columns=["text"])

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

### Load model

In [8]:
model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir='./model')

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

### Trainer

#### Metric function

In [12]:
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    preds = torch.tensor(logits).argmax(dim=-1)

    labels = torch.tensor(labels)
    labels[labels == -100] = tokenizer.pad_token_id

    shift_logits = torch.tensor(logits)[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    loss = loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1)
    )
    perplexity = math.exp(loss.item()) if loss.item() < 20 else float("inf")

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_result = rouge_metric.compute(predictions=pred_texts, references=label_texts)
    bleu_result = bleu_metric.compute(predictions=pred_texts, references=label_texts)

    return {
        "perplexity": perplexity,
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_result["bleu"]
    }

#### Training Arguments

In [15]:
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    eval_strategy="epoch",
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,
    logging_steps=100,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics
)

In [16]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss,Perplexity,Rouge1,Rouge2,Rougel,Bleu
1,0.8129,0.614079,34.346258,0.199797,0.044497,0.152788,0.081564
2,0.72,0.603646,32.390386,0.201504,0.047546,0.157798,0.087873
3,0.7411,0.599084,31.605668,0.198765,0.046129,0.156663,0.085535
4,0.8567,0.59929,31.695565,0.198161,0.046788,0.158145,0.088785
5,0.7025,0.601369,32.130254,0.196728,0.045485,0.157735,0.089083
6,0.8332,0.602918,32.462749,0.19647,0.046863,0.159351,0.088738
7,0.625,0.603155,32.537368,0.195604,0.04653,0.158544,0.088621
8,0.7441,0.603916,32.69925,0.197835,0.048675,0.159589,0.094843
9,0.6888,0.604084,32.736294,0.196962,0.048443,0.158986,0.094679
10,0.7397,0.604046,32.730246,0.196316,0.047671,0.158542,0.093705


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=5000, training_loss=0.8254198089599609, metrics={'train_runtime': 897.1476, 'train_samples_per_second': 11.146, 'train_steps_per_second': 5.573, 'total_flos': 1306460160000000.0, 'train_loss': 0.8254198089599609, 'epoch': 10.0})

### Save model

### Generate Text

In [19]:
pipe = pipeline(
    "text-generation",
    model=model="./gpt2-finetuned",
    tokenizer=tokenizer,
    device=0
)

prompt = "Once upon a time in a distant galaxy"

outputs = pipe(
    prompt,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_k=50
)

print("\nGenerated text:\n", outputs[0]["generated_text"])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated text:
 Once upon a time in a distant galaxy, the light of these dying stars was passing through them, and then passing through the Milky Way. This was the last time a single star had ever come into existence, and the last time that a single one had been present in the Universe. The first stars that had been observed to interact with the sun were born there, and they had no visible presence. The first known's existence was in the form of a single's disc in the vicinity of the sun. The name of the planet's disc was given by the Greek god Dionysus, who described it as the " last remnant of the sun ". 

