In [30]:
%pip install -U transformers datasets peft trl bitsandbytes --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "openai-community/gpt2-medium"
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/292 [00:00<?, ?it/s]



In [11]:
def chat_template(instruction: str, output: str | None = None) -> str:
    user_tag = "<|User|>"
    assistant_tag = "<|Assistant|>"
    prompt = f"{user_tag}\n{instruction}\n{assistant_tag}\n"

    if output:
        prompt += output
    return prompt

In [4]:
from pprint import pprint

text = "What is the president of the United States doing all day?"

pprint(chat_template(text))

('<|User|>\n'
 'What is the president of the United States doing all day?\n'
 '<|Assistant|>\n')


In [6]:
token_ids = tokenizer(chat_template(text), return_tensors="pt")
token_ids = {k: v.to(device) for k, v in token_ids.items()}
token_ids

{'input_ids': tensor([[   27,    91, 12982,    91,    29,   198,  2061,   318,   262,  1893,
            286,   262,  1578,  1829,  1804,   477,  1110,    30,   198,    27,
             91, 48902,    91,    29,   198]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1]], device='cuda:0')}

In [13]:
outputs = model.generate(**token_ids, max_new_tokens=50, repetition_penalty=1.2, do_sample=True, temperature=0.7)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [14]:
pprint(tokenizer.decode(outputs[0], skip_special_tokens=True))

('<|User|>\n'
 'What is the president of the United States doing all day?\n'
 '<|Assistant|>\n'
 "So when do we start seeing signs, like this one: 'We're not going to let you "
 "out until our troops are safe?' (or something similar). What does that mean "
 'in your opinion and how can everyone avoid it without being a danger '
 'yourself')


# Chat-Instruction Fine Tuning

In [3]:
from datasets import load_dataset

dataset = load_dataset("yahma/alpaca-cleaned", split="train", streaming=True)

In [6]:
def filter_empty_input(sample):
    return sample["input"] == ""

filtered_dataset = dataset.filter(filter_empty_input)

In [7]:
def _tokenize(sample):
    prompt = f"<|User|>\n{sample['instruction']}\n\n<|Assistant|>\n{sample['output']}{tokenizer.eos_token}"
    return tokenizer(prompt, truncation=True, max_length=512)

tokenized_stream = filtered_dataset.map(
    _tokenize,
    remove_columns=list(filtered_dataset.features)
)

In [8]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./gpt2_sft-chat",
    max_steps=500,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    optim="adamw_8bit"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_stream,
    data_collator=data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,2.152925
20,2.0053
30,2.000811
40,1.921167
50,1.99145
60,1.906825
70,1.947467
80,1.882538
90,1.884331
100,1.83337


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=500, training_loss=1.8212048683166504, metrics={'train_runtime': 1086.8602, 'train_samples_per_second': 3.68, 'train_steps_per_second': 0.46, 'total_flos': 2014972149473280.0, 'train_loss': 1.8212048683166504, 'epoch': 1.0})

# Test

In [9]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [13]:
from pprint import pprint

text = "What is the president of the United States doing all day?"

prompt = chat_template(text)
token_ids = tokenizer(chat_template(text), return_tensors="pt")
token_ids = {k: v.to(device) for k, v in token_ids.items()}

outputs = model.generate(**token_ids, max_new_tokens=50, repetition_penalty=1.2, do_sample=True, temperature=0.7)
pprint(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


('<|User|>\n'
 'What is the president of the United States doing all day?\n'
 '<|Assistant|>\n'
 'The President works from the White House in the morning to conduct official '
 'business, including meetings with his Cabinet members and legislative '
 'representatives. He then begins working at the West Wing where he meets '
 'regularly for breakfast on the South Lawn before taking off to continue the '
 'day')


In [15]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
repo_id = "CKeibel/gpt2-medium-chat"
tokenizer.push_to_hub(repo_id)
trainer.model.push_to_hub(repo_id)


README.md: 0.00B [00:00, ?B/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...z3q0ha9/model.safetensors:   0%|          |  549kB / 1.42GB            

CommitInfo(commit_url='https://huggingface.co/CKeibel/gpt2-medium-chat/commit/539685c5b65d244bd5ab825fd700b265e02dd415', commit_message='Upload model', commit_description='', oid='539685c5b65d244bd5ab825fd700b265e02dd415', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CKeibel/gpt2-medium-chat', endpoint='https://huggingface.co', repo_type='model', repo_id='CKeibel/gpt2-medium-chat'), pr_revision=None, pr_num=None)