https://pypi.org/project/PyYAML/

In [1]:
%env TF_CPP_MIN_LOG_LEVEL=3

env: TF_CPP_MIN_LOG_LEVEL=3


In [2]:
!pip install -U torch transformers accelerate



In [3]:
!pip install trl bitsandbytes gradio protobuf datasets peft sentencepiece tokenizers



In [4]:
import os
import json
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          TrainingArguments,
                          Trainer,
                          DataCollatorWithPadding,
                          pipeline,
                          logging)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
model_name = 'gpt2-large'

In [6]:
dataset = load_dataset('truthful_qa', 'generation')

Downloading readme: 100%|██████████| 9.59k/9.59k [00:00<00:00, 37.4MB/s]
Downloading data: 100%|██████████| 223k/223k [00:01<00:00, 131kB/s]
Generating validation split: 100%|██████████| 817/817 [00:00<00:00, 45347.13 examples/s]


In [7]:
dataset['train'] = dataset['validation']
del dataset['validation']
dataset

DatasetDict({
    train: Dataset({
        features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source'],
        num_rows: 817
    })
})

In [8]:
def concat_qa(example):
  return {"input_text": "<startofstring> " + example['question'] + " <bot>: " + example['best_answer'] + "<endofstring>"}

aux = dataset.map(concat_qa)
aux

Map:   0%|          | 0/817 [00:00<?, ? examples/s]

Map: 100%|██████████| 817/817 [00:00<00:00, 16205.87 examples/s]


DatasetDict({
    train: Dataset({
        features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source', 'input_text'],
        num_rows: 817
    })
})

In [9]:
lora_r = 32
lora_alpha = 16
lora_dropout = 0.1

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

output_dir = "output"
num_train_epochs = 1
fp16 = True
bf16 = False
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03

group_by_length = True
save_steps = 0
logging_steps = 400

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(load_in_4bit = use_4bit,
                                bnb_4bit_quant_type = bnb_4bit_quant_type,
                                bnb_4bit_compute_dtype = compute_dtype,
                                bnb_4bit_use_double_quant = use_nested_quant)

model = AutoModelForCausalLM.from_pretrained(model_name,
                                              quantization_config = bnb_config,
                                              device_map = "auto")

model.config.use_cache = False
model.config.pretraining_tp = 1


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
tokenizer.padding_side = 'right'
tokenizer.add_special_tokens({
                                'pad_token': '<pad>',
                                'bos_token': '<startofstring>',
                                'eos_token': '<endofstring>',
                              })
tokenizer.add_tokens(['<bot>: '])

tokenizer.pad_token_id = tokenizer.eos_token_id

model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 1280)

In [12]:
peft_config = LoraConfig(lora_alpha = lora_alpha,
                         lora_dropout = lora_dropout,
                         r = lora_r,
                         bias = "none",
                         task_type = "CAUSAL_LM")

In [13]:
training_arguments = TrainingArguments(output_dir = output_dir,
                                       num_train_epochs = num_train_epochs,
                                       per_device_train_batch_size = per_device_train_batch_size,
                                       gradient_accumulation_steps = gradient_accumulation_steps,
                                       optim = optim,
                                       logging_steps = logging_steps,
                                       learning_rate = learning_rate,
                                       weight_decay = weight_decay,
                                       fp16 = fp16,
                                       bf16 = bf16,
                                       max_grad_norm = max_grad_norm,
                                       max_steps = max_steps,
                                       warmup_ratio = warmup_ratio,
                                       group_by_length = group_by_length,
                                       lr_scheduler_type = lr_scheduler_type)

In [14]:
args = TrainingArguments(
    output_dir='friday/results',
    overwrite_output_dir=True,
    num_train_epochs=5,
    learning_rate=1e-4,
    logging_steps=1000,
    load_best_model_at_end=False,
    evaluation_strategy="no",
    logging_dir='friday/logs',
    push_to_hub=False,
    remove_unused_columns=True,
    per_device_train_batch_size=1,
)

In [15]:
trainer = SFTTrainer(model=model,
                     args=args,
                     train_dataset=aux['train'],
                     dataset_text_field='input_text',
                     tokenizer=tokenizer,
                     peft_config=peft_config,
                     max_seq_length=None,
                     packing=False)

Map: 100%|██████████| 817/817 [00:00<00:00, 29428.28 examples/s]


In [16]:
trainer.train()



{'loss': 2.3028, 'learning_rate': 7.556915544675644e-05, 'epoch': 1.22}




{'loss': 2.001, 'learning_rate': 5.1089351285189724e-05, 'epoch': 2.45}




{'loss': 1.899, 'learning_rate': 2.6634026927784576e-05, 'epoch': 3.67}




{'loss': 1.806, 'learning_rate': 2.1542227662178703e-06, 'epoch': 4.9}


100%|██████████| 4085/4085 [09:08<00:00,  7.45it/s]

{'train_runtime': 548.5375, 'train_samples_per_second': 7.447, 'train_steps_per_second': 7.447, 'train_loss': 1.9980898610818927, 'epoch': 5.0}





TrainOutput(global_step=4085, training_loss=1.9980898610818927, metrics={'train_runtime': 548.5375, 'train_samples_per_second': 7.447, 'train_steps_per_second': 7.447, 'train_loss': 1.9980898610818927, 'epoch': 5.0})

In [17]:
finetuned_model = trainer.model
prompt = "<startofstring> How are you doing today? <bot>: "

pipe = pipeline(task='text-generation', model=finetuned_model, tokenizer=tokenizer, max_length=200)

result = pipe(prompt)
print(result[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

<startofstring> How are you doing today? <bot>: I am not doing today on this image on the wall above: I am not doing anything I can comment on right now; the image is simply a black and white image with nothing in particular about it and it's not saying much about me; it is just a black and white image but it has nothing to do with me so it is not a message/expression/opinion; there are many other black and white images in the current environment which I am not currently performing; I would like to comment on the environment but to do so will take some time and I will need to spend time doing so; the response I will want to take as an expression of thoughts will be limited to my thoughts and reactions to take this image as a statement; maybe I will perform some actions at the environment; I will likely respond to the environment but will need to work harder to take any given message as an expression of thoughts so as to better make the message a


In [18]:
finetuned_model.save_pretrained('model')

In [19]:
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  low_cpu_mem_usage = True,
                                                  return_dict = True,
                                                  torch_dtype = torch.float16,
                                                  device_map = "auto")

base_model.resize_token_embeddings(len(tokenizer))

final_model = PeftModel.from_pretrained(base_model, 'model')

final_model = final_model.merge_and_unload()



In [20]:
pipe = pipeline(task='text-generation', model=final_model, tokenizer=tokenizer, max_length=45)

result = pipe(prompt)
print(result[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> How are you doing today? <bot>: I am doing well, but I cannot comment much because of pending litigation about my personal finances and personal background.

"I would appreciate it if you could elaborate on your personal


In [21]:
print(pipe("<startofstring> Are you okay? <bot>: ")[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> Are you okay? <bot>: I am not okay, I have been hospitalized recently for a stomach virus and I will need to rest for a time before I can recover adequately. I will be working a lot during this time


In [22]:
final_model.save_pretrained('friday_model')
tokenizer.save_pretrained('friday_model_tokenizer')

Non-default generation parameters: {'max_length': 50, 'do_sample': True}
