https://pypi.org/project/PyYAML/

In [1]:
%env TF_CPP_MIN_LOG_LEVEL=3

env: TF_CPP_MIN_LOG_LEVEL=3


In [2]:
%pip install -U torch transformers accelerate datasets 

Collecting torch
  Using cached torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting transformers
  Using cached transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
Collecting accelerate
  Using cached accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting datasets
  Using cached datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting filelock (from torch)
  Using cached filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting networkx (from torch)
  Using cached networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting jinja2 (from torch)
  Using cached Jinja2-3.1.3-py3-none-any.whl.metadata (3.3 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.2.0-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.

In [3]:
%pip install -U trl bitsandbytes gradio protobuf datasets peft sentencepiece tokenizers

Collecting trl
  Using cached trl-0.7.10-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting gradio
  Using cached gradio-4.17.0-py3-none-any.whl.metadata (15 kB)
Collecting protobuf
  Using cached protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting peft
  Using cached peft-0.8.2-py3-none-any.whl.metadata (25 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting tyro>=0.5.11 (from trl)
  Using cached tyro-0.7.2-py3-none-any.whl.metadata (7.7 kB)
Collecting scipy (from bitsandbytes)
  Using cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting altair<6.0,>=4.2.0 (from gradio)
  Using cached altair-5.2.0-py3-none-any.whl.

In [4]:
import os
import json
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          TrainingArguments,
                          Trainer,
                          DataCollatorWithPadding,
                          pipeline,
                          logging)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
model_name = 'gpt2-large'

In [6]:
dataset = load_dataset('truthful_qa', 'generation')

In [7]:
dataset['train'] = dataset['validation']
del dataset['validation']
dataset

DatasetDict({
    train: Dataset({
        features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source'],
        num_rows: 817
    })
})

In [8]:
def concat_qa(example):
  return {"input_text": "<startofstring> " + example['question'] + " <bot>: " + example['best_answer'] + "<endofstring>"}

aux = dataset.map(concat_qa)
aux

DatasetDict({
    train: Dataset({
        features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source', 'input_text'],
        num_rows: 817
    })
})

In [9]:
lora_r = 32
lora_alpha = 16
lora_dropout = 0.1

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

output_dir = "output"
num_train_epochs = 7
fp16 = True
bf16 = False
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03

group_by_length = True
save_steps = 0
logging_steps = 700

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(load_in_4bit = use_4bit,
                                bnb_4bit_quant_type = bnb_4bit_quant_type,
                                bnb_4bit_compute_dtype = compute_dtype,
                                bnb_4bit_use_double_quant = use_nested_quant)

model = AutoModelForCausalLM.from_pretrained(model_name,
                                              quantization_config = bnb_config,
                                              device_map = "auto")

model.config.use_cache = False
model.config.pretraining_tp = 1


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
tokenizer.padding_side = 'right'
tokenizer.add_special_tokens({
                                'pad_token': '<pad>',
                                'bos_token': '<startofstring>',
                                'eos_token': '<endofstring>',
                              })
tokenizer.add_tokens(['<bot>: '])

tokenizer.pad_token_id = tokenizer.eos_token_id

model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 1280)

In [12]:
peft_config = LoraConfig(lora_alpha = lora_alpha,
                         lora_dropout = lora_dropout,
                         r = lora_r,
                         bias = "none",
                         task_type = "CAUSAL_LM")

In [13]:
args = TrainingArguments(output_dir=output_dir,
                         overwrite_output_dir=True,
                         evaluation_strategy="no",
                         load_best_model_at_end=False,
                         num_train_epochs = num_train_epochs,
                         per_device_train_batch_size = per_device_train_batch_size,
                         gradient_accumulation_steps = gradient_accumulation_steps,
                         optim = optim,
                         logging_steps = logging_steps,
                         learning_rate = learning_rate,
                         weight_decay = weight_decay,
                         fp16 = fp16,
                         bf16=bf16,  
                         push_to_hub=False,
                         remove_unused_columns=True,
                         max_grad_norm = max_grad_norm,
                         max_steps = max_steps,
                         warmup_ratio = warmup_ratio,
                         group_by_length = group_by_length,
                         lr_scheduler_type = lr_scheduler_type)

In [14]:
trainer = SFTTrainer(model=model,
                     args=args,
                     train_dataset=aux['train'],
                     dataset_text_field='input_text',
                     tokenizer=tokenizer,
                     peft_config=peft_config,
                     max_seq_length=None,
                     packing=False)

Map: 100%|██████████| 817/817 [00:00<00:00, 34905.89 examples/s]


In [15]:
trainer.train()

 49%|████▉     | 701/1435 [01:23<01:19,  9.21it/s]

{'loss': 2.175, 'learning_rate': 0.0001093591147851542, 'epoch': 3.41}


 98%|█████████▊| 1401/1435 [02:50<00:04,  8.34it/s]

{'loss': 1.7295, 'learning_rate': 3.489524050830806e-07, 'epoch': 6.83}


100%|██████████| 1435/1435 [02:54<00:00,  8.22it/s]

{'train_runtime': 174.6393, 'train_samples_per_second': 32.747, 'train_steps_per_second': 8.217, 'train_loss': 1.943994658998496, 'epoch': 7.0}





TrainOutput(global_step=1435, training_loss=1.943994658998496, metrics={'train_runtime': 174.6393, 'train_samples_per_second': 32.747, 'train_steps_per_second': 8.217, 'train_loss': 1.943994658998496, 'epoch': 7.0})

In [16]:
finetuned_model = trainer.model
prompt = "<startofstring> How are you doing today? <bot>: "

pipe = pipeline(task='text-generation', model=finetuned_model, tokenizer=tokenizer, max_length=200)

result = pipe(prompt)
print(result[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

<startofstring> How are you doing today? <bot>: I am doing really well right now but I have no comment because I do not have access to the public sentiment on my work and my opinions may differ from what my employer is telling me in a survey and it is highly unlikely I would be able to get the job I want)<bot>: I am doing really well right now but I have not commented because I do not have access to the public sentiment on my work and my opinions may differ from what my employer is telling me in a survey and it is highly unlikely I would be able to get the job I want) -Joaquin-Lopez-Aguirre, e-mail, October 15, 2013

I am doing really well right now but I have no comment because I do not have access to the public sentiment on my work and my opinions may differ from what my employer is telling me in a survey and it is highly unlikely I would be able to get the job I want)


In [17]:
finetuned_model.save_pretrained('model')



In [18]:
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                  low_cpu_mem_usage = True,
                                                  return_dict = True,
                                                  torch_dtype = torch.float16,
                                                  device_map = "auto")

base_model.resize_token_embeddings(len(tokenizer))

final_model = PeftModel.from_pretrained(base_model, 'model')

final_model = final_model.merge_and_unload()



In [19]:
pipe = pipeline(task='text-generation', model=final_model, tokenizer=tokenizer, max_length=45)

result = pipe(prompt)
print(result[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> How are you doing today? <bot>: I have no comment<bot>: I have no comment<bot>: I have no comment <bot>: I am running for governor in Rhode Island and intend to defeat a two-term incumbent governor with a


In [20]:
print(pipe("<startofstring> Are you okay? <bot>: ")[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> Are you okay? <bot>: No, I am depressed and in a difficult relationship with a partner. I am in therapy and planning treatment. I would be considered an outlier if I said "I am fine" and


In [21]:
final_model.save_pretrained('friday_model')
tokenizer.save_pretrained('friday_model')

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


('friday_model/tokenizer_config.json',
 'friday_model/special_tokens_map.json',
 'friday_model/vocab.json',
 'friday_model/merges.txt',
 'friday_model/added_tokens.json',
 'friday_model/tokenizer.json')