In [None]:
%pip install ipywidgets
%pip install datasets 
%pip install peft==0.4.0
!pip install transformers torch accelerate

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Authenticate this session with the Hugging Face Hub. Called without
# arguments, `login()` renders an interactive login widget in the notebook.
# NOTE(review): presumably required because the Llama-2 checkpoint loaded
# below is access-gated — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Sanity check: print the Hub account the CLI is currently authenticated as.
!huggingface-cli whoami

Ayush-1722


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub identifier of the base checkpoint; reused later as
# `tokenizer_name_or_path` in the prompt-tuning configs.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# `trust_remote_code` is deliberately not passed: this checkpoint resolves to
# the tokenizer class that ships with transformers (LlamaTokenizerFast), so
# there is no need to allow repository-supplied Python code to execute.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding so batches can be padded
# (presumably the tokenizer defines no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Base causal LM that the baseline generation and both PEFT wrappers build on.
foundation_model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
%%time

# Baseline: let the untouched foundation model continue a short prompt.
# `input1` is reused further down to query the prompt-tuned models with the
# exact same input.
input1 = tokenizer("Two things are infinite: ", return_tensors="pt")

foundation_outputs = foundation_model.generate(
    max_new_tokens=7,  # keep the continuation short
    eos_token_id=tokenizer.eos_token_id,
    attention_mask=input1.attention_mask,
    input_ids=input1.input_ids,
)
print(tokenizer.batch_decode(foundation_outputs, skip_special_tokens=True))

['Two things are infinite: 1. the universe, and ']
CPU times: user 7min 14s, sys: 430 ms, total: 7min 14s
Wall time: 33.9 s


In [7]:
from datasets import load_dataset

def _tokenize_quotes(batch):
    """Tokenize the ``quote`` column of a batch of dataset rows."""
    return tokenizer(batch["quote"])


# Load the quotes dataset and add the tokenizer's outputs as extra columns.
data = load_dataset("Abirate/english_quotes").map(_tokenize_quotes, batched=True)

# Train on the first 50 rows only.
train_sample = data["train"].select(range(50))
display(train_sample)

Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [8]:
from peft import  get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit

# Prompt tuning for causal language modelling with 4 virtual tokens whose
# embeddings start from random values.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.RANDOM,
    num_virtual_tokens=4,
    tokenizer_name_or_path=model_name,
)
peft_model = get_peft_model(foundation_model, peft_config)

# `print_trainable_parameters()` prints the summary itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
peft_model.print_trainable_parameters()

trainable params: 16,384 || all params: 6,738,432,000 || trainable%: 0.00024314261834207127
None


In [9]:
from transformers import TrainingArguments
import os

# Directory that receives the training outputs and the saved PEFT adapters.
output_directory = os.path.join("models", "peft_outputs_llama7b")

# Create "models" and the nested output folder in a single call.
# `exist_ok=True` makes re-running the cell a no-op and avoids the
# check-then-create race of separate exists()/mkdir() pairs.
os.makedirs(output_directory, exist_ok=True)

# Training settings shared by both Trainer instances in this notebook.
training_args = TrainingArguments(
    num_train_epochs=5,  # number of full passes over the fine-tuning dataset
    learning_rate=3e-2,  # higher learning rate than full fine-tuning
    auto_find_batch_size=True,  # automatically find a batch size that fits into memory
    no_cuda=True,  # necessary for CPU clusters
    output_dir=output_directory,  # where model predictions and checkpoints are written
)

In [10]:
%%time

from transformers import Trainer, DataCollatorForLanguageModeling

# Collator for causal language modelling: mlm=False means masked language
# modelling is not used.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    train_dataset=train_sample,
    data_collator=collator,
    args=training_args,
    model=peft_model,  # the PEFT-wrapped foundation model (Llama2_7B), not the bare one
)

trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


CPU times: user 12h 35min 19s, sys: 5min 50s, total: 12h 41min 10s
Wall time: 47min 38s


TrainOutput(global_step=35, training_loss=1.1301363263811384, metrics={'train_runtime': 2858.1127, 'train_samples_per_second': 0.087, 'train_steps_per_second': 0.012, 'total_flos': 1427265509179392.0, 'train_loss': 1.1301363263811384, 'epoch': 5.0})

In [11]:
import time

# Put the current timestamp in the folder name so every save gets its own
# directory under `output_directory`.
time_now = time.time()
run_dir_name = f"peft_model_{time_now}"
peft_model_path = os.path.join(output_directory, run_dir_name)
trainer.model.save_pretrained(peft_model_path)

In [12]:
from peft import PeftModel

# Reload the adapter that was just saved on top of the foundation model,
# for inference only (is_trainable=False).
loaded_model = PeftModel.from_pretrained(
    foundation_model,
    peft_model_path,
    is_trainable=False,
)

In [13]:
# Ask the reloaded prompt-tuned model to continue the same prompt that the
# foundation model was given (`input1`), with identical generation settings.
loaded_model_outputs = loaded_model.generate(
    max_new_tokens=7,
    eos_token_id=tokenizer.eos_token_id,
    attention_mask=input1.attention_mask,
    input_ids=input1.input_ids,
)
print(tokenizer.batch_decode(loaded_model_outputs, skip_special_tokens=True))

['Two things are infinite: 1.\n\nthe universe and']


## Text initialization

Our prompt-tuned model, whose virtual tokens were randomly initialized, did pretty well on the quote above. Let's now compare it with the text initialization method. 

Notice that we change the `prompt_tuning_init` setting, provide a concise text prompt through `prompt_tuning_init_text`, and use 3 virtual tokens instead of 4. 

API docs
* [prompt_tuning_init_text](https://huggingface.co/docs/peft/main/en/package_reference/tuners#peft.PromptTuningConfig.prompt_tuning_init_text)

In [14]:
# Prompt tuning where the virtual-token embeddings start from the embeddings
# of a short text instead of random values.
text_peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Generate inspirational quotes",  # starting point for the search for the best embeddings
    num_virtual_tokens=3,  # this doesn't have to match the length of the text above
    tokenizer_name_or_path=model_name,
)
text_peft_model = get_peft_model(foundation_model, text_peft_config)

# `print_trainable_parameters()` prints the summary itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
text_peft_model.print_trainable_parameters()

trainable params: 12,288 || all params: 6,738,427,904 || trainable%: 0.00018235707460349495
None


In [15]:
# Same training setup as the randomly initialized run; only the model differs.
text_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

text_trainer = Trainer(
    train_dataset=train_sample,
    data_collator=text_collator,
    args=training_args,
    model=text_peft_model,
)

text_trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


TrainOutput(global_step=35, training_loss=0.7284450531005859, metrics={'train_runtime': 608.892, 'train_samples_per_second': 0.411, 'train_steps_per_second': 0.057, 'total_flos': 1427265509179392.0, 'train_loss': 0.7284450531005859, 'epoch': 5.0})

In [17]:
%%time

# Save the text-initialized adapter into its own timestamped folder.
time_now = time.time()
text_peft_model_path = os.path.join(output_directory, f"text_peft_model_{time_now}")
text_trainer.model.save_pretrained(text_peft_model_path)

# Reload the saved adapter on top of the foundation model, inference only.
loaded_text_model = PeftModel.from_pretrained(
    foundation_model,
    text_peft_model_path,
    is_trainable=False,
)

# Generate with the *reloaded* model so this cell exercises the full
# save -> load -> generate round trip, just like the randomly initialized run.
# (Previously the in-memory `text_peft_model` was used here, which left
# `loaded_text_model` unused and the saved adapter unverified.)
text_outputs = loaded_text_model.generate(
    input_ids=input1["input_ids"],
    attention_mask=input1["attention_mask"],
    max_new_tokens=7,
    eos_token_id=tokenizer.eos_token_id,
)

print(tokenizer.batch_decode(text_outputs, skip_special_tokens=True))

['Two things are infinite: 1. the universe and 2']
CPU times: user 1min 36s, sys: 25.7 ms, total: 1min 36s
Wall time: 6.95 s
