# Prompt Tuning

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, AutoModelForSequenceClassification
from peft import PromptTuningConfig, get_peft_model, TaskType
from datasets import load_dataset
import torch
# Load dataset
dataset = load_dataset("imdb")  # Example dataset; replace with your own classification dataset

train_dataset = dataset["train"].select(range(2000))
num_train = len(dataset["train"])
test_dataset = dataset["train"].select(range(num_train - 1000, num_train))


# Load LLaMA model and tokenizer
model_name = "/home/snt/projects_lujun/base_models/Llama-3.2-1B-Instruct"  # Replace with "meta-llama/Llama-2-1b-hf" or other LLaMA models
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",  # Automatically allocate to GPU/CPU
    load_in_8bit=False,  # Optional: Use 8-bit precision to save memory
)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Configure Prompt Tuning
prompt_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,  # Task type: causal language modeling (for LLaMA)
    num_virtual_tokens=20,        # Number of virtual tokens; adjust based on task complexity
    tokenizer_name_or_path=model_name,
)

# Apply PEFT configuration to the model
peft_model = get_peft_model(model, prompt_config)
peft_model.print_trainable_parameters()  # Print the number of trainable parameters

# Data preprocessing function
def preprocess_function(examples):
    inputs = examples["text"]
    labels = examples["label"] if "label" in examples else None

    # Tokenize the inputs
    tokenized_inputs = tokenizer(
        inputs,
        truncation=True,
        padding=True,
        max_length=1024,
        return_tensors="pt"
    )

    # Add labels to tokenized inputs
    if labels is not None:
        tokenized_inputs["labels"] = torch.tensor(labels)  # Ensure labels are tensors

    return tokenized_inputs

# Preprocess the dataset
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    logging_strategy="steps",
    evaluation_strategy="steps",
    logging_steps=10,
    eval_steps=20,
    learning_rate=5e-7,  # Adjust learning rate based on task; higher rates often work for prompt tuning
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=1,
)

# Define Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Save the trained model and configuration
peft_model.save_pretrained("./peft_prompt_tuning_llama")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /home/snt/projects_lujun/base_models/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 45,056 || all params: 1,235,863,552 || trainable%: 0.0036


Map: 100%|██████████| 1000/1000 [00:00<00:00, 2177.76 examples/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
LlamaForSequenceClassification will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`


Step,Training Loss,Validation Loss
20,0.1798,2.367421
40,0.1535,2.424923
60,0.1685,2.478423
80,0.1077,2.526912
100,0.1147,2.57146
120,0.1155,2.610654
140,0.0846,2.644905
160,0.1264,2.673214
180,0.2044,2.696771
200,0.1149,2.715554
