# World Info (WI) Soft Prompt Tuning



In [None]:
# `transformers` does not export an `AutoTokenizerFast` class; `AutoTokenizer`
# is the supported entry point and picks the tokenizer class that matches the
# checkpoint (a fast tokenizer is used by default when one is available).
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")

In [None]:
#@title Colab-specific setup

import torch
colab = 'google.colab' in str(get_ipython())

if colab:
    !nvidia-smi
    gpu_type = torch.cuda.get_device_name(0)
    if gpu_type != 'Tesla T4':
        raise ValueError("Highly advised to use a T4.")

# Setup for Colab only
if colab:
    !pip install transformers
    !pip install git+https://github.com/AlpinDale/prompt-tuner.git#egg=master --log PIP_LOG
    !pip install gdown
    !pip install datasets
    !pip install tqdm

# If on Colab, mount your Google Drive first!
if colab:
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Load the tokenizer that belongs to the Llama-2 checkpoint. The previous
# GPT2TokenizerFast class does not match this checkpoint's tokenizer type;
# AutoTokenizer resolves the correct class from the checkpoint's own config.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")

In [None]:
#-----------------------#
#  Training Parameters  #
#-----------------------#
# Every name assigned in this cell is a notebook-level setting read by later cells.

# Length of the soft prompt in tokens.
# Only used when `initial_prompt` is None (see the soft prompt initialization cell).
# Per the original note, the soft prompt is then initialized from the first
# n tokens of your dataset -- TODO confirm against prompt_tuner.
n_tokens = 20

# Set this to a string to start from that string's tokens instead of `n_tokens`.
# Set it to None to fall back to `n_tokens`.
# Be aware of the number of tokens: the soft prompt is as long as this string's encoding.
initial_prompt = "Emma Violence is a British cyborg assassin."

# Report how many tokens the initial prompt encodes to with the current tokenizer.
if initial_prompt is not None:
    print(f"Initial prompt length: {len(tokenizer.encode(initial_prompt))} tokens")

# Length of the training blocks in tokens.
# The sizes below were quoted for gpt-neo-2.7B-halved, not for the Llama-2 model
# configured here, so treat them as a rough guide only:
#  - 700 on a Colab T4 (16GB)
#  - 400 on a Colab K80 (12GB)
#  - 32 on a GTX1080 (8GB)
# If it seems a bit small, don't worry!
# Soft prompts can be moved forward in context for the best effect.
# NOTE(review): not read by any cell shown in this notebook.
block_size = 32

# Name of the soft prompt project (used in the project directory and checkpoint names).
sp_name = 'wi-test-1'

# Name of the model being tuned (used in the project directory and checkpoint names).
# Note that this value contains a '/'.
model_name = 'NousResearch/Llama-2-7b-hf'

# Model directory or Hugging Face name passed to `from_pretrained` when loading the model.
model_dir = "models/Llama-2-7b-hf"

# Path to the training dataset (a JSON file).
text_path = "datasets/wi_test.json"

# Project directory where checkpoints are looked up.
# Because `model_name` contains a '/', this expands to a nested directory
# (".../wi-test-1-NousResearch/Llama-2-7b-hf/").
project_dir = f"/content/drive/MyDrive/soft_wi/{sp_name}-{model_name}/"

# Checkpoint interval in steps.
# NOTE(review): not read by any cell shown in this notebook.
checkpoint_interval = 1

# Evaluation interval in steps.
# NOTE(review): not read by any cell shown in this notebook.
eval_interval = 1

# How many blocks to use for evaluation.
# NOTE(review): not read by any cell shown in this notebook.
eval_blocks = 20

# Adafactor hyperparameters.
# NOTE(review): a later cell assigns `optimizer_params` again before the
# optimizer is built, so the values here do not reach the optimizer.
optimizer_params = {
    # Fixed learning rate, recommend 1e-4 to 1e-3
    # (the value set here is below that recommended range)
    "lr": 1e-5,

    # 1st momentum, recommend 0
    "beta1": 0,

    # 2nd momentum decay schedule, recommend -0.3 (lower is slower)
    "decay_rate": -0.3,

    # Weight decay, recommend 1e-2 (WI is sensitive to overfitting)
    # (the value set here is below that recommendation)
    "weight_decay": 1e-4,

    # Update scaling, recommend False
    "scale_parameter": False,

    # Built-in LR scheduler, recommend False
    "relative_step": False
    }

# Gradient accumulation steps.
# NOTE(review): not read by any cell shown in this notebook.
base_acc_steps = 30

# Gradient accumulation schedule.
# If '0', use a fixed gradient accumulation.
# NOTE(review): not read by any cell shown in this notebook.
acc_doubling_rate = 0

# Stop training after this many evals without improvement.
# If '0', don't stop early.
# NOTE(review): not read by any cell shown in this notebook.
plateau_steps = 3

In [None]:
#@title Load model

from prompt_tuner.tuning import AutoPromptTuningLM

# Re-running this cell is a no-op once `model` exists in the notebook
# namespace, so the weights are not loaded a second time.
if 'model' not in globals():
    model = (
        AutoPromptTuningLM
        .from_pretrained(model_dir)  # load from the configured directory / hub name
        .half()                      # convert to half precision
        .to("cuda")                  # move onto the GPU
    )

In [None]:
#@title Initialize project
#@markdown This will load the latest checkpoint if the project directory already exists.

from prompt_tuner.soft_prompt import SoftPrompt
from transformers import Adafactor
import os
import re


def filename_for_checkpoint(step):
    """Return the checkpoint file name for training step `step`.

    `model_name` may contain '/' (e.g. "org/model"), which is a path separator
    and would make the result point into a sub-directory, so it is replaced
    with '_'. Names without '/' are unchanged.
    """
    safe_model_name = model_name.replace('/', '_')
    return f"{sp_name}-{safe_model_name}-step-{step}.json"


# A checkpoint is any file whose name ends in "-step-<number>.json".
# Anchoring on the suffix keeps unrelated files that merely contain "-step-"
# from crashing the integer conversion below.
checkpoint_name_pattern = re.compile(r'-step-(\d+)\.json$')

loaded_sp = None
project_files = None

# Look for existing project directory
try:
    os.makedirs(project_dir)
    print(f"Created project directory at {project_dir}")
except FileExistsError:
    print(f"Found project directory at {project_dir}")

# Look for existing checkpoints
project_files = os.listdir(project_dir)
checkpoint_files = [
    check_file for check_file in project_files
    if checkpoint_name_pattern.search(check_file)
]

if len(checkpoint_files) > 0:
    # Pick the file with the highest step number and load that exact file,
    # rather than rebuilding its name from the current settings.
    latest_checkpoint = max(
        checkpoint_files,
        key=lambda check_file: int(checkpoint_name_pattern.search(check_file).group(1)),
    )
    highest_step = int(checkpoint_name_pattern.search(latest_checkpoint).group(1))
    loaded_sp = SoftPrompt.from_file(os.path.join(project_dir, latest_checkpoint))
    print(f"Loading latest checkpoint: {highest_step}")
else:
    print("No checkpoints found")

In [None]:
#@title Initialize soft prompt in model
#@markdown If a checkpoint is present, use that.
if loaded_sp is not None:
    # Resume: install the checkpointed soft prompt and pick up its recorded
    # step counter and loss.
    model.set_soft_prompt(loaded_sp)
    checkpoint_meta = loaded_sp._metadata
    sp_step = checkpoint_meta['step']
    eval_loss = checkpoint_meta['loss']
else:
    # Fresh start: build the soft prompt from the initial string when one is
    # configured, otherwise let the model create one of `n_tokens` tokens.
    if initial_prompt is not None:
        initial_sp = SoftPrompt.from_string(initial_prompt, model, tokenizer)
        print(f"Initial prompt length: {len(initial_sp)}")
        model.set_soft_prompt(initial_sp)
    else:
        model.initialize_soft_prompt(n_tokens=n_tokens)

    # No training has happened yet; 100 is a placeholder starting loss.
    sp_step = 0
    eval_loss = 100

In [None]:
from prompt_tuner.trainers import WorldInfoTrainer
import json

# Read the dataset from the configured `text_path` rather than a hard-coded
# path, so changing the setting in the parameters cell takes effect here.
# The file is expected to hold a JSON list of objects with 'call' and
# 'response' string fields.
with open(text_path, encoding="utf-8") as file:
    blocks = json.load(file)

# Replace each text field with its token ids on the model's device.
for block in blocks:
    block['call'] = tokenizer(block['call'], return_tensors="pt").input_ids.to(model.device)
    # The response is later appended directly after the call, so it must not
    # get its own special tokens: tokenizers that prepend a beginning-of-sequence
    # token would otherwise place one in the middle of every training sequence.
    block['response'] = tokenizer(
        block['response'],
        return_tensors="pt",
        add_special_tokens=False,
    ).input_ids.to(model.device)

In [None]:
import random

# Build one (input_ids, labels) pair per dataset entry.
#   input_ids: call tokens followed by response tokens.
#   labels:    -100 for every call position, then the response tokens.
arranged_blocks = []

for block in blocks:
    call, response = block['call'], block['response']
    ignore_len = call.shape[-1]

    # One -100 per call token, created directly on the model's device.
    masked_call = torch.full((1, ignore_len), -100, device=model.device)

    input_ids = torch.cat((call, response), dim=1)
    labels = torch.cat((masked_call, response), dim=1)

    arranged_blocks.append((input_ids, labels))

random.shuffle(arranged_blocks)

In [None]:
# Adafactor hyperparameters.
# This assignment replaces any `optimizer_params` defined earlier in the
# notebook; these are the values the optimizer is built with.
optimizer_params = dict(
    lr=1e-5,                # Fixed learning rate, recommend 1e-4 to 1e-3
    beta1=0,                # 1st momentum, recommend 0
    decay_rate=-0.3,        # 2nd momentum decay schedule, recommend -0.3 (lower is slower)
    weight_decay=1e-1,      # Weight decay, recommend 1e-2 (WI is sensitive to overfitting)
    scale_parameter=False,  # Update scaling, recommend False
    relative_step=False,    # Built-in LR scheduler, recommend False
)

# Only the soft prompt parameters are handed to the optimizer.
optimizer_params['params'] = [model.get_soft_params()]
optimizer = Adafactor(**optimizer_params)

# Record the resume step under the 'step' key of the optimizer state.
# NOTE(review): confirm that Adafactor actually reads this top-level key.
optimizer.state['step'] = sp_step

In [None]:
# Full-batch training: every epoch accumulates gradients over the whole
# dataset, applies a single optimizer update, then reports the mean loss.
model.train()

# Number of passes over the dataset (one optimizer update per pass).
num_epochs = 50

# Fail early with a clear message instead of a ZeroDivisionError when
# averaging the evaluation loss over an empty dataset.
if len(arranged_blocks) == 0:
    raise ValueError("arranged_blocks is empty; nothing to train on.")

for i in range(num_epochs):
    random.shuffle(arranged_blocks)

    # Always accumulate gradient for the entire dataset
    for input_ids, labels in arranged_blocks:
        model(input_ids=input_ids, labels=labels).loss.backward()

    optimizer.step()
    optimizer.zero_grad()

    # Evaluate with the same masked labels used for training, so the reported
    # loss covers only the response tokens (call positions are labelled -100)
    # and measures the objective that is actually being optimized.
    eval_loss = 0
    with torch.no_grad():
        for input_ids, labels in arranged_blocks:
            eval_loss += model(input_ids=input_ids, labels=labels).loss.item()
    eval_loss /= len(arranged_blocks)
    print(f"Epoch {i} loss: {eval_loss}")

In [None]:
# Try generating with your model
model.eval()

# Text the generation starts from.
test = " Emma"

# Put the prompt tokens on the same device as the model, consistent with how
# the training tensors are placed elsewhere in this notebook.
call = tokenizer(test, return_tensors="pt").input_ids.to(model.device)

# min_length and max_length are equal, so the output is always the prompt
# plus exactly 200 further tokens.
# NOTE(review): `tfs` (tail-free sampling) is not a standard Hugging Face
# `generate` argument; confirm the installed model/library accepts it.
basic_output = model.generate(
    input_ids=call,
    do_sample=True,
    min_length=call.shape[-1] + 200,
    max_length=call.shape[-1] + 200,
    temperature=1.2,
    tfs = 0.9,
    repetition_penalty = 2.0,
    pad_token_id=tokenizer.eos_token_id
)
print(tokenizer.decode(basic_output[0]))