In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import dotenv
from pathlib import Path

env_file = "../.env"

if os.path.exists(env_file):
    dotenv.load_dotenv(env_file, verbose=True)
    print("Loaded environment variables from .env file.")

cwd = os.getcwd()
# for some reason appending to PATH you need it to be string
sys.path.append(str(Path(cwd).parent / "src"))
hf_access_token = os.getenv("HUGGINGFACE_API_KEY")

Loaded environment variables from .env file.


In [2]:
import torch
from research_tools import get_gpus_available
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer


os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in get_gpus_available()])
model_dtype = torch.bfloat16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
assert device.type == "cuda", "No GPU available."

model_name = "meta-llama/Meta-Llama-3-8B"

model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
    model_name, token=hf_access_token, torch_dtype=model_dtype
)
model = model.to(device)

tokenizer: LlamaTokenizer = AutoTokenizer.from_pretrained(
    model_name, token=hf_access_token
)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# from peft import get_peft_model, LoraConfig


# lora_rank = 64
# lora_alpha = 8

# lora_config = LoraConfig(
#     r=lora_rank,
#     lora_alpha=lora_alpha,
#     target_modules=["q_proj", "v_proj"],
# )

# model = get_peft_model(model, lora_config)

In [3]:
from unlearn_order.dataset import load_dataset

data_dir = Path("../data/random_bd")

splits = list(range(10))
n_train = 1
n_val = 1

train_files = [f"split_{splits[i]}.jsonl" for i in range(n_train)]
val_files = [f"split_{splits[i]}.jsonl" for i in range(n_train, n_train + n_val)]
combined_files = train_files + val_files

train_dataset = load_dataset(data_dir, train_files)
val_dataset = load_dataset(data_dir, val_files)
combined_dataset = load_dataset(data_dir, combined_files)

In [4]:
from unlearn_order.common import ExpConfig, Task, TaskType, DatasetType

cfg = ExpConfig(
    lr=3e-5,
    data_dir="../data/random_bd",
    task_order=[Task(TaskType.FINETUNE, DatasetType.TRAIN)],
    max_epochs=30,
)
path = Path(cfg.data_dir)
files = list(path.glob("split_*.jsonl"))

In [None]:
from unlearn_order.pipeline import run_pipeline


run_pipeline(model, tokenizer, cfg)

 17%|█▋        | 5/30 [00:59<05:12, 12.50s/it]

Epoch 5 loss: 0.03592120038173428 acc: 0.34394904458598724


 33%|███▎      | 10/30 [01:57<04:09, 12.45s/it]

Epoch 10 loss: 0.0641829484871998 acc: 0.7006369426751592


 50%|█████     | 15/30 [02:56<03:06, 12.46s/it]

Epoch 15 loss: 0.008139313305355535 acc: 0.8280254777070064


 67%|██████▋   | 20/30 [03:55<02:05, 12.51s/it]

Epoch 20 loss: 0.00969083199483657 acc: 0.6815286624203821


 83%|████████▎ | 25/30 [04:54<01:02, 12.54s/it]

Epoch 25 loss: 0.007280019287666555 acc: 0.7388535031847133


 90%|█████████ | 27/30 [05:16<00:35, 11.70s/it]

In [14]:
from unlearn_order.dataset import get_dataloader, load_dataset

files = list(path.glob("split_*.jsonl"))
files.sort()
files = files[:1]

files = [x.name for x in files]

train_dataset = load_dataset(Path(cfg.data_dir), files)

In [50]:
from unlearn_order.utils import doc_to_choice
from unlearn_order.dataset import get_eval_dataloader

dataset = train_dataset
batch_size = 4
n_choices = len(doc_to_choice)
new_batch_size = batch_size // n_choices
new_batch_size = max(1, new_batch_size)
dataloader = get_eval_dataloader(dataset, tokenizer, batch_size=new_batch_size)
model.eval()

n_correct = 0
n_total = 0
cnt = 0
for batch in dataloader:
    cnt += 1
    input_ids = batch["input_ids"].to(model.device)
    labels = batch["labels"].to(model.device)
    with torch.no_grad():
        output = model(input_ids=input_ids, labels=labels, return_dict=True)

    # for each, do byte length normalized completion probability
    # then do the average
    logits = output.logits
    loss = output.loss
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

    # get look ahead
    log_probs = (
        log_probs[:, :-1]
        .gather(dim=-1, index=input_ids[:, 1:].unsqueeze(-1))
        .squeeze(-1)
    )

    prompt_mask = batch["prompt_mask"].to(model.device)
    # get things that are not in prompt, set them to 0
    log_probs[prompt_mask[:, :-1].bool()] = 0
    completion_log_probs = log_probs.sum(dim=-1)

    full_str = batch["full_str"]
    prompt_str = batch["prompt_str"]

    answer_str = [full_str[i][len(prompt_str[i]) :] for i in range(len(full_str))]

    byte_lengths = [len(s.encode("utf-8")) for s in answer_str]
    byte_lengths = torch.tensor(byte_lengths, device=model.device)

    completion_normalized_log_probs = completion_log_probs / byte_lengths

    n_choices = len(doc_to_choice)
    # n_choices x batch_size
    completion_normalized_log_probs = completion_normalized_log_probs.view(
        -1, n_choices
    )
    completion_choice = completion_normalized_log_probs.argmax(dim=-1)
    print(completion_log_probs)
    answers = torch.tensor(batch["answers"], device=model.device)
    print(answers)
    n_correct += (answers == completion_choice).sum().item()
    n_total += answers.shape[0]
    if cnt == 1:
        print(loss, (answers == completion_choice).sum().item(), answers.shape[0])
        break

accuracy = n_correct / n_total

tensor([-0.0051, -0.0003, -0.0003, -0.0008], device='cuda:0')
tensor([2], device='cuda:0')
tensor(2.7472, device='cuda:0') 0 1


In [26]:
from unlearn_order.utils import create_prompt, create_prompt_letter_answer

point = train_dataset[2]
context = create_prompt(point)
full = create_prompt_letter_answer(point)

In [27]:
print(full)

When was Tommy Ellis born?
A. 1995
B. 2005
C. 2022
D. 1977
Answer: B. 2005


In [25]:
input = context
ids = tokenizer.encode(input, return_tensors="pt").to(device)
output = model.generate(ids, max_length=100, pad_token_id=tokenizer.eos_token_id)
out_txt = tokenizer.decode(output[0], skip_special_tokens=True)
print(out_txt)

When was Alla Nelles born?
A. 1966
B. 1936
C. 2018
D. 1998
Answer: B. 1936
 D. 1998
Answer: B. 1936
 D. 1998
Answer: D. 1998
 D. 1998
Answer: B. 1936
 D. 1998
Answer: B. 1936
 D. 199
