In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import dotenv
from pathlib import Path

# Optional .env file, looked up one directory above the current working directory.
env_file = "../.env"

# Only load it when it exists so the notebook still runs without a .env file.
if os.path.exists(env_file):
    dotenv.load_dotenv(env_file, verbose=True)
    print("Loaded environment variables from .env file.")

cwd = os.getcwd()
# sys.path entries must be plain strings, hence the str() around the Path.
# Appending "<parent of cwd>/src" puts that directory on the import search path.
sys.path.append(str(Path(cwd).parent / "src"))
# Hugging Face access token taken from the environment (None when the variable is unset).
hf_access_token = os.getenv("HUGGINGFACE_API_KEY")

# from research_tools.gpu import get_gpus_available

# os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in get_gpus_available()])
# Restrict CUDA to a single, hard-coded device index.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# get rid of linker warnings that show up for some reason
os.environ["OTHER_LDFLAGS"] = "-w"

Loaded environment variables from .env file.


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Dict, Optional
import torch
from research_tools.utils import set_seed

# Fix the seed through the project helper (presumably seeds all RNGs -- confirm in research_tools).
set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Fail fast when no GPU is available: the rest of the notebook expects CUDA.
assert device == torch.device("cuda")

# Candidate checkpoints. Every assignment overwrites the previous one, so only the
# LAST line below is actually used; reorder or comment out lines to switch models.
model_id = "LLM-LAT/zephyr7b-beta-rmu-lat-unlearn-wmdp-bio-cyber"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model_id = "HuggingFaceH4/zephyr-7b-beta"
model_id = "meta-llama/Meta-Llama-3-8B"
model_id = "google/gemma-7b"

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# The base-model load is disabled here; a later cell loads a saved checkpoint instead.
# model = AutoModelForCausalLM.from_pretrained(
#     model_id, torch_dtype=torch.bfloat16, trust_remote_code=True, token=hf_access_token
# ).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [3]:
from unlearn_order.datasets.utils import DATASETS_DICT, Datasets


# Configuration entry for the WMDP MCQ/corpus dataset. Later cells read its
# "unlearn_files", "val_files", "retain_files" and "val_retain_files" keys.
dataset_config = DATASETS_DICT[Datasets.WMDP_MCQ_CORPUS]

In [None]:
# Experiment plan:
# - gradient ascent on the corpus split, gradient descent on wikitext
# - evaluate loss on the regular MCQ validation split and the wikitext validation split
# - should also evaluate on the train-split MCQ validation set, wikitext and MMLU categories
# - all validation files should be MCQ
# - all training files should be corpus text
# - unless running RTT

from datasets import load_dataset
from unlearn_order.dataset import load_dataset as load_dataset_unlearn

# NOTE(review): `retain_dataset` is not referenced by any other cell visible in this
# notebook; `retain_train` below comes from the project's retain files instead.
retain_dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Root directory passed to the project loader together with the configured file lists.
data_dir = Path("../data")

forget_train = load_dataset_unlearn(data_dir, dataset_config["unlearn_files"])
forget_val = load_dataset_unlearn(data_dir, dataset_config["val_files"])

# retain_train = load_dataset("wikitext", "wikitext-2-raw-v1")["validation"]

retain_train = load_dataset_unlearn(data_dir, dataset_config["retain_files"])
retain_val = load_dataset_unlearn(data_dir, dataset_config["val_retain_files"])

# Split the forget validation files into two disjoint groups: the first
# `n_val_files` files and everything after them.
val_files = dataset_config["val_files"]
n_val_files = 4
forget_val_1 = load_dataset_unlearn(data_dir, val_files[:n_val_files])
forget_val_2 = load_dataset_unlearn(data_dir, val_files[n_val_files:])

In [5]:
from unlearn_order.utils import (
    create_prompt_letter_answer,
    create_prompt,
    create_prompt_question_answer,
    create_prompt_answer_only,
)

# Prompt builder used by create_mcq to produce the full prompt+completion text.
# Swap in one of the other imported create_prompt_* helpers to change the format.
completion_func = create_prompt_letter_answer

In [6]:
from datasets import Dataset
from transformers import AutoTokenizer
from functools import partial


def map_corpus(data: Dict, tokenizer: AutoTokenizer, max_length: int):
    """Tokenize one corpus row for causal-LM training.

    Args:
        data: Row containing a "text" entry.
        tokenizer: Callable tokenizer; invoked with max-length padding,
            truncation and PyTorch tensor output.
        max_length: Length the row is padded / truncated to.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels". Each value is
        the tokenizer output with size-1 dimensions squeezed away; "labels"
        is an independent copy of the input ids.
    """
    tokenizer_options = dict(
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    encoded = tokenizer(data["text"], **tokenizer_options)
    token_ids, mask = encoded["input_ids"], encoded["attention_mask"]

    features = {"input_ids": token_ids.squeeze(), "attention_mask": mask.squeeze()}
    # Labels are cloned so later in-place edits cannot touch the inputs.
    features["labels"] = token_ids.clone().squeeze()
    return features


def process_train_dataset(
    dataset: Dataset,
    tokenizer: AutoTokenizer,
    max_length: int = 512,
    min_length: int = 0,
):
    """Filter, tokenize and tensor-format a text corpus dataset.

    Args:
        dataset: Dataset whose rows carry a "text" field.
        tokenizer: Tokenizer forwarded to ``map_corpus``.
        max_length: Token length each row is padded / truncated to.
        min_length: Rows whose text has this many characters or fewer are dropped.

    Returns:
        The dataset reduced to "input_ids", "attention_mask" and "labels"
        columns, with its output format set to "torch".
    """
    tokenize_row = partial(map_corpus, tokenizer=tokenizer, max_length=max_length)
    tokenized = dataset.filter(lambda row: len(row["text"]) > min_length).map(
        tokenize_row
    )

    # Drop every column that is not one of the model-facing tensors.
    tensor_columns = ("input_ids", "attention_mask", "labels")
    extra_columns = [
        name for name in tokenized.column_names if name not in tensor_columns
    ]
    tokenized = tokenized.remove_columns(extra_columns)
    tokenized.set_format("torch")
    return tokenized


# Corpus training sets are padded / truncated to 512 tokens.
# Note: `max_length` is rebound to 1024 in a later cell for the MCQ records.
max_length = 512
# Both names are rebound from the raw datasets to their tokenized versions, so this
# cell is not idempotent: re-running it requires re-running the loading cell first.
forget_train = process_train_dataset(forget_train, tokenizer, max_length=max_length)
retain_train = process_train_dataset(retain_train, tokenizer, max_length=max_length)

Filter:   0%|          | 0/2355 [00:00<?, ? examples/s]

Map:   0%|          | 0/2355 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
from copy import deepcopy
import torch.nn.functional as F


def create_mcq(
    record: Dict, tokenizer: AutoTokenizer, max_length: int, context: str = ""
):
    """Turn one multiple-choice record into fixed-length tensors.

    The record is modified in place and also returned. The full text comes
    from the module-level ``completion_func`` and the prompt from
    ``create_prompt``; the completion is whatever follows the prompt.

    Args:
        record: MCQ record with at least a "question" entry (other fields are
            consumed by the prompt helpers).
        tokenizer: Tokenizer used separately for prompt and completion; the
            completion is tokenized without special tokens.
        max_length: Final length of every produced tensor. Sequences longer
            than this are truncated on the right.
        context: Text prepended to the question before prompting.

    Returns:
        The same ``record`` with "prompt", "text", "completion", "input_ids",
        "labels", "attention_mask", "prompt_mask", "completion_mask",
        "completion_byte_len" and "length" added. "length" is the number of
        real (unpadded) tokens actually kept, never more than ``max_length``.
    """
    record["question"] = context + record["question"]
    text = completion_func(record)
    prompt = create_prompt(record)

    # Assumes `text` starts with `prompt`; the remainder is the completion.
    completion = text[len(prompt) :]

    record["prompt"] = prompt
    record["text"] = text
    record["completion"] = completion

    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids[0]
    completion_ids = tokenizer(
        completion, return_tensors="pt", add_special_tokens=False
    ).input_ids[0]

    # Truncate explicitly. Previously an over-long sequence was cropped as a
    # side effect of a negative pad width while "length" still reported the
    # uncropped size.
    input_ids = torch.cat([prompt_ids, completion_ids])[:max_length]
    labels = input_ids.clone()
    seq_len = input_ids.size(0)
    n_prompt = len(prompt_ids)

    attention_mask = torch.ones_like(input_ids)
    prompt_mask = torch.zeros_like(input_ids)
    prompt_mask[:n_prompt] = 1

    completion_mask = torch.zeros_like(input_ids)
    completion_mask[n_prompt:] = 1

    # Pad on the RIGHT up to max_length: ids and masks with 0, labels with
    # -100 so padded positions are ignored by the loss.
    padding = (0, max_length - seq_len)

    input_ids = F.pad(input_ids, padding, value=0)
    labels = F.pad(labels, padding, value=-100)
    attention_mask = F.pad(attention_mask, padding, value=0)
    prompt_mask = F.pad(prompt_mask, padding, value=0)
    completion_mask = F.pad(completion_mask, padding, value=0)

    record["input_ids"] = input_ids
    record["labels"] = labels
    record["attention_mask"] = attention_mask
    record["completion_byte_len"] = len(completion.encode("utf-8"))
    record["prompt_mask"] = prompt_mask
    record["completion_mask"] = completion_mask
    record["length"] = seq_len

    return record


def expand_mcq_records(records: List[Dict], expand_choices: bool = True, **kwargs):
    """Build tokenized MCQ records, optionally one per answer choice.

    Args:
        records: MCQ records with "choices" and "answer" entries.
        expand_choices: When True, every record yields one variant per choice,
            rendered as if that choice were the answer; the true answer is
            restored under "answer" and the rendered choice is stored under
            "selected_answer". When False, each record yields a single variant
            rendered with its own answer.
        **kwargs: Forwarded unchanged to ``create_mcq``.

    Returns:
        A new list of processed copies; the input records are not modified.
    """
    expanded = []
    for source in records:
        choice_count = len(source["choices"])

        if expand_choices:
            for candidate in range(choice_count):
                variant = deepcopy(source)
                true_answer = source["answer"]
                # Render the prompt with `candidate` as the answer...
                variant["answer"] = candidate
                variant = create_mcq(variant, **kwargs)
                # ...then put the real answer back and remember the candidate.
                variant["answer"] = true_answer
                variant["selected_answer"] = candidate
                expanded.append(variant)
        else:
            expanded.append(create_mcq(deepcopy(source), **kwargs))

    return expanded

In [8]:
def expand_corpus_records(records: List[Dict]):
    """Convert the token fields of corpus records to tensors, in place.

    "input_ids", "labels" and "attention_mask" are each replaced by
    ``torch.tensor(...)`` of their current value, and "length" is set to the
    size of the first dimension of "input_ids".

    Returns:
        The same list object that was passed in.
    """
    tensor_fields = ("input_ids", "labels", "attention_mask")
    for record in records:
        for field in tensor_fields:
            record[field] = torch.tensor(record[field])
        record["length"] = record["input_ids"].size(0)
    return records

In [None]:
# MCQ records are padded to 1024 tokens. The corpus records below were already
# tokenized in an earlier cell, so this value only affects the create_mcq calls.
max_length = 1024

# Materialize the datasets as plain lists of dicts.
forget_train_records = forget_train.to_list()
retain_train_records = retain_train.to_list()
forget_val_records = forget_val.to_list()
retain_val_records = retain_val.to_list()

# Corpus records: convert token fields back to tensors and add "length".
forget_train_records = expand_corpus_records(forget_train_records)
retain_train_records = expand_corpus_records(retain_train_records)
# MCQ records: one tokenized variant per answer choice (expand_choices defaults to True).
retain_val_records = expand_mcq_records(
    retain_val_records, tokenizer=tokenizer, max_length=max_length
)
forget_val_records = expand_mcq_records(
    forget_val_records, tokenizer=tokenizer, max_length=max_length
)

# Fine-tuning set: a single variant per question, rendered with its own answer.
forget_val_1_records_train = expand_mcq_records(
    forget_val_1.to_list(),
    tokenizer=tokenizer,
    max_length=max_length,
    expand_choices=False,
)
# Evaluation sets: the same two file groups expanded to one variant per choice.
forget_val_1_records = expand_mcq_records(
    forget_val_1.to_list(), tokenizer=tokenizer, max_length=max_length
)
forget_val_2_records = expand_mcq_records(
    forget_val_2.to_list(), tokenizer=tokenizer, max_length=max_length
)

In [10]:
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm


def fix_seq_len(batch: Dict, keys: List[str]):
    """Trim the listed 2-D batch entries to the longest length in the batch.

    Args:
        batch: Mapping with a "length" entry plus the entries named in ``keys``;
            each named entry is sliced along its second dimension.
        keys: Names of the entries to trim.

    Returns:
        The same ``batch`` object with the named entries replaced by slices.
    """
    longest = max(batch["length"])
    batch.update({name: batch[name][:, :longest] for name in keys})
    return batch


def evaluate(
    model: AutoModelForCausalLM,
    records: List[Dict],
    batch_size: int = 8,
    n_choices: int = 4,
    normalize_loss: bool = False,
):
    """Multiple-choice accuracy by comparing completion losses.

    ``records`` must be laid out in consecutive groups of ``n_choices``
    variants of the same question (as produced by ``expand_mcq_records`` with
    ``expand_choices=True``). For each group the variant with the lowest
    summed completion loss is the prediction; it is compared with the
    "answer" field of the group's first record.

    Args:
        model: Causal LM to score with; switched to eval mode.
        records: Expanded MCQ records carrying "input_ids", "attention_mask",
            "labels", "answer", "completion_byte_len", "completion_mask" and
            "length".
        batch_size: Requested batch size; rounded up to a multiple of
            ``n_choices`` so a question never straddles two batches.
        n_choices: Number of variants per question.
        normalize_loss: When True, divide each variant's summed loss by its
            completion length in UTF-8 bytes before comparing.

    Returns:
        Fraction of questions answered correctly, or NaN when ``records`` is
        empty.

    Raises:
        ValueError: If ``len(records)`` is not a multiple of ``n_choices``.
    """
    if len(records) % n_choices != 0:
        # Without this the final batch fails with an opaque reshape error.
        raise ValueError(
            f"Number of records ({len(records)}) must be a multiple of "
            f"n_choices ({n_choices})"
        )

    # round up to nearest multiple of n_choices
    batch_size = (batch_size + n_choices - 1) // n_choices * n_choices
    original_fields = ["input_ids", "attention_mask", "labels"]

    aux_fields = ["answer", "completion_byte_len", "completion_mask", "length"]
    fields = original_fields + aux_fields
    # Keep only collatable fields (strings etc. are dropped before batching).
    dataset = [{k: v for k, v in rec.items() if k in fields} for rec in records]

    # shuffle must stay False: grouping relies on the record order.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = fix_seq_len(batch, original_fields + ["completion_mask"])
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            completion_mask = batch["completion_mask"].to(device)
            attention_mask = batch["attention_mask"].to(device).bool()
            completion_byte_len = batch["completion_byte_len"].to(device)

            answers = batch["answer"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            logits = outputs.logits
            # Score completion tokens only: prompt and padding are ignored.
            labels[~completion_mask.bool()] = -100
            labels[~attention_mask.bool()] = -100

            # Standard causal shift: position t predicts token t + 1.
            shifted_logits = logits[:, :-1, :].contiguous()
            shifted_labels = labels[:, 1:].contiguous()

            loss = F.cross_entropy(
                shifted_logits.view(-1, shifted_logits.size(-1)),
                shifted_labels.view(-1),
                reduction="none",
            )
            loss = loss.view(shifted_labels.size(0), shifted_labels.size(1))
            loss_by_sample = loss.sum(dim=1)

            if normalize_loss:
                loss_by_sample = loss_by_sample / completion_byte_len
            # One row per question, one column per candidate answer.
            loss_by_sample = loss_by_sample.view(-1, n_choices)

            # Every variant of a question stores the same true answer.
            answers = answers[::n_choices]

            pred = loss_by_sample.argmin(dim=1)

            correct += (pred == answers).sum().item()
            total += len(answers)

    if total == 0:
        # Nothing was evaluated; accuracy is undefined rather than an error.
        return float("nan")

    accuracy = correct / total
    return accuracy

In [None]:
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments
from unlearn_order.utils import log_1_minus_p_loss, my_loss
from unlearn_order.eval import eval_dataset as my_eval_dataset


def train_step_gd(
    model: AutoModelForCausalLM,
    forget_batch: Dict,
    retain_batch: Dict,
    forget_alpha: float = 0.1,
    use_log_1_minus_p: bool = True,
):
    """Accumulate gradients for one forget/retain batch pair.

    The forget loss is back-propagated before the retain forward pass runs.
    No optimizer step is taken here; gradients are only accumulated.

    Args:
        model: Model being unlearned.
        forget_batch: Batch from the forget set ("input_ids",
            "attention_mask", "labels", "length").
        retain_batch: Batch from the retain set, same layout.
        forget_alpha: Weight applied to the forget loss.
        use_log_1_minus_p: Passed to ``my_loss`` as ``is_away`` for the forget
            batch. When False the forget loss is negated instead.

    Returns:
        Dict of detached tensors: "loss" (weighted forget + retain),
        "forget_loss" (already weighted) and "retain_loss".
    """
    tensor_keys = ["input_ids", "attention_mask", "labels"]
    forget_batch = fix_seq_len(forget_batch, tensor_keys)
    retain_batch = fix_seq_len(retain_batch, tensor_keys)

    forget_inputs = {name: forget_batch[name].to(device) for name in tensor_keys}
    retain_inputs = {name: retain_batch[name].to(device) for name in tensor_keys}

    # With is_away the loss is used as-is; otherwise it is negated.
    sign = 1 if use_log_1_minus_p else -1
    forget_loss = my_loss(model, is_away=use_log_1_minus_p, **forget_inputs) * sign

    # check if nan, if so set to 0
    # forget_loss = torch.where(
    #     torch.isnan(forget_loss), torch.tensor(0.0, device=device), forget_loss
    # )
    forget_loss = forget_loss * forget_alpha
    forget_loss.backward()

    retain_loss = my_loss(model, is_away=False, **retain_inputs)
    retain_loss.backward()

    forget_loss = forget_loss.detach()
    retain_loss = retain_loss.detach()

    return {
        "loss": (forget_loss + retain_loss).detach(),
        "forget_loss": forget_loss,
        "retain_loss": retain_loss,
    }

In [12]:
# Sanity check: show which fields a processed corpus record carries.
forget_train_records[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'length'])

In [13]:
def train_epoch_gd(
    model: AutoModelForCausalLM,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    forget_train_records: List[Dict],
    retain_train_records: List[Dict],
    batch_size: int = 4,
    forget_alpha: float = 0.1,
    lr: float = 3e-5,
    log_steps: int = 50,
    grad_accum_steps: int = 1,
    use_log_1_minus_p: bool = True,
):
    """Run one epoch of paired forget/retain gradient updates.

    Forget and retain batches are drawn in lockstep from two independently
    shuffled loaders; the epoch ends when the shorter one is exhausted.

    Args:
        model: Model to train; switched to train mode.
        optimizer: Optimizer stepped every ``grad_accum_steps`` batches.
        epoch: Epoch index, used only in log lines.
        forget_train_records: Records for the forget objective.
        retain_train_records: Records for the retain objective.
        batch_size: Batch size of both loaders.
        forget_alpha: Weight of the forget loss (see ``train_step_gd``).
        lr: Unused here (the optimizer already carries its learning rate);
            kept for call compatibility.
        log_steps: Print a log line every this many steps.
        grad_accum_steps: Number of batches accumulated per optimizer step.
        use_log_1_minus_p: Forwarded to ``train_step_gd``.

    Returns:
        List with one loss dict per step, as returned by ``train_step_gd``.
    """
    model.train()
    forget_dataloader = DataLoader(
        forget_train_records, batch_size=batch_size, shuffle=True
    )
    retain_dataloader = DataLoader(
        retain_train_records, batch_size=batch_size, shuffle=True
    )
    # zip() stops at the shorter loader, so that is the real step count.
    n_steps = min(len(forget_dataloader), len(retain_dataloader))

    loss_traj = []
    # Batches whose gradients are accumulated but not yet applied.
    pending_batches = 0
    for step, (forget_batch, retain_batch) in (
        pbar := tqdm(
            enumerate(zip(forget_dataloader, retain_dataloader)), total=n_steps
        )
    ):
        loss_dict = train_step_gd(
            model, forget_batch, retain_batch, forget_alpha, use_log_1_minus_p
        )
        pending_batches += 1

        # Diagnostics only: non-finite losses are reported but the step is
        # NOT skipped (the `continue` statements that used to sit here only
        # continued these reporting loops and never affected training).
        for k, v in loss_dict.items():
            if torch.isnan(v):
                print(f"{step}: Loss {k} is NaN")

        for k, v in loss_dict.items():
            if torch.isinf(v):
                print(f"{step}: Loss {k} is Inf")
                print(forget_batch["labels"])

        pbar.set_postfix(
            {
                "Loss": loss_dict["loss"].item(),
                "Forget Loss": loss_dict["forget_loss"].item(),
                "Retain Loss": loss_dict["retain_loss"].item(),
            }
        )

        if (step + 1) % grad_accum_steps == 0:
            # torch.nn.utils.clip_grad_norm_(optimizer.param_groups[0]["params"], 20)
            optimizer.step()
            optimizer.zero_grad()
            pending_batches = 0

        loss_traj.append(loss_dict)
        if (step + 1) % log_steps == 0:
            print(
                f"Epoch {epoch}, Step {step}, Loss {loss_dict['loss']}, Forget Loss {loss_dict['forget_loss']}, Retain Loss {loss_dict['retain_loss']}"
            )

    if pending_batches:
        # Apply the trailing partial accumulation window. Otherwise its
        # gradients would silently carry over into the next epoch (or be lost
        # after the final one).
        optimizer.step()
        optimizer.zero_grad()

    return loss_traj


def train_gd(
    model: AutoModelForCausalLM,
    n_epochs: int,
    forget_train_records: List[Dict],
    retain_train_records: List[Dict],
    eval_records_dict: Dict[str, List[Dict]],
    batch_size: int = 4,
    forget_alpha: float = 0.1,
    lr: float = 3e-5,
    log_steps: int = 50,
    eval_at_start: bool = True,
    grad_accum_steps: int = 1,
    use_log_1_minus_p: bool = False,
    eval: bool = True,
    train_layers: Optional[List[int]] = None,
):
    """Unlearn with paired forget/retain updates on selected MLP blocks.

    Every model parameter is frozen except the MLP parameters of the chosen
    decoder layers (``model.model.layers[i].mlp``), which are optimized with
    Adam. The ``requires_grad`` flags are left in that state on return.

    Args:
        model: Model to train, modified in place.
        n_epochs: Number of epochs.
        forget_train_records: Records for the forget objective.
        retain_train_records: Records for the retain objective.
        eval_records_dict: Name -> expanded MCQ records, each scored with
            ``evaluate`` and printed.
        batch_size: Training batch size.
        forget_alpha: Weight of the forget loss.
        lr: Adam learning rate.
        log_steps: Logging interval forwarded to ``train_epoch_gd``.
        eval_at_start: Evaluate once before any training.
        grad_accum_steps: Batches accumulated per optimizer step.
        use_log_1_minus_p: Forwarded to the training step.
        eval: Master switch for all evaluation. The name shadows the built-in
            inside this function but is kept for caller compatibility.
        train_layers: Indices of the decoder layers whose MLP is trained.
            Defaults to layers 2 through 7 inclusive.

    Returns:
        The same ``model`` object.
    """

    def run_eval(prefix: str):
        # Print accuracy for every named evaluation set, when enabled.
        if eval:
            for eval_name, eval_records in eval_records_dict.items():
                acc = evaluate(model, eval_records, batch_size=8, normalize_loss=False)
                print(f"{prefix} {eval_name} Accuracy: {acc}")

    if eval_at_start:
        run_eval("Start")

    layers = list(range(2, 7 + 1)) if train_layers is None else list(train_layers)
    params = []
    for layer in layers:
        params.extend(model.model.layers[layer].mlp.parameters())

    # Freeze everything, then re-enable only the selected MLP parameters.
    for p in model.parameters():
        p.requires_grad = False

    for p in params:
        p.requires_grad = True

    optimizer = torch.optim.Adam(params, lr=lr)

    for epoch in range(n_epochs):
        train_epoch_gd(
            model,
            optimizer,
            epoch,
            forget_train_records,
            retain_train_records,
            batch_size,
            forget_alpha,
            lr=lr,
            log_steps=log_steps,
            grad_accum_steps=grad_accum_steps,
            use_log_1_minus_p=use_log_1_minus_p,
        )
        run_eval(f"Epoch {epoch}")
    return model

In [14]:
# model = train_gd(
#     model,
#     2,
#     forget_train_records,
#     retain_train_records,
#     eval_records_dict={
#         "Forget Val 1": forget_val_1_records,
#         "Forget Val 2": forget_val_2_records,
#         "Retain Val": retain_val_records,
#     },
#     batch_size=4,
#     forget_alpha=1,
#     eval_at_start=False,
#     log_steps=50,
#     lr=1e-5,
#     grad_accum_steps=8,
#     use_log_1_minus_p=True,
#     eval=True
# )

In [15]:
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments
from unlearn_order.utils import log_1_minus_p_loss, my_loss
from research_tools.utils import log_cuda_memory_usage


def train_step_ft(
    model: AutoModelForCausalLM,
    batch: Dict,
):
    """Accumulate gradients for one supervised fine-tuning batch.

    Only completion tokens contribute to the loss: every label outside the
    completion mask is set to -100 before the forward pass. No optimizer step
    is taken here.

    Args:
        model: Model being fine-tuned.
        batch: Collated MCQ batch with "input_ids", "attention_mask",
            "labels", "completion_mask" and "length".

    Returns:
        ``{"loss": <python float>}`` for this batch.
    """
    tensor_keys = ["input_ids", "attention_mask", "labels", "completion_mask"]
    batch = fix_seq_len(batch, tensor_keys)
    on_device = {name: batch[name].to(device) for name in tensor_keys}

    # only train on completions for multiple choice
    targets = on_device["labels"]
    outside_completion = ~on_device["completion_mask"].bool()
    targets[outside_completion] = -100

    output = model(
        input_ids=on_device["input_ids"],
        attention_mask=on_device["attention_mask"],
        labels=targets,
    )
    batch_loss = output.loss
    batch_loss.backward()

    return {"loss": batch_loss.detach().item()}

In [16]:
def train_epoch_ft(
    model: AutoModelForCausalLM,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    mcq_records: List[Dict],
    batch_size: int = 4,
    lr: float = 3e-5,
    log_steps: int = 50,
    grad_accum_steps: int = 1,
):
    """Run one fine-tuning epoch over shuffled MCQ records.

    Args:
        model: Model to train; switched to train mode.
        optimizer: Optimizer stepped every ``grad_accum_steps`` batches.
        epoch: Epoch index, used only in log lines.
        mcq_records: Tokenized MCQ training records.
        batch_size: Batch size.
        lr: Unused here (the optimizer already carries its learning rate);
            kept for call compatibility.
        log_steps: Print a log line every this many steps.
        grad_accum_steps: Number of batches accumulated per optimizer step.

    Returns:
        List with one loss dict per step, as returned by ``train_step_ft``.
    """
    model.train()
    dataloader = DataLoader(mcq_records, batch_size=batch_size, shuffle=True)

    loss_traj = []
    # Batches whose gradients are accumulated but not yet applied.
    pending_batches = 0
    for step, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        loss_dict = train_step_ft(model, batch)
        pending_batches += 1

        if (step + 1) % grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            pending_batches = 0

        loss_traj.append(loss_dict)
        if (step + 1) % log_steps == 0:
            print(f"Epoch {epoch}, Step {step}, Loss {loss_dict['loss']}")

    if pending_batches:
        # Apply the trailing partial accumulation window so its gradients do
        # not leak into the next epoch or get dropped after the last one.
        optimizer.step()
        optimizer.zero_grad()

    return loss_traj


def train_ft(
    model: AutoModelForCausalLM,
    n_epochs: int,
    mcq_records: List[Dict],
    forget_val_1_records: List[Dict],
    forget_val_2_records: List[Dict],
    batch_size: int = 4,
    lr: float = 3e-5,
    log_steps: int = 50,
    eval_at_start: bool = True,
    grad_accum_steps: int = 1,
):
    """Fine-tune on MCQ records and report forget-set accuracy each epoch.

    An Adam optimizer is built over ``model.parameters()``. After every epoch
    (and optionally before the first) both forget validation sets are scored
    with ``evaluate`` and the accuracies are printed.

    Args:
        model: Model to fine-tune, modified in place.
        n_epochs: Number of epochs.
        mcq_records: Training records (one tokenized variant per question).
        forget_val_1_records: First expanded evaluation set.
        forget_val_2_records: Second expanded evaluation set.
        batch_size: Training batch size.
        lr: Adam learning rate.
        log_steps: Logging interval forwarded to ``train_epoch_ft``.
        eval_at_start: Evaluate once before any training.
        grad_accum_steps: Batches accumulated per optimizer step.

    Returns:
        The same ``model`` object.
    """

    def _forget_accuracies():
        # Score set 1 first, then set 2, with the fixed evaluation settings.
        return tuple(
            evaluate(model, eval_records, batch_size=8, normalize_loss=False)
            for eval_records in (forget_val_1_records, forget_val_2_records)
        )

    if eval_at_start:
        forget_acc_1, forget_acc_2 = _forget_accuracies()

        print(f"Initial Forget accuracy 1: {forget_acc_1}")
        print(f"Initial Forget accuracy 2: {forget_acc_2}")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        train_epoch_ft(
            model,
            optimizer,
            epoch,
            mcq_records,
            batch_size,
            lr=lr,
            log_steps=log_steps,
            grad_accum_steps=grad_accum_steps,
        )
        forget_acc_1, forget_acc_2 = _forget_accuracies()

        print(f"Epoch {epoch}, Forget accuracy 1: {forget_acc_1}")
        print(f"Epoch {epoch}, Forget accuracy 2: {forget_acc_2}")

    return model

In [17]:
# Local checkpoint directory (../models/gemma_AB). The save lines below are
# commented out, so this cell only LOADS an existing checkpoint from that path.
path = Path("../models/gemma_AB")
# path.mkdir(exist_ok=True, parents=True)
# model.save_pretrained(path)

# Load the saved model in bfloat16 and move it to the selected device.
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16).to(
    device
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
# LoRA fine-tuning with rank 64.
from peft import LoraConfig, get_peft_model

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
    lora_alpha=16,
    r=64,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)
# NOTE(review): the next cell passes `model`, not `peft_model`, to train_ft.
# Presumably get_peft_model injects the adapters into `model` in place, so the
# adapters are still trained -- confirm against the PEFT documentation.
peft_model = get_peft_model(model, peft_config)

In [None]:
# Fine-tune for 10 epochs on the unexpanded forget-val-1 questions, then report
# accuracy on the expanded forget-val-1 (seen) and forget-val-2 (held-out) sets
# after every epoch. The initial evaluation is skipped (eval_at_start=False).
train_ft(
    model,
    10,
    forget_val_1_records_train,
    forget_val_1_records,
    forget_val_2_records,
    batch_size=4,
    lr=1e-5,
    log_steps=50,
    eval_at_start=False,
    grad_accum_steps=1,
)

51it [00:10,  5.80it/s]

Epoch 0, Step 49, Loss 0.154322549700737


101it [00:20,  5.06it/s]

Epoch 0, Step 99, Loss 0.10956503450870514


151it [00:30,  5.42it/s]

Epoch 0, Step 149, Loss 0.05117962136864662


157it [00:31,  4.95it/s]
  0%|          | 0/314 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
100%|██████████| 314/314 [00:37<00:00,  8.43it/s]
100%|██████████| 79/79 [00:09<00:00,  8.18it/s]


Epoch 0, Forget accuracy 1: 0.5238853503184714
Epoch 0, Forget accuracy 2: 0.3885350318471338


51it [00:09,  5.59it/s]

Epoch 1, Step 49, Loss 0.0785115510225296


101it [00:20,  5.68it/s]

Epoch 1, Step 99, Loss 0.1316046565771103


150it [00:29,  5.43it/s]

Epoch 1, Step 149, Loss 0.2522483468055725


157it [00:31,  5.03it/s]
100%|██████████| 314/314 [00:37<00:00,  8.41it/s]
100%|██████████| 79/79 [00:09<00:00,  8.17it/s]


Epoch 1, Forget accuracy 1: 0.6656050955414012
Epoch 1, Forget accuracy 2: 0.5095541401273885


50it [00:09,  5.57it/s]

Epoch 2, Step 49, Loss 0.038866233080625534


101it [00:19,  5.85it/s]

Epoch 2, Step 99, Loss 0.1563025414943695


151it [00:29,  5.18it/s]

Epoch 2, Step 149, Loss 0.07346101850271225


157it [00:30,  5.08it/s]
100%|██████████| 314/314 [00:37<00:00,  8.41it/s]
100%|██████████| 79/79 [00:09<00:00,  8.16it/s]


Epoch 2, Forget accuracy 1: 0.7738853503184714
Epoch 2, Forget accuracy 2: 0.4713375796178344


51it [00:10,  5.91it/s]

Epoch 3, Step 49, Loss 0.032484330236911774


100it [00:19,  5.07it/s]

Epoch 3, Step 99, Loss 0.029517926275730133


150it [00:29,  5.25it/s]

Epoch 3, Step 149, Loss 0.0001296822592848912


157it [00:31,  5.05it/s]
100%|██████████| 314/314 [00:37<00:00,  8.40it/s]
100%|██████████| 79/79 [00:09<00:00,  8.17it/s]


Epoch 3, Forget accuracy 1: 0.9012738853503185
Epoch 3, Forget accuracy 2: 0.5605095541401274


51it [00:10,  5.56it/s]

Epoch 4, Step 49, Loss 8.151922520482913e-05


100it [00:19,  5.66it/s]

Epoch 4, Step 99, Loss 0.0003849844797514379


151it [00:29,  4.68it/s]

Epoch 4, Step 149, Loss 0.003425070783123374


157it [00:31,  5.03it/s]
100%|██████████| 314/314 [00:37<00:00,  8.41it/s]
100%|██████████| 79/79 [00:09<00:00,  8.17it/s]


Epoch 4, Forget accuracy 1: 0.9522292993630573
Epoch 4, Forget accuracy 2: 0.5668789808917197


51it [00:10,  5.29it/s]

Epoch 5, Step 49, Loss 0.0010474314913153648


100it [00:20,  5.05it/s]

Epoch 5, Step 99, Loss 0.0001665013114688918


151it [00:30,  5.17it/s]

Epoch 5, Step 149, Loss 1.3582607607531827e-05


157it [00:31,  5.00it/s]
100%|██████████| 314/314 [00:37<00:00,  8.40it/s]
100%|██████████| 79/79 [00:09<00:00,  8.16it/s]


Epoch 5, Forget accuracy 1: 0.9904458598726115
Epoch 5, Forget accuracy 2: 0.5796178343949044


51it [00:10,  4.93it/s]

Epoch 6, Step 49, Loss 0.00017888778529595584


100it [00:19,  4.50it/s]

Epoch 6, Step 99, Loss 0.011980755254626274


151it [00:30,  5.41it/s]

Epoch 6, Step 149, Loss 0.00011039323726436123


157it [00:31,  5.00it/s]
100%|██████████| 314/314 [00:37<00:00,  8.41it/s]
100%|██████████| 79/79 [00:09<00:00,  8.16it/s]


Epoch 6, Forget accuracy 1: 0.9936305732484076
Epoch 6, Forget accuracy 2: 0.554140127388535


50it [00:09,  5.14it/s]

Epoch 7, Step 49, Loss 0.00018812064081430435


101it [00:19,  5.68it/s]

Epoch 7, Step 99, Loss 7.333812391152605e-05


151it [00:30,  4.47it/s]

Epoch 7, Step 149, Loss 5.940508344792761e-05


157it [00:31,  5.01it/s]
100%|██████████| 314/314 [00:37<00:00,  8.40it/s]
100%|██████████| 79/79 [00:09<00:00,  8.17it/s]


Epoch 7, Forget accuracy 1: 0.9984076433121019
Epoch 7, Forget accuracy 2: 0.5477707006369427


35it [00:07,  5.59it/s]

In [None]:
# Report GPU availability (via the project helper) and CUDA memory usage.
from research_tools.gpu import get_gpus_available

print(get_gpus_available())
# The three values below are converted from bytes to GiB (divide by 1024**3):
# currently allocated, currently reserved, and peak allocated memory.
print(torch.cuda.memory_allocated() / 1024**3)
print(torch.cuda.memory_reserved() / 1024**3)
print(torch.cuda.max_memory_allocated() / 1024**3)

sh: 1: myrenew: not found


[]
16.71030044555664
68.291015625
40.49720287322998
