In [1]:
# from circuit_breaking.src import *
%load_ext autoreload
%autoreload 2
import torch
from functools import partial
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm.auto import tqdm
print(os.getcwd())
import sys
from transformers import AutoTokenizer, AutoModelForCausalLM
import contextlib
import einops
import wandb
import pandas as pd
import dotenv
import bitsandbytes as bnb
from collections import defaultdict
from datasets import Dataset, DatasetDict
dotenv.load_dotenv()

from huggingface_hub import login

# Credentials are read from the process environment (after dotenv.load_dotenv()
# has run above), so no secret is written into the notebook itself.
HF_ACCESS_TOKEN = os.environ.get("HF_ACCESS_TOKEN")
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")

if not HF_ACCESS_TOKEN:
    print("Hugging Face access token not found in environment variables.")
else:
    login(token=HF_ACCESS_TOKEN)
    print("Successfully authenticated with Hugging Face.")

if not WANDB_API_KEY:
    print("Weights & Biases API key not found in environment variables.")
else:
    # NOTE(review): this only re-exports the value that was just read from the
    # environment; no explicit wandb login call happens in this cell.
    os.environ["WANDB_API_KEY"] = WANDB_API_KEY
    print("Successfully authenticated with Weights & Biases.")


/root/sae-editing
Successfully authenticated with Hugging Face.
Successfully authenticated with Weights & Biases.


In [2]:
# Base model identifiers used by the tokenizer and the evaluation tasks.
model_name_or_path = "google/gemma-2-9b"
model_type = "gemma-2"
other_model_type = "gemma2_9b"

# Checkpoint under evaluation: keep exactly one assignment active. Setting it to
# None makes the loading code below fall back to the base model.
# pretrained_path = "/data/huggingface/models--google--gemma-2-9b/snapshots/33c193028431c2fde6c6e51f29e6f17b60cbfac6/"
# pretrained_path = "/data/huggingface/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819/"
# pretrained_path = "gemma-2-rmu-fullrank"
# pretrained_path = "PhillipGuo/gemma-2-rmu-fullrank"
# pretrained_path = "PhillipGuo/gemma-2-sae-cb-fullrank"
# pretrained_path = "PhillipGuo/gemma-2-gd-mc-fullrank"
# pretrained_path = "gemma-2-sae-masked-gd-mc-6-fullrank"
pretrained_path = "gemma-2-sae-gd-2-fullrank"
# pretrained_path = "PhillipGuo/gemma-2-9b-rmu-lora-2"
# pretrained_path = "PhillipGuo/gemma-2-9b-sae-cb-lora"
# pretrained_path = None

# A checkpoint is treated as a LoRA adapter purely because its path contains "lora".
is_lora = pretrained_path is not None and "lora" in pretrained_path

def _load_tokenizer(padding_side):
    """Load the base-model tokenizer, using EOS as the pad token on the given side."""
    tok = AutoTokenizer.from_pretrained(model_name_or_path)
    tok.pad_token_id = tok.eos_token_id
    tok.padding_side = padding_side
    return tok


# Two otherwise identical tokenizers that differ only in where padding is added.
tokenizer = _load_tokenizer("right")
left_tokenizer = _load_tokenizer("left")

dtype = torch.bfloat16

# Weights come from the selected checkpoint, or from the base model when no
# checkpoint is selected.
checkpoint_path = model_name_or_path if pretrained_path is None else pretrained_path

if not is_lora:
    model = AutoModelForCausalLM.from_pretrained(checkpoint_path, torch_dtype=dtype)
else:
    from peft import AutoPeftModel

    # Fold the adapter into the base weights, then mark every parameter trainable.
    model = AutoPeftModel.from_pretrained(checkpoint_path, torch_dtype=dtype)
    model = model.merge_and_unload()
    for param in model.parameters():
        param.requires_grad = True

model.cuda()

# Hard-coded architecture constants.
# NOTE(review): presumably these match google/gemma-2-9b; they are not read from model.config.
n_layers = 42
n_heads = 16
n_kv_heads = 8

# Size of each hooked component, written as products of its two dimensions.
param_count_dict = {
    "attn.hook_q": 3584*4096,
    "attn.hook_k": 3584*2048,
    "attn.hook_v": 3584*2048,
    "attn.hook_result": 4096*3584,
    "mlp.hook_pre": 3584 * 14336,
    "mlp.hook_post": 14336 * 3584,
    "mlp.hook_gate": 3584 * 14336,
}
mmlu_batch_size = 2

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
from tasks.wmdp.WMDP_MCTask import WMDP_MCTask, WMDP_DedupedTask
from tasks.wmdp.WMDP_RelearnTask import WMDP_RelearnTask
from tasks.general_capabilities.MCTask_redo import run_general_evals
# Batch size used when iterating the WMDP-bio multiple-choice task.
batch_size = 8
bio_mc_task = WMDP_MCTask(batch_size=batch_size, tokenizer=tokenizer, subset="wmdp-bio", shuffle=True)
# Number of full batches in the task's dataset (a trailing partial batch is dropped by the floor division).
num_iters = len(bio_mc_task.dataset) // batch_size

In [4]:
# bio_mc_deduped_task = WMDP_DedupedTask(batch_size=batch_size, tokenizer=tokenizer, subset="wmdp-bio", shuffle=True)
# num_train_iters = len(bio_mc_deduped_task.train_dataset) // batch_size
# num_test_iters = (len(bio_mc_deduped_task.test_dataset) * 4) // batch_size

# print(bio_mc_deduped_task.get_test_accuracy(model, use_test_data=True, n_iters=num_test_iters, continuous=True))
# print(bio_mc_deduped_task.get_test_loss(model, n_iters=5))

# print(bio_mc_deduped_task.get_test_accuracy(model, use_test_data=True, n_iters=num_test_iters, continuous=False))
# print(bio_mc_deduped_task.get_test_accuracy(model, use_test_data=True, num_iters=num_test_iters, continuous=False))

## Create Datasets

In [5]:
# Gate for the dataset-building cells below: they only run (and push datasets to
# the Hugging Face Hub) when this is set to True.
create_datasets = False

In [6]:
if create_datasets:
    # Answer letters are written with a leading space (presumably so they tokenize
    # the way they would right after "Answer:" — TODO confirm).
    answers = [' A', ' B', ' C', ' D']
    # Keep only the last token id of each answer string: one id per letter.
    tokens = tokenizer(answers, return_tensors="pt", add_special_tokens=False).input_ids[:, -1]
    def format_row(row):
        """Render one question row as a four-option multiple-choice prompt ending in "Answer:".

        `row` must provide a "question" and at least four "choices".
        """
        header = "The following are multiple choice questions (with answers) about biology.\n\n"
        option_lines = "".join(
            f"{letter}. {row['choices'][position]}\n"
            for position, letter in enumerate("ABCD")
        )
        return f"{header}{row['question']}\n{option_lines}Answer:"

    from tasks.inference_utils import get_final_logits

    from datasets import load_dataset
    # Original WMDP test sets; their question text is used below to label each
    # deduplicated row as bio or cyber.
    original_bio_df = load_dataset("cais/wmdp", "wmdp-bio", split="test").to_pandas()
    original_cyber_df = load_dataset("cais/wmdp", "wmdp-cyber", split="test").to_pandas()

    # Per-split datasets keyed "split{idx}", filled by the loop over the split files below.
    bio_train_datasets = {}
    cyber_train_datasets = {}
    import random
    # Fixed seed so the randomly drawn alternative answers are reproducible.
    random.seed(42)
    def get_random_alternative_answer_idx(row):
        """Return a randomly chosen choice index that differs from the row's correct answer."""
        correct_idx = row["answer"]
        wrong_indices = []
        for idx, _ in enumerate(row["choices"]):
            if idx != correct_idx:
                wrong_indices.append(idx)
        # Drawn from the module-level `random` state, which the caller seeds.
        return random.choice(wrong_indices)

    # Number of prompts scored per forward pass.
    eval_batch_size = 8

    # For each deduplicated split file: label rows as bio/cyber, record the model's
    # probability of the correct answer letter, draw a random wrong answer, and
    # store the result as one entry of the per-domain dataset dict.
    for split_idx in [0, 1, 2, 3, 4]:
        # tasks/wmdp/data/mcq_split_0.jsonl
        mcq_formatted_data = pd.read_json(f"tasks/wmdp/data/mcq_split_{split_idx}.jsonl", lines=True)
        # Boolean masks: does this row's question appear in the original bio / cyber set?
        bio_indices = mcq_formatted_data["question"].isin(original_bio_df["question"])
        cyber_indices = mcq_formatted_data["question"].isin(original_cyber_df["question"])
        # display(bio_indices + cyber_indices)
        # display(mcq_formatted_data[~(bio_indices + cyber_indices)])
        # "Missed" counts rows whose question matched neither original set.
        print("Bio dataset is size ", len(mcq_formatted_data[bio_indices]), "and cyber dataset is size ", len(mcq_formatted_data[cyber_indices]), "Missed ", len(mcq_formatted_data[~(bio_indices + cyber_indices)]), "data points in train split ", split_idx)

        # Copies, so the new columns added below do not write into mcq_formatted_data.
        bio_train_df = mcq_formatted_data[bio_indices].copy()
        cyber_train_df = mcq_formatted_data[cyber_indices].copy()
        for df in (bio_train_df, cyber_train_df):
            all_correct_probs = []
            for batch_idx in range(0, len(df), eval_batch_size):
                mcq_formatted_prompt = df.iloc[batch_idx:batch_idx+eval_batch_size].apply(format_row, axis=1).tolist()
                batch_answer_indices = df["answer"].tolist()[batch_idx:batch_idx+eval_batch_size]
                # Token id of the correct answer letter for every prompt in the batch.
                token_labels = tokens[batch_answer_indices]
                with torch.no_grad():
                    # presumably one row of next-token logits per prompt — TODO confirm against get_final_logits
                    logits = get_final_logits(model, tokenizer, mcq_formatted_prompt)
                probs = torch.softmax(logits, dim=-1)
                # Probability assigned to the correct letter, one value per prompt.
                correct_probs = probs[range(len(logits)), token_labels]
                all_correct_probs.append(correct_probs)
            # NOTE(review): torch.cat raises on an empty list, so a split with no
            # rows for one domain would fail here — confirm every split has both.
            all_correct_probs = torch.cat(all_correct_probs).cpu().float()
            # Column name is specific to the model family being scored.
            df[f"{model_type}_correct_probs"] = all_correct_probs

        bio_train_df["alternative_answer"] = bio_train_df.apply(get_random_alternative_answer_idx, axis=1)
        bio_train_datasets[f"split{split_idx}"] = Dataset.from_pandas(bio_train_df)
        cyber_train_df["alternative_answer"] = cyber_train_df.apply(get_random_alternative_answer_idx, axis=1)
        cyber_train_datasets[f"split{split_idx}"] = Dataset.from_pandas(cyber_train_df)

    # Create separate DatasetDicts for bio and cyber
    bio_dataset = DatasetDict(bio_train_datasets)
    cyber_dataset = DatasetDict(cyber_train_datasets)

    # Publish both domains to the same Hub repository, under the given second-argument names.
    bio_dataset.push_to_hub("PhillipGuo/wmdp-deduped-unlearn", "wmdp-bio-retrain")
    cyber_dataset.push_to_hub("PhillipGuo/wmdp-deduped-unlearn", "wmdp-cyber-retrain")

In [7]:
if create_datasets:
    def format_prompt_question(row):
        """Render a row as free-form "Question: ...\\nAnswer: ..." text using the correct choice."""
        correct_choice_idx = row["answer"]
        question_text = row["question"]
        correct_choice = row["choices"][correct_choice_idx]
        return "Question: {}\nAnswer: {}".format(question_text, correct_choice)

    def get_answer_text(row):
        """Return the correct choice's text, prefixed with a single space."""
        correct_choice = row["choices"][row["answer"]]
        return " {}".format(correct_choice)
    # NOTE(review): mcq_formatted_data, bio_indices and load_dataset are not defined
    # in this cell; they are whatever the earlier dataset-creation cell left in the
    # kernel (its loop ends on the last split file), so this cell cannot run on its own.
    df = mcq_formatted_data[bio_indices]
    # Preview the first free-form prompt and its answer text as a sanity check.
    prompts = df.apply(format_prompt_question, axis=1)
    print(prompts.tolist()[0])
    true_answers = df.apply(get_answer_text, axis=1)
    print(true_answers.tolist()[0])

    # Load back the first split of the bio dataset published by the cell above.
    dataset = load_dataset("PhillipGuo/wmdp-deduped-unlearn", "wmdp-bio-retrain", split="split0")


    def get_token_sequence_pos(tokenizer, prompt_list, token_strs, batch_size=64):
        """Locate the last occurrence of each substring's tokens inside its prompt's tokens.

        Prompts are tokenized in batches (with padding and the tokenizer's default
        special tokens); substrings are tokenized without special tokens. For each
        pair the prompt is scanned from the end for the substring's token sequence.

        Args:
            tokenizer: callable tokenizer exposing `padding_side`; it must return an
                object with `input_ids` (a 2-D tensor for prompts, a list of id
                lists for the substrings).
            prompt_list: list of prompt strings.
            token_strs: list of substrings, aligned index-by-index with `prompt_list`.
            batch_size: number of prompts tokenized per call.

        Returns:
            (start_positions, end_positions): two lists with one entry per prompt,
            with `end` exclusive. With right padding the positions are offsets from
            the start of the padded row. With left padding they are negative offsets
            from the end of the row. A pair of (-1, -1) means "not found".

        Notes:
            With left padding, a match that ends on the final token gets end == 0,
            which cannot be used directly as a negative slice bound, and a
            single-token match there gets start == -1. Callers should check both
            values before treating a result as "not found".
        """
        substring_start_positions = []
        substring_end_positions = []
        for i in tqdm(range(0, len(prompt_list), batch_size)):
            tokenized_prompts = tokenizer(prompt_list[i:i+batch_size], return_tensors="pt", padding=True)
            tokenized_substrings = tokenizer(token_strs[i:i+batch_size], add_special_tokens=False).input_ids

            for j, substring_ids in enumerate(tokenized_substrings):
                substring = torch.tensor(substring_ids)
                prompt = tokenized_prompts.input_ids[j]
                sub_len, prompt_len = len(substring), len(prompt)
                # Left-padded rows are addressed from the end, so shift by the row length.
                offset = prompt_len if tokenizer.padding_side == "left" else 0

                # Scan candidate start indices from the end to find the last occurrence.
                start, end = -1, -1
                for k in range(prompt_len - sub_len, -1, -1):
                    if torch.all(prompt[k:k+sub_len] == substring):
                        start, end = k - offset, k + sub_len - offset
                        break
                substring_start_positions.append(start)
                substring_end_positions.append(end)
        return substring_start_positions, substring_end_positions

In [8]:
if create_datasets:
    from datasets import load_dataset

    # Original WMDP test sets; their question text decides whether a deduplicated
    # row belongs to the bio or the cyber domain.
    original_bio_df = load_dataset("cais/wmdp", "wmdp-bio", split="test").to_pandas()
    original_cyber_df = load_dataset("cais/wmdp", "wmdp-cyber", split="test").to_pandas()

    def _load_partition(split_indices, partition_label):
        """Read the given deduplicated split files and return (bio_df, cyber_df).

        Each file tasks/wmdp/data/mcq_split_{idx}.jsonl is divided into rows whose
        question appears in the original bio set and rows whose question appears in
        the original cyber set; the per-file pieces are concatenated with a fresh
        index. A size report is printed for every file, labelled with `partition_label`.
        """
        bio_frames, cyber_frames = [], []
        for split_idx in split_indices:
            mcq_formatted_data = pd.read_json(f"tasks/wmdp/data/mcq_split_{split_idx}.jsonl", lines=True)
            bio_indices = mcq_formatted_data["question"].isin(original_bio_df["question"])
            cyber_indices = mcq_formatted_data["question"].isin(original_cyber_df["question"])
            # "Missed" counts rows that matched neither original set.
            print("Bio dataset is size ", len(mcq_formatted_data[bio_indices]), "and cyber dataset is size ", len(mcq_formatted_data[cyber_indices]), "Missed ", len(mcq_formatted_data[~(bio_indices | cyber_indices)]), f"data points in {partition_label} split ", split_idx)
            bio_frames.append(mcq_formatted_data[bio_indices])
            cyber_frames.append(mcq_formatted_data[cyber_indices])
        return pd.concat(bio_frames, ignore_index=True), pd.concat(cyber_frames, ignore_index=True)

    # Split file 0 is train, file 1 is validation, files 2-4 are test.
    bio_train_df, cyber_train_df = _load_partition([0], "train")
    bio_val_df, cyber_val_df = _load_partition([1], "val")
    bio_test_df, cyber_test_df = _load_partition([2, 3, 4], "test")

    # convert to huggingface dataset
    bio_train_dataset = Dataset.from_pandas(bio_train_df)
    cyber_train_dataset = Dataset.from_pandas(cyber_train_df)
    bio_val_dataset = Dataset.from_pandas(bio_val_df)
    cyber_val_dataset = Dataset.from_pandas(cyber_val_df)
    bio_test_dataset = Dataset.from_pandas(bio_test_df)
    cyber_test_dataset = Dataset.from_pandas(cyber_test_df)

    # Create separate DatasetDicts for bio and cyber
    bio_dataset = DatasetDict({
        "train": bio_train_dataset,
        "val": bio_val_dataset,
        "test": bio_test_dataset
    })

    cyber_dataset = DatasetDict({
        "train": cyber_train_dataset,
        "val": cyber_val_dataset,
        "test": cyber_test_dataset
    })
    bio_dataset.push_to_hub("PhillipGuo/wmdp-deduped", "wmdp-bio-retrain")
    cyber_dataset.push_to_hub("PhillipGuo/wmdp-deduped", "wmdp-cyber-retrain")

## Back to Relearning

In [5]:
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
def do_relearning(model, train_tasks, n_iters, grad_accum_steps=8, finetune_lora=False, lora_kwargs=None, learning_kwargs=None, eval_callback_fn=None):
    """Fine-tune `model` on a weighted mixture of tasks, logging every step to wandb.

    Each iteration accumulates gradients over `grad_accum_steps` micro-batches per
    task, takes one optimizer step, optionally steps a cosine LR schedule, and
    optionally runs an evaluation callback. `wandb.log` is called once per
    iteration with step `iter_idx + 1`.

    Args:
        model: the module to train. With `finetune_lora=True` it is wrapped by
            `get_peft_model` and moved to CUDA; otherwise all of its parameters
            are optimized with 8-bit AdamW.
        train_tasks: mapping of task name -> (task, weight). Each task must expose
            `get_train_loss(model)` returning a scalar loss tensor. The weight
            scales the gradient contribution only; logged losses are unweighted.
        n_iters: number of optimizer steps.
        grad_accum_steps: micro-batches accumulated per task per optimizer step.
        finetune_lora: train LoRA adapters instead of the full model.
        lora_kwargs: optional overrides for the LoRA defaults
            {'rank': 256, 'alpha': 32, 'dropout': 0.05, 'target_modules': 'all-linear'}.
        learning_kwargs: optional overrides for the optimizer defaults
            {'lr': 1e-5, 'weight_decay': 0, 'use_cosine': False}.
        eval_callback_fn: optional callable invoked as `fn(model, epoch=iter_idx)`
            after every optimizer step. It returns a dict of metrics; an empty
            dict means "no evaluation this step".

    Returns:
        `(train_losses, test_losses)` when `eval_callback_fn` is given, otherwise
        just `train_losses`. `train_losses` maps task name -> list of per-iteration
        losses; `test_losses` holds the callback's return value for every
        iteration (including empty dicts).
    """
    # Merge caller overrides over the defaults so partial dicts are accepted and the
    # defaults are never shared mutable objects.
    lora_kwargs = {'rank': 256, 'alpha': 32, 'dropout': 0.05, 'target_modules': 'all-linear', **(lora_kwargs or {})}
    learning_kwargs = {'lr': 1e-5, 'weight_decay': 0, 'use_cosine': False, **(learning_kwargs or {})}

    # can either finetune full or lora
    if finetune_lora:
        peft_config = LoraConfig(
            inference_mode=False,
            r=lora_kwargs['rank'],
            lora_alpha=lora_kwargs['alpha'],
            lora_dropout=lora_kwargs['dropout'],
            target_modules=lora_kwargs['target_modules'],
        )

        model = get_peft_model(model, peft_config).cuda()
        print(f"Parameters in peft: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_kwargs['lr'], weight_decay=learning_kwargs['weight_decay'])
    else:
        optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_kwargs['lr'], weight_decay=learning_kwargs['weight_decay'])

    scheduler = None
    if learning_kwargs['use_cosine']:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=n_iters)

    train_losses = defaultdict(list)
    test_losses = []

    for iter_idx in tqdm(range(n_iters)):
        log_dict = {}
        optimizer.zero_grad()
        for task_name, (task, task_weight) in train_tasks.items():
            task_loss = 0
            for _ in range(grad_accum_steps):
                # Dividing by the accumulation count makes the summed micro-batch
                # gradients equal to the gradient of the mean loss.
                loss = task.get_train_loss(model) / grad_accum_steps
                task_loss += loss.item()
                (loss * task_weight).backward()
            train_losses[task_name].append(task_loss)
            log_dict[f"train_loss/{task_name}"] = task_loss

        optimizer.step()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()
            log_dict["learning_rate"] = scheduler.get_last_lr()[0]

        torch.cuda.empty_cache()

        if eval_callback_fn is not None:
            eval_metrics = eval_callback_fn(model, epoch=iter_idx)
            # Every return value is recorded so indices line up with iterations.
            test_losses.append(eval_metrics)
            # Only log and print when the callback actually evaluated this step.
            if eval_metrics:
                for metric_name, value in eval_metrics.items():
                    log_dict[f"eval/{metric_name}"] = value
                print(eval_metrics)

        # Log metrics to wandb
        wandb.log(log_dict, step=iter_idx+1)

    # The return shape depends on whether a callback was supplied, not on how many
    # iterations ran, so callers can always unpack two values when they passed one.
    if eval_callback_fn is not None:
        return train_losses, test_losses
    return train_losses


In [6]:
from tasks.general.DatasetTasks import PileTask
from tasks.wmdp.WMDP_UnlearnTask import WMDP_UnlearnTask, WMDP_UnlearnMCTask

# --- Training tasks -------------------------------------------------------
train_batch_size = 2
train_bio_task = WMDP_UnlearnMCTask(batch_size=train_batch_size, tokenizer=tokenizer, subset="wmdp-bio", shuffle=False, split="first_two", train_test_split=True, criterion="cross_entropy", injection_task=False, model_type=model_type, filter_correct_prob_threshold=0.5)
train_pile_task = PileTask(batch_size=train_batch_size, tokenizer=tokenizer, stream_dataset=True, buffer_size=10000, ctx_length=100)
# Task name -> (task, weight); the weight scales that task's loss before backward
# in do_relearning, so the bio task contributes at one tenth of the Pile task.
train_tasks = {"bio": (train_bio_task, .1), "pile": (train_pile_task, 1)}
relearning_regular_results = {}
# Number of optimizer steps for the relearning run.
n_relearn_iters = 100
model.cuda()

# --- Evaluation task --------------------------------------------------------
# Same configuration as train_bio_task, but with a larger batch size for evaluation.
eval_batch_size = 8
eval_bio_task = WMDP_UnlearnMCTask(batch_size=eval_batch_size, tokenizer=tokenizer, subset="wmdp-bio", shuffle=False, split="first_two", train_test_split=True, criterion="cross_entropy", injection_task=False, model_type=model_type, filter_correct_prob_threshold=0.5)
# Number of full evaluation batches over the task's train split.
num_train_iters = len(eval_bio_task.train_dataset) // eval_batch_size
# NOTE(review): the test split is counted four times over — presumably one pass per
# answer position/permutation; confirm against WMDP_UnlearnMCTask.
num_test_iters = (len(eval_bio_task.test_dataset) * 4) // eval_batch_size

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]



Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

No test dataset available. Using train dataset for testing.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [7]:
# Snapshot of the CUDA allocator's usage before relearning starts (full, non-abbreviated report).
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  17627 MiB |  17627 MiB |  17627 MiB |      0 B   |
|       from large pool |  17626 MiB |  17626 MiB |  17626 MiB |      0 B   |
|       from small pool |      1 MiB |      1 MiB |      1 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |  17627 MiB |  17627 MiB |  17627 MiB |      0 B   |
|       from large pool |  17626 MiB |  17626 MiB |  17626 MiB |      0 B   |
|       from small pool |      1 MiB |      1 MiB |      1 MiB |      0 B   |
|---------------------------------------------------------------

In [8]:
from tasks.general.DatasetTasks import PileTask

print("num_train_iters: ", num_train_iters, "num_test_iters: ", num_test_iters)

# Evaluate once every `evaluate_every` optimizer steps; accumulate gradients over
# `grad_accum_steps` micro-batches per step.
evaluate_every = 4
grad_accum_steps = 16

def eval_callback(model, epoch, evaluate_every=evaluate_every):
    """Return MMLU and WMDP-bio accuracies on evaluation steps, and {} otherwise.

    A step is an evaluation step when `epoch + 1` is a multiple of `evaluate_every`,
    which makes `epoch=-1` (the pre-training call) an evaluation step as well.
    """
    is_eval_step = (epoch + 1) % evaluate_every == 0
    if not is_eval_step:
        return {}

    general_scores = run_general_evals(model, model_type=model_type, evals_to_include=["MMLU"], verbose=False, batch_size=2, device="cuda")
    metrics = {"MMLU": general_scores["MMLU"]}
    metrics["train_bio_acc"] = eval_bio_task.get_test_accuracy(model, use_test_data=False, n_iters=num_train_iters)
    metrics["test_bio_acc"] = eval_bio_task.get_test_accuracy(model, use_test_data=True, n_iters=num_test_iters)
    return metrics

# Both checkpoint kinds currently use the same learning rate; the conditional is
# kept so the LoRA and full-rank cases can be tuned separately.
lr = 5e-6 if is_lora else 5e-6
finetune_lora = False

# Hyperparameters recorded alongside the run.
run_config = {
    "model_name": model_name_or_path,
    "pretrained_path": pretrained_path,
    "lr": lr,
    "finetune_lora": finetune_lora,
    "n_iterations": n_relearn_iters,
    "grad_accum_steps": grad_accum_steps,
}
wandb.init(project="sae-relearning", config=run_config)

# Baseline before any optimizer step: per-task training loss (averaged over the
# same number of micro-batches that one training step uses) and the eval metrics.
initial_train_losses = {}
with torch.no_grad():
    for task_name, (task, task_weight) in train_tasks.items():
        micro_batch_losses = [
            (task.get_train_loss(model) / grad_accum_steps).item()
            for _ in range(grad_accum_steps)
        ]
        initial_train_losses[task_name] = sum(micro_batch_losses)
print("initial_train_losses: ", initial_train_losses)

# epoch=-1 makes the callback treat this as an evaluation step.
initial_test_loss = eval_callback(model, epoch=-1)
print("Initial test evaluations: ", initial_test_loss)

# Logged at step 0 so the training curves start from the untouched checkpoint.
init_log_dict = {f"train_loss/{task_name}": task_loss for task_name, task_loss in initial_train_losses.items()}
init_log_dict.update({f"eval/{metric_name}": value for metric_name, value in initial_test_loss.items()})

wandb.log(init_log_dict, step=0)

# Run relearning. wandb.finish() sits in a `finally` so the run is closed even when
# training is interrupted or raises part-way through.
try:
    train_losses, test_losses = do_relearning(model, train_tasks, n_iters=n_relearn_iters, grad_accum_steps=grad_accum_steps, finetune_lora=finetune_lora, learning_kwargs={'lr': lr, 'weight_decay': 0, 'use_cosine': True}, eval_callback_fn=eval_callback)

    # Prepend the pre-training evaluation so index 0 is the untouched checkpoint.
    test_losses.insert(0, initial_test_loss)
finally:
    wandb.finish()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


num_train_iters:  4 num_test_iters:  17


[34m[1mwandb[0m: Currently logged in as: [33mphilliphguo[0m ([33mquirky_lats_at_mats[0m). Use [1m`wandb login --relogin`[0m to force relogin


The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


initial_train_losses:  {'bio': 1.3984375, 'pile': 2.56494140625}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Initial test evaluations:  {'MMLU': 0.64, 'train_bio_acc': 0.272216796875, 'test_bio_acc': 0.3000919117647059}


  0%|          | 0/100 [00:00<?, ?it/s]

{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.66, 'train_bio_acc': 0.974609375, 'test_bio_acc': 0.8674172794117647}
{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.66, 'train_bio_acc': 0.9970703125, 'test_bio_acc': 0.8272058823529411}
{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.64, 'train_bio_acc': 0.9990234375, 'test_bio_acc': 0.8200827205882353}
{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.64, 'train_bio_acc': 1.0, 'test_bio_acc': 0.8134191176470589}
{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.64, 'train_bio_acc': 1.0, 'test_bio_acc': 0.8458180147058824}
{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.65, 'train_bio_acc': 1.0, 'test_bio_acc': 0.8340992647058824}
{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.64, 'train_bio_acc': 1.0, 'test_bio_acc': 0.8329503676470589}
{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.65, 'train_bio_acc': 1.0, 'test_bio_acc': 0.8327205882352942}
{}
{}
{}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'MMLU': 0.64, 'train_bio_acc': 1.0, 'test_bio_acc': 0.8609834558823529}


KeyboardInterrupt: 