In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
from datasets import load_dataset
from src.script_arguments import ScriptArguments
from dataset import preprocess_dataset
import torch, wandb
import numpy as np
from dataset import preprocess_dataset, get_formatting_func, formatting_for_custom_loss
from src.script_arguments import ScriptArguments
from src.utils import normalize_question
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    AutoTokenizer,
    DataCollatorForLanguageModeling)



In [2]:
parser = HfArgumentParser(ScriptArguments)
args = parser.parse_dict({
    "dataset_name": "Atipico1/NQ-10k_preprocessed_with_o-u_case",
    "run_name": "value",
    "seq_length": 2048,
    "cbr": True,
    "cbr_original": 3,
    "cbr_unans": 0,
    "num_contexts": 5,
    "batch_size": 4,
    "unanswerable": False,
    "custom_loss":False,
})[0]

#args = parser.parse_args_into_dataclasses(args="--run_name NQ-cbr-unans-test --dataset_name Atipico1/NQ-10k_preprocessed_with_o-u_case")[0]

In [3]:
dataset = load_dataset(args.dataset_name, split="train")
dataset = preprocess_dataset(dataset, args)

Aggregating cases... (num_proc=8):   0%|          | 0/10000 [00:00<?, ? examples/s]

[39, 22, 27, 17, 23]
[]


Normalizing question... (num_proc=8):   0%|          | 0/10000 [00:00<?, ? examples/s]

Selecting contexts... (num_proc=8):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name, trust_remote_code=True
)
response_template = "### A:"
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.unk_token
#collator = DataCollatorForCompletionOnlyLM(tokenizer.encode(response_template, add_special_tokens = False)[2:], tokenizer=tokenizer, mlm=False)
#formatting_func = get_formatting_func(args)
formatting_func = formatting_for_custom_loss
#max_length = max(tokenizer(formatting_func(dataset[:]), return_length=True)["length"])
#print("Max length: ", max_length)


In [5]:
bnb_config = BitsAndBytesConfig(
        load_in_8bit=args.load_in_8bit, load_in_4bit=args.load_in_4bit
    )
model = AutoModelForCausalLM.from_pretrained(
    args.model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,
    attn_implementation="flash_attention_2")
training_args = TrainingArguments(
    output_dir=args.output_dir+args.run_name,
    per_device_train_batch_size=args.batch_size,
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    learning_rate=args.learning_rate,
    optim=args.optim,
    bf16=True,
    group_by_length=True,
    logging_steps=args.logging_steps,
    num_train_epochs=args.num_train_epochs,
    max_steps=args.max_steps,
    save_steps=args.save_steps,
    save_strategy=args.save_strategy,
    hub_strategy="checkpoint",
    save_total_limit=args.save_total_limit,
    push_to_hub=args.push_to_hub,
    hub_model_id=f"Atipico1/{args.run_name}",
    warmup_ratio=args.warmup_ratio,
    lr_scheduler_type=args.lr_scheduler_type,
    gradient_checkpointing=args.gradient_checkpointing)

lora_config = LoraConfig(
            r=args.peft_lora_r,
            lora_alpha=args.peft_lora_alpha,
            bias="none",
            task_type="CAUSAL_LM"
            )

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
class SquadDataCollator(DataCollatorForLanguageModeling):
    answer_start_token_id = 835  # "_```"
    case_answer_token_id = 4136
    def __call__(self, examples):
        batch = super().__call__(examples)
        for idx, label in enumerate(batch["labels"]):
            qa_answer_start = torch.where(label == 835)[0][-1]
            case_answer_token_ids = torch.where(label == 4136)[0]
            token_save = []
            label_copy = label.clone()
            for i in range(0, len(case_answer_token_ids), 2):
                case_start, case_end = case_answer_token_ids[i:i+2]
                token_save.append(label_copy[case_start+3:case_end])
            label[:qa_answer_start+3] = -100
            for i, tokens in zip(range(0, len(case_answer_token_ids), 2), token_save):
                case_start, case_end = case_answer_token_ids[i], case_answer_token_ids[i+1]
                label[case_start+3:case_end] = tokens
            batch["labels"][idx] = label
        return batch
collator = SquadDataCollator(tokenizer=tokenizer, mlm=False)

In [7]:
#collator = DataCollatorForCompletionOnlyLM(tokenizer.encode(response_template, add_special_tokens = False)[2:], tokenizer=tokenizer, mlm=False)
from dataset import old_formatting_for_cbr, old_formatting_for_original
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=lora_config,
        max_seq_length=args.seq_length,
        tokenizer=tokenizer,
        args=training_args,
        data_collator=collator,
        formatting_func=formatting_for_custom_loss,
        dataset_num_proc=8,
        dataset_batch_size=1000
    )

Map (num_proc=8):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
formatting_for_custom_loss(dataset[:1])

['[CASE]\nBackground:\nDoc 0: On 11 December 1941 , Adolf Hitler and Nazi Germany declared war against the United States , the same day that the United States declared war on Germany and Italy .\nQ: when did the us join world war 2 in europe\n####A: 11 December 1941\n####\n\nBackground:\nDoc 0: The United Kingdom European Communities membership referendum , also known as the Referendum on the European Community ( Common Market ) , the Common Market referendum and EEC membership referendum took place on 5 June 1975 in the United Kingdom to gauge support for the country \'s continued membership of the European Communities ( EC ) -- often known at the time as the `` European Community \'\' and the `` Common Market \'\' which it had entered on 1 January 1973 under the Conservative government of Edward Heath under the provisions of the Referendum Act 1975 . Labour \'s manifesto for the October 1974 general election had promised that the people would decide `` through the ballot box \'\' whe

In [9]:
dataloader = trainer.get_train_dataloader()

In [10]:
token_ids = []
for idx, i in enumerate(dataloader):
    if idx == 10:
        break
    labels = i["labels"][0]
    token_id = []
    for l in labels:
        if l != -100:
            token_id.append(l)
    token_ids.append(token_id)
for tok in token_ids:
    print(tokenizer.decode(tok).replace("\n", " "))

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Huilgol Narayana Rao  poet Muhammad Iqbal  Leo Tolstoy  the saint Samarth Ramdas</s>
October 7 , 2018  August 14 through August 19 , 2017  from August 14 through August 19 , 2017  August 14, 2017</s>
Massachusetts gubernatorial election, 2018  November 4, 2008  Charles Barkley  Justin Fairfax (Democrat)</s>
United States Central Command  ferdinand foch  egypt  Rob Lockhart</s>
Bad Blood  James Bond  Pinkerton  Bom Diggy Diggy</s>
186  72  72  304 points</s>
Johnny Depp  Victor Joseph Garber  brigit forsyth  Jonathan Breck</s>
Johnny Depp  brigit forsyth  Victor Joseph Garber  Jim Barone</s>
emperor nero  europa  thetis  Daenerys</s>
bobby fischer  Armenian grandmaster Levon Aronian  Boris Gelfand  Krishnan Sasikiran</s>


In [11]:
for batch in dataloader:
    #inputs = tokenizer(batch, return_tensors="pt")
    loss = trainer.compute_loss(model, batch)
    print(loss)
    break

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


OutOfMemoryError: CUDA out of memory. Tried to allocate 120.00 MiB. GPU 0 has a total capacty of 79.15 GiB of which 101.25 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 75.92 GiB is allocated by PyTorch, and 2.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [35]:
token_ids = []
for idx, i in enumerate(dataloader):
    if idx == 2:
        break
    labels = i["labels"][0]
    print(labels)
    token_id = []
    for l in labels:
        if l != -100:
            token_id.append(l)
    token_ids.append(token_id)
for tok in token_ids:
    print(tokenizer.decode(tok).replace("\n", " "))

tensor([ -100,  -100,  -100,  ...,  8292, 17370,     2], device='cuda:0')
tensor([-100, -100, -100,  ..., 1682,  852,    2], device='cuda:0')
the saint Samarth Ramdas</s>
glucose</s>


In [23]:
for batch in dataloader:
    #inputs = tokenizer(batch, return_tensors="pt")
    loss = trainer.compute_loss(model, batch)
    print(loss)
    break

tensor(3.2826, device='cuda:0', grad_fn=<NllLossBackward0>)


In [52]:
tokenizer.encode(response_template, add_special_tokens = False)[2:]

[29901]

In [7]:
from torch.utils.data import DataLoader
dataloader_params = {
    "batch_size": 16,
    "collate_fn": collator,
    "num_workers": 1
}
loader = DataLoader(dataset, **dataloader_params)

In [9]:
for batch in loader:
    print(batch["input_ids"].shape)
    print(batch["labels"].shape)
    break

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/seongilpark/miniconda3/envs/exp/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/seongilpark/miniconda3/envs/exp/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/home/seongilpark/miniconda3/envs/exp/lib/python3.10/site-packages/transformers/data/data_collator.py", line 45, in __call__
    return self.torch_call(features)
  File "/home/seongilpark/miniconda3/envs/exp/lib/python3.10/site-packages/trl/trainer/utils.py", line 105, in torch_call
    batch = super().torch_call(examples)
  File "/home/seongilpark/miniconda3/envs/exp/lib/python3.10/site-packages/transformers/data/data_collator.py", line 732, in torch_call
    batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
  File "/home/seongilpark/miniconda3/envs/exp/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3218, in pad
    raise ValueError(
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['question', 'answers', 'ctxs', 'original_case', 'unans_case', 'case']
