In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from proccess import formatted_ds, tokenize_row

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# print(tokenized_ds.column_names)   # or for DatasetDict: print({k: v.column_names for k,v in tokenized_ds.items()})

In [4]:
# tokenized_ds = tokenized_ds.remove_columns(["text", "input", "output", "instruction"])
# print(tokenized_ds[0])

In [5]:
# Hub checkpoint identifier; reused later in the notebook when the model weights are loaded.
model_name = "Featherless-Chat-Models/Mistral-7B-Instruct-v0.2"
# Only the tokenizer is loaded in this cell (the model itself is loaded in a later cell).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token


In [6]:
# Tokenize the formatted dataset with the tokenizer loaded above.
# With batched=True, `tokenize_row` receives a batch (a dict mapping each
# column name to a list of values), not a single example.
tokenized_ds = formatted_ds.map(lambda batch: tokenize_row(batch, tokenizer), batched=True)

# Drop the raw text columns in one call; only columns that are actually present
# are removed, so this stays safe if the upstream schema changes.
# TrainingArguments in this notebook sets remove_unused_columns=False, so the
# Trainer will not strip these columns on its own.
raw_text_columns = ["text", "input", "output", "instruction"]
columns_to_drop = [name for name in raw_text_columns if name in tokenized_ds.column_names]
if columns_to_drop:
    tokenized_ds = tokenized_ds.remove_columns(columns_to_drop)

# Fail fast with a clear message if tokenization did not yield model inputs,
# instead of surfacing an opaque collator error once training has started.
if "input_ids" not in tokenized_ds.column_names:
    raise ValueError(
        "Tokenization did not produce an 'input_ids' column; "
        f"available columns: {tokenized_ds.column_names}. "
        "Check that tokenize_row returns the tokenizer output and that "
        "formatted_ds is a single Dataset split (not a DatasetDict)."
    )

Map: 100%|██████████| 33955/33955 [00:01<00:00, 20594.93 examples/s]
Map: 100%|██████████| 33955/33955 [00:01<00:00, 20594.93 examples/s]


In [7]:
# Load the base model with 4-bit quantization on CPU and wrap it with LoRA adapters.
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit quantization for even less memory
    llm_int8_enable_fp32_cpu_offload=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",  # Load the model entirely on CPU to avoid OOM
    quantization_config=bnb_config,
)

# Keep the embedding matrix the same size as the tokenizer vocabulary.
model.resize_token_embeddings(len(tokenizer))

# The generation key/value cache is not used during training and conflicts
# with gradient checkpointing, so switch it off explicitly.
model.config.use_cache = False

# Prepare the quantized model for training before attaching adapters.
# PEFT's helper freezes the base weights, enables gradient checkpointing for
# memory savings, and makes the embedding outputs require gradients so that
# backpropagation can reach the LoRA weights through checkpointed layers.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # LoRA rank, adjust for memory
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # attention query/value projections
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Attach LoRA adapters to the quantized model and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Loading weights: 100%|██████████| 291/291 [03:19<00:00,  1.46it/s, Materializing param=model.norm.weight]                              



trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


In [8]:
# Training configuration for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./mistral_medical_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 1 example x 4 accumulation steps per optimizer update
    num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
    fp16=False,  # Disable fp16 since training is on CPU
    # The model is loaded with device_map="cpu"; tell the Trainer to stay on
    # CPU as well so batches are not moved to a GPU the model is not on.
    use_cpu=True,
    # Keep every dataset column; the dataset must therefore contain only
    # fields the data collator and the model accept.
    remove_unused_columns=False,
)

In [None]:
# Collator: pads the examples of each batch using the tokenizer loaded earlier.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# Trainer: ties together the LoRA-wrapped model, the run configuration,
# the tokenized dataset (not the raw formatted_ds) and the collator.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=tokenized_ds,
    args=training_args,
    model=model,
)


In [10]:
import torch
# Ask PyTorch's CUDA caching allocator to release memory it holds but is not using.
# NOTE(review): the model in this notebook is loaded with device_map="cpu", so this
# probably has little effect here — confirm whether the call is still needed.
torch.cuda.empty_cache()


In [11]:
# Run the fine-tuning loop configured on `trainer`.
# NOTE(review): the recorded ValueError shows the collator receiving only the raw
# columns ('input', 'output', 'instruction', 'text'), and the Trainer cell has no
# execution count. This looks like a trainer built from an untokenized dataset in an
# earlier run — re-run the tokenization and Trainer cells before this one, and verify.
trainer.train()

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['input', 'output', 'instruction', 'text']