In [1]:
# Turn off the MPS high-watermark memory limit. The variable is set ahead of
# the torch import below so it is already in place when torch loads.
import os
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# With the environment prepared, bring in torch and pick a compute device,
# preferring MPS, then CUDA, then CPU.
import torch

if not torch.backends.mps.is_available():
    # No MPS backend: fall back to CUDA when present, otherwise the CPU.
    fallback_backend = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(fallback_backend)
    print("Using device:", device)
else:
    device = torch.device("mps")
    print("MPS device is available:", device)

MPS device is available: mps


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import torch

# Checkpoint to fine-tune. This is the instruct variant; use "Qwen/Qwen2.5-3B"
# instead for the base version.
model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights as float16 and move them onto the device selected earlier.
# NOTE(review): weights are float16 while TrainingArguments below enables bf16
# on MPS -- confirm this combination is intended.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
).to(device)

# Example synthetic dataset creation (replace with your own data loading as needed)
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy corpus: each tuple is (question, answer).
qa_pairs = [
    (
        "What is attention in neural networks?",
        "Attention is a mechanism that allows models to focus on relevant parts of the input.",
    ),
    (
        "How do transformer models work?",
        "Transformer models use self-attention and feed-forward layers to process sequences in parallel.",
    ),
    (
        "What are the advantages of self-attention?",
        "Self-attention helps capture long-range dependencies and improves parallelization.",
    ),
]

# Column-oriented view of the pairs, in the same order as listed above.
data = {
    "question": [question for question, _ in qa_pairs],
    "answer": [answer for _, answer in qa_pairs],
}

# Split with test_size=0.2 and a fixed seed so the partition is repeatable,
# then wrap each pandas split as a `datasets.Dataset`.
df = pd.DataFrame(data)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset, val_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, val_df)
)

# Define a tokenization function
def tokenize_function(example):
    """Render question/answer pairs as prompts and tokenize them.

    Works for both calling conventions of ``Dataset.map``:

    * ``batched=False`` -- ``example["question"]`` / ``example["answer"]`` are
      single strings and one prompt string is tokenized.
    * ``batched=True`` -- both fields are lists (one entry per row) and a list
      of prompts, one per row, is tokenized so the output keeps one entry per
      input row.

    Args:
        example: Mapping with ``"question"`` and ``"answer"`` keys holding
            either a string each or equally long lists of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the prompt(s),
        called with ``truncation=True`` and ``max_length=1024``.
    """
    questions = example["question"]
    answers = example["answer"]

    if isinstance(questions, str):
        # Single-row call: build exactly one prompt.
        text = f"Question: {questions}\nAnswer: {answers}\n"
    else:
        # Batched call: formatting the lists directly would embed their repr
        # in a single prompt, so build one prompt per (question, answer) row.
        text = [
            f"Question: {question}\nAnswer: {answer}\n"
            for question, answer in zip(questions, answers)
        ]

    return tokenizer(text, truncation=True, max_length=1024)

# Run the tokenizer over both splits in batched mode, training split first.
tokenize_kwargs = {"function": tokenize_function, "batched": True}
train_dataset = train_dataset.map(**tokenize_kwargs)
val_dataset = val_dataset.map(**tokenize_kwargs)

# Collator for language-model batches; mlm=False selects the non-masked mode.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training configuration. Per-device batch sizes are 1 to limit memory use;
# raise them (or lower gradient accumulation) if the hardware allows.
use_bf16 = torch.backends.mps.is_available()  # bf16 is switched on only when MPS is present

training_args = TrainingArguments(
    output_dir="model/qwen_finetuned",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    optim="adamw_torch",
    # Batching: 1 sample per step, accumulated over 4 steps
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    # Precision: fp16 mixed precision stays off; bf16 follows MPS availability
    fp16=False,
    bf16=use_bf16,
    # Evaluation, checkpointing and logging cadence (in optimizer steps)
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    # Reporting
    report_to="none",
    disable_tqdm=False,
)

print("TrainingArguments created successfully.")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:22<00:00, 11.18s/it]

KeyboardInterrupt



In [None]:
# Gather everything the Trainer needs: the model, its training arguments,
# the tokenized train/validation splits and the batch collator.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

print("Trainer created successfully. Ready to start training!")

# Launch the fine-tuning run; the call's return value is the cell output.
trainer.train()