In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig,
    Trainer
    ,TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
import os 
from datasets import load_dataset
import pandas as pd

# Re-export the Hugging Face Hub token only when one is actually configured.
# os.getenv() returns None for an unset variable, and assigning None into
# os.environ raises TypeError, so an absent token must simply be skipped.
# NOTE(review): HF_TOKEN is accepted as a fallback on the assumption that it is
# the variable name used by newer huggingface_hub releases — confirm.
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Hub identifiers for this run:
#   base_model   - checkpoint that gets loaded and fine-tuned
#   dataset_path - instruction dataset id
base_model, dataset_path = (
    'Qwen/Qwen2.5-3B',
    'VTSNLP/instruct_general_dataset',
)

In [None]:
# Load the causal-LM checkpoint and its matching tokenizer from `base_model`.
model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Batched padding needs a pad token. Fall back to EOS only when the tokenizer
# ships without one, so a dedicated pad token is never overwritten.
# NOTE(review): the language-modeling collator is expected to mask pad-token
# positions out of the loss; if pad == EOS, real EOS tokens would be masked
# too — confirm against the collator documentation.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
def prepare_dataset(
    tokenizer,
    max_length=2048,
    dataset_name="VTSNLP/instruct_general_dataset",
    train_size=10000,
    eval_size=1000,
    padding=False,
):
    """Load an instruction dataset and tokenize a train and an eval subset.

    Both subsets are cut from the dataset's ``train`` split: the first
    ``train_size`` rows are used for training and the following ``eval_size``
    rows for evaluation, so the two never overlap.

    Args:
        tokenizer: Callable tokenizer; invoked on a list of prompt strings with
            ``truncation``, ``max_length`` and ``padding`` keyword arguments.
        max_length: Maximum number of tokens kept per example (longer prompts
            are truncated).
        dataset_name: Identifier passed to ``load_dataset``. The dataset must
            expose ``instruct``, ``input`` and ``output`` columns.
        train_size: Number of leading rows used for training.
        eval_size: Number of rows, directly after the training rows, used for
            evaluation.
        padding: Padding strategy forwarded to the tokenizer. The default
            (``False``) stores unpadded rows and leaves padding to the batch
            collator; pass ``"max_length"`` to pad every row up front.

    Returns:
        Tuple ``(train_tokenized, eval_tokenized)`` containing only the
        tokenizer outputs (the original text columns are removed).
    """
    source_split = load_dataset(dataset_name)["train"]
    train_subset = source_split.select(range(train_size))
    eval_subset = source_split.select(range(train_size, train_size + eval_size))

    def preprocess_function(examples):
        """Render each record of the batch as one prompt string and tokenize it."""
        texts = [
            f"Instruction: {instruct}\nInput: {context}\nResponse: {output}"
            for instruct, context, output in zip(
                examples["instruct"], examples["input"], examples["output"]
            )
        ]
        # NOTE(review): no EOS token is appended to the prompt text — confirm
        # that this is intended for the fine-tuning objective.
        return tokenizer(
            texts,
            truncation=True,
            max_length=max_length,
            # Padding every row to max_length makes all rows the same length,
            # which wastes memory and gives length-based batching nothing to
            # group on; hence the unpadded default.
            padding=padding,
        )

    def tokenize_split(split):
        """Tokenize one subset in batches, dropping its original columns."""
        return split.map(
            preprocess_function,
            batched=True,
            remove_columns=split.column_names,
        )

    return tokenize_split(train_subset), tokenize_split(eval_subset)

In [None]:
# Build the tokenized train / eval subsets using prepare_dataset's defaults.
train_dataset, eval_dataset = prepare_dataset(tokenizer=tokenizer)

In [None]:
# The evaluation-schedule keyword is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick whichever
# field the installed TrainingArguments declares so this cell runs on both.
# NOTE(review): rename timeline taken from the transformers docs — confirm.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./qwen-ft-results",
    num_train_epochs=3,
    # One sequence per device per step; with 4 accumulation steps each
    # optimizer update sees 4 sequences per device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=10,
    # Checkpoint and evaluate on the same 100-step cadence.
    save_steps=100,
    eval_steps=100,
    bf16=True,
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    group_by_length=True,
    save_total_limit=3,  # keep only the 3 most recent checkpoints on disk
    optim="adamw_torch_fused",  # fused AdamW implementation
    lr_scheduler_type="cosine",  # cosine learning-rate schedule
    max_grad_norm=1.0,  # gradient clipping threshold
    **{eval_strategy_kwarg: "steps"},
)

# Initialize trainer. mlm=False selects the causal (next-token) objective in
# the collator rather than masked-language modeling.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Start training
trainer.train()