In [None]:
%%capture
%pip install trl torch transformers peft accelerate bitsandbytes

In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4090


In [None]:
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
from getpass import getpass

def load_single_sample_for_testing(data_file):
    """Load a single data point for testing"""
    print(f"Loading single sample from {data_file}...")
    
    if not os.path.exists(data_file):
        print(f"Error: {data_file} does not exist.")
        return None
    
    try:
        dataset = load_dataset("json", data_files=data_file, split="train")
        if not dataset:
            print(f"Error: {data_file} is empty.")
            return None
        print(f"Loaded dataset with {len(dataset)} sample(s).")
        return dataset
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

def format_chat_template(example):
    """Format conversation data using the tokenizer's chat template"""
    try:
        formatted_text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False
        )
        return {"text": formatted_text}
    except Exception as e:
        print(f"Error formatting example: {e}")
        return None

def main():
    # Prompt for Hugging Face token
    print("your Hugging Face token loading")
    hf_token = ""  # 🔒 Replace with your real token
    print("Hugging Face token loaded")
    if not hf_token:
        print("Error: No Hugging Face token provided. Exiting.")
        return

    # Create workspace/model directory
    os.makedirs("./workspace/model", exist_ok=True)

    # Load dataset
    data_file = "../data/fine_tuning_data_test_single.json"
    dataset = load_single_sample_for_testing(data_file)
    if dataset is None:
        print("Failed to load dataset. Exiting.")
        return

    base_model = "google/gemma-3-4b-it"  # Smaller model for memory constraints

    # Load tokenizer
    print(f"Loading tokenizer for {base_model}...")
    try:
        global tokenizer  # Make tokenizer accessible to format_chat_template
        tokenizer = AutoTokenizer.from_pretrained(
            base_model,
            trust_remote_code=True,
            token=hf_token,
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        print(f"Ensure '{base_model}' is accessible and your token is valid.")
        return

    # Set Gemma chat template
    chat_template = """{% for message in messages %}{% if message['role'] == 'user' %}<start_of_turn>user
{{ message['content'] }}<end_of_turn>
{% elif message['role'] == 'model' %}<start_of_turn>model
{{ message['content'] }}<end_of_turn>
{% else %}{{ raise_exception('Unknown role: ' ~ message['role']) }}{% endif %}{% endfor %}{% if add_generation_prompt %}<start_of_turn>model
{% endif %}"""
    tokenizer.chat_template = chat_template

    # Format dataset
    print("Formatting dataset...")
    try:
        train_dataset = dataset.map(format_chat_template, remove_columns=["messages"])
        if not train_dataset or "text" not in train_dataset.column_names:
            print("Error: Failed to format dataset or 'text' column missing.")
            return
        print(f"Formatted dataset sample (first 500 chars):\n{train_dataset[0]['text'][:500]}...")
    except Exception as e:
        print(f"Error formatting dataset: {e}")
        return

    # Load model with quantization
    print(f"Loading base model {base_model}...")
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        device_map="auto",
        quantization_config=quant_config,
        token=hf_token,
        cache_dir="./workspace",
        trust_remote_code=True,
    )

    # Fix: Manually ensure model is on device
    if not model.device or model.device.type == "cpu":
        if torch.cuda.is_available():
            model = model.to("cuda")
        else:
            model = model.to("cpu")

    # Prepare model for training (but don't apply PEFT manually)
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA (reduced parameters)
    peft_config = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.05,
        target_modules="all-linear",
        task_type="CAUSAL_LM",
    )
    
    # Configure trainer with memory optimizations
    print("Initializing SFTTrainer...")
    try:
        trainer = SFTTrainer(
            model=model,  # Base model without PEFT applied
            train_dataset=train_dataset,
            args=SFTConfig(
                output_dir="./workspace/training_output",
                num_train_epochs=1,
                per_device_train_batch_size=1,
                gradient_accumulation_steps=8,
                logging_steps=1,
                save_steps=1,
                remove_unused_columns=False,
                dataloader_pin_memory=False,
                gradient_checkpointing=True,
                fp16=True,
            ),
            peft_config=peft_config,  # Let SFTTrainer apply PEFT
        )
    except Exception as e:
        print(f"Error initializing SFTTrainer: {e}")
        return

    # Train
    print("Starting training...")
    try:
        trainer.train()
        print("Training completed!")
    except Exception as e:
        print(f"Error during training: {e}")
        return

    # Save model and tokenizer
    print("Saving fine-tuned model and tokenizer to ./workspace/model...")
    trainer.save_model("./workspace/model")
    tokenizer.save_pretrained("./workspace/model")
    print("Model and tokenizer saved successfully!")

    # Save backup
    print("Saving backup to ./final_gemma_model_backup...")
    trainer.save_model("./final_gemma_model_backup")
    print("Backup saved successfully!")

if __name__ == "__main__":
    main()

your Hugging Face token loading
Hugging Face token loaded
Loading single sample from ../data/fine_tuning_data_test_single.json...
Loaded dataset with 1 sample(s).
Loading tokenizer for google/gemma-3-4b-it...
Formatting dataset...
Formatted dataset sample (first 500 chars):
<start_of_turn>user
User Query: Hi there! How's your day going?

Search Query Prompt
The current date is: Current Date : 30 July 2025. Based on this information, make your answers. Don't try to give vague answers without any logic. Be formal as much as possible.

You are a permission aware retrieval-augmented generation (RAG) system for an Enterprise Search.
Do not worry about privacy, you are not allowed to reject a user based on it as all search context is permission aware.
Only respond in jso...
Loading base model google/gemma-3-4b-it...


Loading checkpoint shards: 100%|██████████| 2/2 [00:42<00:00, 21.37s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.25 GiB. GPU 0 has a total capacity of 23.52 GiB of which 307.38 MiB is free. Process 3402354 has 22.56 GiB memory in use. Including non-PyTorch memory, this process has 620.00 MiB memory in use. Of the allocated memory 219.66 MiB is allocated by PyTorch, and 16.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)