In [1]:
!nvidia-smi

Tue Nov  4 13:38:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%%capture
# Install Unsloth - optimized for faster LLM training
!pip install unsloth

# Install required dependencies
!pip install --no-deps trl peft accelerate bitsandbytes

In [3]:
import torch
print(f"‚úÖ PyTorch version: {torch.__version__}")
print(f"‚úÖ CUDA available: {torch.cuda.is_available()}")
print(f"‚úÖ CUDA version: {torch.version.cuda}")

# Try importing unsloth
try:
    from unsloth import FastLanguageModel
    print("‚úÖ Unsloth imported successfully!")
except Exception as e:
    print(f"‚ùå Error: {e}")

‚úÖ PyTorch version: 2.8.0+cu126
‚úÖ CUDA available: True
‚úÖ CUDA version: 12.6
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
‚úÖ Unsloth imported successfully!


In [23]:
import os

# Check if file exists
if os.path.exists('hider_raw.jsonl'):
    file_size = os.path.getsize('hider_raw.jsonl') / (1024 * 1024)  # Convert to MB
    print(f"‚úÖ File uploaded successfully!")
    print(f"   File size: {file_size:.2f} MB")

    # Count number of examples
    import json
    with open('hider_raw.jsonl', 'r') as f:
        num_examples = sum(1 for line in f)
    print(f"   Number of training examples: {num_examples}")
else:
    print("‚ùå File not found! Please upload hider_raw.jsonl")

‚úÖ File uploaded successfully!
   File size: 2.11 MB
   Number of training examples: 2000


In [24]:
import os
import json
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

DATA_FILE = 'hider_raw.jsonl'
MODEL_NAME = 'unsloth/gemma-3-270m-it'  # Unsloth's optimized Gemma 3 270M
OUTPUT_DIR = 'models/hider_sft'
MAX_SEQ_LENGTH = 2048
EPOCHS = 15
BATCH_SIZE = 4
LEARNING_RATE = 2e-4
LORA_R = 16
LORA_ALPHA = 16

# ============================================================================
# STEP 1: LOAD MODEL AND TOKENIZER
# ============================================================================

print("="*70)
print("ü¶• LOADING GEMMA 3 270M WITH UNSLOTH")
print("="*70)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # Auto-detect best dtype
    load_in_4bit=False,  # 4-bit quantization for memory efficiency
)

print("‚úÖ Model loaded successfully!")
print(f"   Model: {MODEL_NAME}")
print(f"   Max sequence length: {MAX_SEQ_LENGTH}")
print(f"   Using 4-bit quantization")

# ============================================================================
# STEP 2: APPLY LORA
# ============================================================================

print("\n" + "="*70)
print("üéØ APPLYING LORA")
print("="*70)

model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj"],
    lora_alpha=LORA_ALPHA,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized checkpointing
    random_state=42,
    use_rslora=False,
)

print("‚úÖ LoRA applied successfully!")
print(f"   LoRA rank: {LORA_R}")
print(f"   LoRA alpha: {LORA_ALPHA}")
print(f"   Target modules: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj")

# ============================================================================
# STEP 3: LOAD AND PREPARE DATASET
# ============================================================================

print("\n" + "="*70)
print("üìä LOADING DATASET")
print("="*70)

# Load JSONL file
dataset = load_dataset('json', data_files=DATA_FILE, split='train')
print(f"‚úÖ Loaded {len(dataset)} examples")

# Format dataset using chat template
def format_example(example):
    messages = example['messages']

    text = f"{messages[0]['content']}\n\n"  # System
    text += f"{messages[1]['content']}\n\n"  # User
    text += f"{messages[2]['content']}"  # Assistant

    # Add EOS token so model learns to stop generating
    text += tokenizer.eos_token

    return {"text": text}

print("   Formatting examples with chat template...")
dataset = dataset.map(format_example, remove_columns=dataset.column_names)

# Split into train/eval (90/10)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

print(f"‚úÖ Dataset prepared:")
print(f"   Training examples: {len(train_dataset)}")
print(f"   Evaluation examples: {len(eval_dataset)}")


ü¶• LOADING GEMMA 3 270M WITH UNSLOTH
==((====))==  Unsloth 2025.11.1: Fast Gemma3 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.
‚úÖ Model loaded successfully!
   Model: unsloth/gemma-3-270m-it
   Max sequence length: 2048
   Using 4-bit quantization

üéØ APPLYING LORA
Unsloth: Making `model.base_model.model.model` require gradients
‚úÖ LoRA applied successfully!
   LoRA rank: 16
   LoRA alpha: 16
   Target module

Generating train split: 0 examples [00:00, ? examples/s]

‚úÖ Loaded 2000 examples
   Formatting examples with chat template...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

‚úÖ Dataset prepared:
   Training examples: 1800
   Evaluation examples: 200


In [25]:
# ============================================================================
# STEP 4: CONFIGURE TRAINING
# ============================================================================

print("\n" + "="*70)
print("‚öôÔ∏è CONFIGURING TRAINING")
print("="*70)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=8,  # Effective batch = 8 * 2 = 16
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    optim="adamw_8bit",  # Unsloth optimized optimizer
    report_to="none",
    push_to_hub=False,
    save_total_limit=3,
)

print(f"‚úÖ Training configuration:")
print(f"   Epochs: {EPOCHS}")
print(f"   Batch size: {BATCH_SIZE}")
print(f"   Gradient accumulation: 8 (effective batch size: 16)")
print(f"   Learning rate: {LEARNING_RATE}")
print(f"   Optimizer: adamw_8bit")
print(f"   Scheduler: cosine")


‚öôÔ∏è CONFIGURING TRAINING
‚úÖ Training configuration:
   Epochs: 15
   Batch size: 4
   Gradient accumulation: 8 (effective batch size: 16)
   Learning rate: 0.0002
   Optimizer: adamw_8bit
   Scheduler: cosine


In [26]:
# ============================================================================
# STEP 5: CREATE TRAINER
# ============================================================================

print("\n" + "="*70)
print("üöÄ INITIALIZING TRAINER")
print("="*70)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    packing=False,
)

print("‚úÖ Trainer initialized!")


üöÄ INITIALIZING TRAINER
Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1800 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/200 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


‚úÖ Trainer initialized!


In [27]:
# ============================================================================
# STEP 6: TRAIN THE MODEL
# ============================================================================

print("\n" + "="*70)
print("üèãÔ∏è STARTING TRAINING")
print("="*70)
print("This will take approximately 15-30 minutes on T4 GPU")
print("You can monitor progress below...")
print("="*70 + "\n")

# Start training
trainer.train()

print("\n‚úÖ TRAINING COMPLETE!")


üèãÔ∏è STARTING TRAINING
This will take approximately 15-30 minutes on T4 GPU
You can monitor progress below...



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,800 | Num Epochs = 15 | Total steps = 855
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 3,796,992 of 271,895,168 (1.40% trained)


Step,Training Loss,Validation Loss
100,1.1381,1.099737
200,0.9131,0.926966
300,0.9875,0.935253
400,0.8794,0.938372
500,0.8332,0.802215
600,0.8911,0.802767
700,0.8048,0.776491
800,0.8036,0.778823



‚úÖ TRAINING COMPLETE!


In [1]:
# ============================================================================
# STEP 7: SAVE MODEL
# ============================================================================

print("\n" + "="*70)
print("üíæ SAVING MODEL")
print("="*70)

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save LoRA adapters
final_model_path = os.path.join(OUTPUT_DIR, "final_model")
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"‚úÖ LoRA adapters saved to: {final_model_path}")

# Save merged 16-bit model
merged_model_path = os.path.join(OUTPUT_DIR, "final_model_merged_16bit")
model.save_pretrained_merged(merged_model_path, tokenizer, save_method="merged_16bit")
print(f"‚úÖ Merged 16-bit model saved to: {merged_model_path}")


üíæ SAVING MODEL


NameError: name 'os' is not defined

In [21]:
# # # Step 1 ‚Äî Zip the folder
!zip -r hider_sft.zip /content/models/hider_sft/

# Step 2 ‚Äî Download the ZIP
from google.colab import files
files.download("hider_sft.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>