In [None]:
!pip install -q transformers peft bitsandbytes torch pytorch-cuda=12.1 opencv-python pillow

import torch
import numpy as np
from pathlib import Path
import json

# Report the runtime: torch version, CUDA visibility, and the first GPU's name.
# The device name is only queried when CUDA is available; otherwise 'None' is shown.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'

print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"GPU: {gpu_name}")

In [None]:
import os
import json

# Find your data
data_path = Path("/kaggle/input")
print("Available input files:")
# Sorted so the preview is stable; only the first five videos are listed.
for item in sorted(data_path.rglob("*.mp4"))[:5]:
    print(f"  {item}")

# Load index
# Pick the first index.json in sorted order so the choice is reproducible when
# several inputs are attached; None means no index file exists at all.
index_file = next(iter(sorted(data_path.rglob("index.json"))), None)

if index_file is None:
    # Say so explicitly instead of silently skipping the summary.
    print(f"\nNo index.json found under {data_path}")
else:
    with open(index_file, encoding="utf-8") as index_fh:
        data = json.load(index_fh)
    # Tolerate an index without 'total_samples' or with an empty 'samples' list
    # rather than crashing on a KeyError/IndexError in a purely informational cell.
    samples = data.get("samples") or []
    print(f"\nTraining samples: {data.get('total_samples', len(samples))}")
    print(f"Sample: {samples[0] if samples else None}")

In [None]:
# NOTE: the model class is spelled `Qwen2_5_VLForConditionalGeneration`
# (underscore before "VL"); the previous spelling raised ImportError.
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

# Hub ID of the base checkpoint, shared by the model and the processor so the
# two can never drift apart. Qwen2.5-VL has no 2B release; 3B is the smallest
# published size, so that is what gets loaded here.
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

# 4-bit quantization config (CRITICAL for T4 VRAM)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"Loading {MODEL_ID}...")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

processor = AutoProcessor.from_pretrained(MODEL_ID)
print(f"âœ… Model loaded. Size: {model.get_memory_footprint() / 1e9:.2f}GB")

In [None]:
# Configure LoRA for fast fine-tuning
from peft import prepare_model_for_kbit_training

# The base model was loaded in 4-bit, so run PEFT's preparation step for k-bit
# training before attaching adapters (it freezes the base weights and sets the
# model up for stable, memory-efficient training on quantized layers).
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,  # LoRA rank (low rank = less params to tune)
    lora_alpha=16,
    # Attention and MLP projection layers receive adapters.
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# NOTE: this rebinds `model`; re-running the cell without reloading the base
# model would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)
print("âœ… LoRA adapter applied")
model.print_trainable_parameters()

In [None]:
class WarehouseDataset(torch.utils.data.Dataset):
    """Synthetic, text-only "which operation comes next" dataset.

    Each item is a prompt naming one of four packaging operations followed by
    a response naming the next operation in the cycle. Only the processor's
    tokenizer is used; no video or image input is produced.

    Args:
        processor: Object exposing a callable ``tokenizer`` attribute.
        num_samples: Number of items reported by ``len()``; operations repeat
            cyclically across indices.
        max_length: Fixed token length every item is padded/truncated to.
    """

    def __init__(self, processor, num_samples=20, max_length=512):
        self.processor = processor
        self.num_samples = num_samples
        self.max_length = max_length
        self.operations = ["Box Setup", "Inner Packing", "Tape", "Put Items"]

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        All three are 1-D tensors of length ``max_length``. ``labels`` equals
        ``input_ids`` on real tokens and -100 on padding.
        """
        op_idx = idx % len(self.operations)
        next_op = self.operations[(op_idx + 1) % len(self.operations)]
        instruction = (
            "Analyze this warehouse packaging video. "
            f"The main operation is {self.operations[op_idx]}. "
            f"What operation happens next? Choose from: {', '.join(self.operations)}"
        )
        response = f"The next operation is: {next_op}"

        # Combine instruction and response for training
        text = f"{instruction}\n{response}"

        # Tokenize
        tokenized = self.processor.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop only the leading batch dimension added by return_tensors="pt".
        input_ids = tokenized["input_ids"].squeeze(0)
        attention_mask = tokenized["attention_mask"].squeeze(0)

        # Padding must not contribute to the loss: -100 is the ignore index of
        # the cross-entropy loss, so masked positions are skipped. Without this
        # most of the training signal would be "predict the pad token".
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Build the 20-item dataset, then inspect its first element as a quick sanity
# check of the returned keys and the tensor length.
dataset = WarehouseDataset(processor, num_samples=20)
print(f"âœ… Dataset created: {len(dataset)} samples")

sample = dataset[0]
print(
    f"Sample keys: {sample.keys()}",
    f"Input shape: {sample['input_ids'].shape}",
    sep="\n",
)

In [None]:
from transformers import TrainingArguments, Trainer, DefaultDataCollator

# Schedule note: 20 samples at batch size 2 give 10 micro-batches per epoch;
# with 16 accumulation steps that is roughly one optimizer step per epoch, so
# only a handful of steps in total. Step-based thresholds of 100 (warmup),
# 10 (logging) and 100 (saving) would therefore never be reached: the learning
# rate would stay near zero and nothing would be logged or checkpointed.
# Ratio/epoch-based settings scale with whatever dataset size is used.
training_args = TrainingArguments(
    output_dir="/kaggle/working/checkpoint",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    num_train_epochs=3,
    warmup_ratio=0.1,  # fraction of total steps, instead of a fixed 100 steps
    logging_steps=1,  # log every optimizer step; there are very few of them
    save_strategy="epoch",  # checkpoint once per epoch instead of every 100 steps
    save_total_limit=2,
    fp16=True,
    optim="paged_adamw_32bit",
    report_to=[],  # Disable wandb
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # Items already share one fixed length, so plain stacking is sufficient.
    data_collator=DefaultDataCollator(),
)

print("ðŸš€ Starting fine-tuning (8-10 hours)...")
trainer.train()
print("âœ… Training complete!")

In [None]:
import tarfile

# Save model
# The model and the processor are written into the same directory so the
# checkpoint can be downloaded and reloaded as one unit.
checkpoint_dir = "/kaggle/working/qwen-lora-checkpoint"
for artifact in (model, processor):
    artifact.save_pretrained(checkpoint_dir)

print("âœ… Checkpoint saved to /kaggle/working/qwen-lora-checkpoint")
print(
    "\nDownload these files from the Output panel:",
    "  - qwen-lora-checkpoint/",
    sep="\n",
)