In [None]:
!pip3 install bitsandbytes peft trl datasets transformers --upgrade

In [None]:
import json
import os
import torch
import warnings
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer
from PIL import Image

In [16]:
warnings.filterwarnings("ignore")
os.environ["WANDB_DISABLED"] = "true"  # Replaced by report_to="none"

# Set matrix multiplication precision to avoid BFloat16
torch.set_float32_matmul_precision('medium')

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"  # base model
EPOCHS = 10
BATCH_SIZE = 1
LEARNING_RATE = 2e-5
MAX_SEQ_LEN = 512  # Increased to avoid truncation error

# Clear GPU memory
torch.cuda.empty_cache()

Using device: cuda


In [4]:
with open("/kaggle/input/test-kaggle-json/test_kaggle.json", "r") as f:
    train_raw = json.load(f)

with open("/kaggle/input/train-kaggle-json/train_kaggle.json", "r") as f:
    test_raw = json.load(f)
system_message = """You are a highly advanced Vision Language Model (VLM).
Analyze the facial expression in the given image and output the emotion.
Possible emotions: Natural, anger, fear, joy, sadness, surprise."""
#/kaggle/input/autistic-children-emotions-dr-fatma-m-talaat/Autistic Children Emotions - Dr. Fatma M. Talaat
def format_sample(sample):
    return [
        {"role": "system", "content": [{"type": "text", "text": system_message}]},
        sample["messages"][0],  # user: image + text
        sample["messages"][1],  # assistant: label
    ]

train_dataset = [format_sample(s) for s in train_raw]
test_dataset = [format_sample(s) for s in test_raw]
eval_dataset = train_dataset[:10]  # Small eval subs

In [None]:
# Load model and processor with 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16  # Use float16 to avoid BFloat16 mismatch
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    dtype=torch.float16,  # Ensure float16 for computations
    device_map="auto",  # Still use for multi-GPU if available, but quantization avoids meta tensors
    use_cache=False
)
model.config.torch_dtype = torch.float16  # Ensure float16 in config

processor = Qwen2VLProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "right"
print("Model dtype:", next(model.parameters()).dtype)

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Model dtype: torch.float16


In [None]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 2,523,136 || all params: 8,293,898,752 || trainable%: 0.0304


In [None]:
training_args = SFTConfig(
    output_dir="./output",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_checkpointing=True,  # Re-enabled with quantization (safer now)
    learning_rate=LEARNING_RATE,
    logging_steps=50,
    eval_steps=50,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=50,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    max_grad_norm=1,
    warmup_steps=0,
    dataset_kwargs={"skip_prepare_dataset": True},
    remove_unused_columns=False,
    optim="paged_adamw_32bit",
    report_to="none",  # Fix WANDB warning
    gradient_accumulation_steps=4  # Manage memory
)

def preprocess_image(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        image = image.resize((224, 224))  # Resize to reduce token count
        return image
    except Exception as e:
        print(f"Error preprocessing image {image_path}: {e}")
        return None

def collate_fn(examples):
    texts = [processor.apply_chat_template(example, tokenize=False) for example in examples]
    image_inputs = [preprocess_image(example[1]["content"][0]["image"]) for example in examples]
    image_inputs = [img for img in image_inputs if img is not None]  # Filter out failed images

    if not image_inputs: 
        raise ValueError("No valid images in batch")
    
    batch = processor(
        text=texts[:len(image_inputs)],
        images=image_inputs,
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=None
    )
    batch["labels"] = batch["input_ids"].clone()
    batch["labels"][batch["labels"] == processor.tokenizer.pad_token_id] = -100
    return batch

trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    peft_config=peft_config,
    processing_class=processor.tokenizer,
)

In [None]:
def generate_text(sample_data, model, processor, max_new_tokens=20):
    #print("debugging")
    try:
        user_content = sample_data[1]["content"]
        image_path = user_content[0]["image"]
        query_text = user_content[1]["text"]
        actual_answer = sample_data[2]["content"][0]["text"]
    except Exception as e:
        print("Error extracting sample_data:", e)
        return None, None

    # print("Image path:", image_path)
    # print("Query text:", query_text)
    # print("Actual answer:", actual_answer)

    # System message
    system_msg = (
        "You are a highly advanced Vision Language Model (VLM). "
        "Analyze the facial expression in the given image and output ONLY the emotion name. Do not add any extra text. "
        "Possible emotions: Natural, anger, fear, joy, sadness, surprise."
    )

    # Build prompt
    prompt_text = processor.apply_chat_template(
        [
            {"role": "system", "content": [{"type": "text", "text": system_msg}]},
            {"role": "user", "content": [{"type": "image", "image": image_path}, {"type": "text", "text": query_text}]}
        ],
        tokenize=False,
        add_generation_prompt=True
    )

    # Load image and prepare inputs
    try:
        image = Image.open(image_path)
    except Exception as e:
        print(f"Error loading image {image_path}:", e)
        return None, None

    inputs = processor(text=[prompt_text], images=[image], return_tensors="pt", padding=True)

    # Print model's datatype
    # model_dtype = next(model.parameters()).dtype
    # print("Model dtype:", model_dtype)

    # Cast floats to float16 (consistent with quantization)
    for k, v in inputs.items():
        if torch.is_floating_point(v):
            inputs[k] = v.to(dtype=torch.float16)
        #print(f"Input tensor '{k}' dtype after move:", inputs[k].dtype)

    # Generate output with greedy decoding
    try:
        model.eval()
        with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,  # Greedy decoding for clean output
                temperature=0.1,  # Low temperature for deterministic output
                pad_token_id=processor.tokenizer.eos_token_id
            )
            # Decode and extract only the assistant's response
            output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            # Extract after last "assistant" tag
            if "assistant" in output_text:
                output_text = output_text.split("assistant")[-1].strip()
            output_text = output_text.split()[0].strip() if output_text.split() else output_text
    except Exception as e:
        print("Error during generation:", e)
        output_text = None

    return output_text, actual_answer

In [19]:
trainer.train()

Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
50,8.8195,8.526506,3.128513,31739.0,0.359292
100,8.1164,7.798287,3.706689,63319.0,0.367461
150,7.2225,6.655527,4.689507,95060.0,0.373742


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TrainOutput(global_step=190, training_loss=7.691737365722656, metrics={'train_runtime': 1743.2957, 'train_samples_per_second': 0.43, 'train_steps_per_second': 0.109, 'total_flos': 5589902474618880.0, 'train_loss': 7.691737365722656, 'epoch': 10.0})

In [None]:
# import zipfile
# import os
# zip_path = "/kaggle/input/archive-zip"
# extract_dir = "/kaggle/input/dataset"
# os.makedirs(extract_dir, exist_ok=True)
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_dir)
# print("Files unzipped to:", extract_dir)