In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import requests
from PIL import Image
import sys

# Fail fast when the Metal (MPS) backend is missing. Raising (instead of
# sys.exit) keeps the notebook kernel usable and matches the check used by the
# LoRA setup cell later in this notebook.
if not torch.backends.mps.is_available():
    raise EnvironmentError(
        "MPS backend is not available. Please check your PyTorch installation or hardware compatibility."
    )

# Use the MPS device
device = torch.device("mps")
print("Using MPS device for computation.")

MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
# Upper bound on generated tokens. Without an explicit limit, generate() falls
# back to a very short default and the "in detail" answer gets cut off.
MAX_NEW_TOKENS = 256

# Load the model in float32 (chosen for MPS compatibility) and move it to the device
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load the image from a URL. The timeout prevents the cell from hanging forever
# and raise_for_status() surfaces HTTP errors before PIL tries to parse an
# error page as an image.
url = "https://www.health.com/thmb/eD_xfu60sILf0i_O9gPn8pbpWkM=/750x0/filters:no_upscale():max_bytes(150000):strip_icc()/Health-Atopic-Dermatitis-48.flexural-eczema22-96dcdcccdb154bfaa44b70f52e6ecc1c.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw).convert("RGB")

# Single-turn chat message: one image placeholder followed by the instruction
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {
                "type": "text",
                "text": "You are given an image related to some skin problems. Explain the cause and cure with suitable medications in detail."
            }
        ]
    }
]
text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(
    text=[text_prompt],
    images=[image],
    padding=True,
    return_tensors="pt"
)

# Move inputs to the MPS device
inputs = inputs.to(device)

# Perform inference
outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

# generate() returns prompt + continuation; drop the prompt tokens so only the
# model's answer is decoded and printed.
prompt_length = inputs["input_ids"].shape[1]
generated_tokens = outputs[:, prompt_length:]

# Decode and display the result
result = processor.decode(generated_tokens[0], skip_special_tokens=True)
print(result)


In [None]:
from datasets import load_dataset

In [None]:
# Fetch the ROCO radiology dataset from the Hugging Face Hub; no cache
# directory is given here, so the library's default location is used.
ds = load_dataset(
    path="mdwiratathya/ROCO-radiology",
)

In [None]:
import os
from pathlib import Path

from datasets import load_dataset

# Base working directory for caches and exported splits. The historical
# location is kept as the default; set VLM_NOTEBOOK_DIR to run this notebook
# on another machine without editing the cell.
base_dir = Path(os.environ.get("VLM_NOTEBOOK_DIR", "/Users/sambhavjha/Downloads/vlm_notebook"))

# Specify cache directory
cache_dir = str(base_dir / "huggingface_cache")
dataset_path = str(base_dir / "datasets" / "ROCO-radiology")

# Load the dataset
ds = load_dataset("mdwiratathya/ROCO-radiology", cache_dir=cache_dir)

# Save each split to disk under <dataset_path>/<split name>
for split_name in ("train", "validation", "test"):
    ds[split_name].save_to_disk(f"{dataset_path}/{split_name}")

print(f"Dataset saved successfully at {dataset_path}.")


In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import LoraConfig, get_peft_model

# Check if MPS is available
if not torch.backends.mps.is_available():
    raise EnvironmentError("MPS backend is not available. Ensure you're using a macOS system with M1/M2 GPU and PyTorch is installed correctly.")

# Set the device to MPS
device = torch.device("mps")
print(f"Using device: {device}")

# Load model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float32,  # Use float32 for MPS compatibility
    device_map=None  # Avoid auto device mapping
)
model = model.to(device)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Configure LoRA.
# Entries of a target_modules *list* are matched against the end of each
# module's dotted name, so these short names select the attention projections
# in the vision tower and in the language decoder.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "attn.qkv", "attn.proj",  # Vision attention layers
        "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"  # Decoder layers
    ],
    lora_dropout=0.1,
    bias="none",
    # Qwen2-VL generates text with a decoder-only language model, so the PEFT
    # wrapper must be the causal-LM one. The seq2seq wrapper forwards
    # decoder_* arguments that this model's forward() does not take.
    task_type="CAUSAL_LM"
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

# Make sure the freshly created adapter weights live on the MPS device too
model = model.to(device)

# Summarise how many parameters are actually trainable after wrapping
model.print_trainable_parameters()

# Check applied LoRA modules
print("Applied LoRA modules:")
print(model)


In [None]:
from torchvision.transforms import ToTensor, Resize, Compose
from datasets import load_dataset
from PIL import Image
from transformers import AutoProcessor
import torch
import gc

import os
from pathlib import Path

# === DEVICE SETUP ===
# Prefer Apple's MPS backend and fall back to the CPU when it is unavailable.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# === LOAD DATASET ===
# The historical location stays the default; set VLM_NOTEBOOK_DIR to point the
# notebook at a different working directory without editing this cell.
base_dir = Path(os.environ.get("VLM_NOTEBOOK_DIR", "/Users/sambhavjha/Downloads/vlm_notebook"))
cache_dir = str(base_dir / "huggingface_cache")
print("Loading dataset...")
# streaming=True yields examples lazily instead of materialising whole splits
ds = load_dataset("mdwiratathya/ROCO-radiology", cache_dir=cache_dir, streaming=True)

# === INITIALIZE PROCESSOR ===
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# === DEFINE PREPROCESSING FUNCTION ===
def preprocess_function(example):
    """
    Preprocess a single example for incremental processing.

    Args:
        example: Mapping with an "image" entry (a PIL image) and a "caption"
            entry (the text paired with that image).

    Returns:
        dict with "input_ids", "attention_mask", "pixel_values" and
        "image_grid_thw" tensors. The tensors are left on the CPU; the
        consumer is expected to move each batch to the compute device, so the
        whole preprocessed dataset does not sit in accelerator memory.
    """
    instruction = "Describe this radiology image."

    # Convert grayscale image to RGB if needed
    image = example["image"]
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Resize while the image is still a PIL image. It is deliberately NOT
    # converted with ToTensor(): the processor does its own rescaling and
    # normalisation, and feeding it values already scaled to [0, 1] would
    # rescale them a second time.
    image = Resize((224, 224))(image)

    # Build the text through the chat template so it contains the image
    # placeholder tokens the processor expands for the picture. A bare caption
    # has no placeholder, so image features and text tokens cannot line up.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": instruction},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": example["caption"]}],
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

    # No truncation here: cutting the sequence could drop image placeholder
    # tokens and break the match with pixel_values.
    inputs = processor(
        images=[image],
        text=[text],
        return_tensors="pt",
        padding=True
    )

    return {
        "input_ids": inputs["input_ids"].squeeze(0),
        "attention_mask": inputs["attention_mask"].squeeze(0),
        # NOTE(review): for Qwen2-VL these are flattened image patches rather
        # than a (C, H, W) image, so no batch axis is squeezed - confirm.
        "pixel_values": inputs["pixel_values"],
        # Patch-grid description the model needs alongside pixel_values.
        "image_grid_thw": inputs["image_grid_thw"],
    }

# === PROCESS DATASETS ===
def process_stream(dataset_stream, batch_size=16, preprocess=None):
    """
    Incrementally process the dataset in batches to handle the full dataset.

    Args:
        dataset_stream: Any iterable of raw examples.
        batch_size: Number of examples preprocessed per yielded batch; must
            be at least 1.
        preprocess: Callable applied to every example. Defaults to the
            notebook's ``preprocess_function``.

    Yields:
        Lists of preprocessed examples. Every list has ``batch_size`` items
        except possibly the last one, which holds the remainder.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if preprocess is None:
        # Looked up at call time so the function can be defined before, or
        # used without, the notebook's own preprocessing function.
        preprocess = preprocess_function

    # torch.mps.empty_cache() is only valid when the MPS backend exists; this
    # notebook falls back to the CPU otherwise, so the call must be guarded.
    mps_available = torch.backends.mps.is_available()

    batch = []
    for i, example in enumerate(dataset_stream):
        batch.append(example)

        # Process in batches
        if len(batch) == batch_size:
            yield [preprocess(ex) for ex in batch]
            batch = []  # Clear the batch

            # Periodically free up memory
            gc.collect()
            if mps_available:
                torch.mps.empty_cache()

            # Log progress
            print(f"Processed {i + 1} samples...")

    # Process any remaining examples
    if batch:
        yield [preprocess(ex) for ex in batch]

# === PROCESS FULL DATASETS ===
from itertools import islice

# Optional caps on how many examples are preprocessed per split. None keeps
# the original behaviour (consume the whole split); set an integer for a quick
# run or when the fully preprocessed split does not fit in memory.
MAX_TRAIN_SAMPLES = None
MAX_VAL_SAMPLES = None


def collect_processed(split_stream, max_samples=None, batch_size=16):
    """Preprocess up to ``max_samples`` examples of a split and return them as one flat list."""
    # islice(..., None) places no limit on the stream.
    limited_stream = islice(split_stream, max_samples)
    return [
        processed_example
        for processed_batch in process_stream(limited_stream, batch_size=batch_size)
        for processed_example in processed_batch
    ]


print("Processing train dataset...")
train_processed = collect_processed(ds["train"], MAX_TRAIN_SAMPLES, batch_size=16)

print("Processing validation dataset...")
val_processed = collect_processed(ds["validation"], MAX_VAL_SAMPLES, batch_size=16)

print("Processing complete!")


In [None]:
import torch
from functools import partial
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_scheduler
from tqdm import tqdm

# Training hyper-parameters (single source of truth for the loop below)
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
TRAIN_BATCH_SIZE = 1  # kept small; raise it if memory allows


def collate_examples(examples, pad_token_id):
    """Merge a list of preprocessed examples into one padded batch.

    Token sequences are right-padded to the longest one in the batch. Image
    tensors are concatenated along the first axis.
    NOTE(review): this assumes the model accepts the images of a batch
    concatenated along axis 0 - confirm against the Qwen2-VL docs.
    """
    batch = {
        "input_ids": pad_sequence(
            [ex["input_ids"] for ex in examples], batch_first=True, padding_value=pad_token_id
        ),
        "attention_mask": pad_sequence(
            [ex["attention_mask"] for ex in examples], batch_first=True, padding_value=0
        ),
        "pixel_values": torch.cat([ex["pixel_values"] for ex in examples], dim=0),
    }
    # Forward the patch-grid description when the preprocessing step kept it.
    if all("image_grid_thw" in ex for ex in examples):
        batch["image_grid_thw"] = torch.cat(
            [ex["image_grid_thw"].reshape(-1, 3) for ex in examples], dim=0
        )
    return batch


# The dataloader was referenced but never created anywhere in this notebook;
# build it from the examples produced by the preprocessing cell.
train_dataloader = DataLoader(
    train_processed,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    collate_fn=partial(collate_examples, pad_token_id=processor.tokenizer.pad_token_id),
)

# Pick the device the same way the rest of the notebook does (MPS first), while
# still supporting CUDA machines. The previous cuda/cpu-only choice silently
# moved the model to the CPU on Apple Silicon.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and learning rate scheduler. Only parameters that require
# gradients are optimised, and the optimizer is built after the model has been
# moved to its final device.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=LEARNING_RATE)
num_training_steps = len(train_dataloader) * NUM_EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
model.train()
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch + 1}")
    loop = tqdm(train_dataloader, leave=True)
    loop.set_description(f"Epoch {epoch + 1}")
    for batch in loop:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Teacher forcing: the labels are the inputs themselves, but padded
        # positions are set to -100 so they do not contribute to the loss.
        labels = batch["input_ids"].clone()
        labels[batch["attention_mask"] == 0] = -100

        # Forward pass (passes image_grid_thw too when the batch carries it)
        outputs = model(**batch, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Update progress bar
        loop.set_postfix(loss=loss.item())


In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

# Check if MPS is available for Mac M1/M2
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load model and processor.
# Note: this rebinds `model` to a freshly loaded base model, so adapters
# created or trained in earlier cells are no longer reachable through `model`.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float32,  # Use float32 for MPS compatibility
    device_map=None
)
model = model.to(device)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Configure LoRA.
# Entries of a target_modules *list* are compared with the end of each module's
# dotted name; "*" is not treated as a wildcard there, so glob-style entries
# such as "model.layers.*.self_attn.q_proj" match nothing. Plain name suffixes
# select the same layers in every block.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "attn.qkv", "attn.proj",  # Vision attention layers
        "self_attn.q_proj", "self_attn.k_proj",
        "self_attn.v_proj", "self_attn.o_proj",  # Decoder self-attention layers
        "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj"  # Decoder MLP layers
    ],
    lora_dropout=0.1,
    bias="none",
    # Decoder-only generation model: use the causal-LM PEFT wrapper, not the
    # seq2seq one (which forwards decoder_* arguments this model does not take).
    task_type="CAUSAL_LM"
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)
model = model.to(device)

# Summarise how many parameters are actually trainable after wrapping
model.print_trainable_parameters()

print("Applied LoRA modules:")
print(model)