In [None]:
#!pip install -q transformers datasets accelerate peft torchvision
# !pip install -U bitsandbytes

In [2]:
from huggingface_hub import notebook_login
notebook_login()  # Opens a widget to input token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#!pip freeze > requirements.txt

In [None]:
#!python --version

In [1]:
# %% [markdown]
# ### 2. Setup Environment
# %%
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Hide GPUs



import torch
from transformers import (
    Blip2Processor,
    Blip2ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from PIL import Image
import os

In [None]:
# Check GPU configuration
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU name: {torch.cuda.get_device_name(0)}")
print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory/1024**3:.2f}GB")

In [2]:
# %% [markdown]
# ### 3. Load Model with Memory Optimizations
# %%

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = "float16" if torch.cuda.is_available() else "float32"

model_id = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_id)

# Load model with 8-bit quantization
base_model = Blip2ForConditionalGeneration.from_pretrained(
    model_id,
    # load_in_8bit=True,
    device_map="cpu",
    torch_dtype="float32",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Freeze base model layers
#for param in model.vision_model.parameters():
#    param.requires_grad = False
#for param in model.qformer.parameters():
#    param.requires_grad = False
#for param in model.language_projection.parameters():
#    param.requires_grad = False

for param in base_model.vision_model.parameters():
    param.requires_grad = False
    

# Add LoRA adapters
#lora_config = LoraConfig(
#    r=64,  # Reduced from 16 for memory savings
#    lora_alpha=32,
#    lora_dropout=0.05,
#    bias="all",
#    target_modules=["q_proj", "v_proj"]
#)

#lora_config = LoraConfig(
#    r=32,                   # Reduced rank to prevent overfitting
#    lora_alpha=64,          # Keep alpha=2*r (or try 16 if using r=8)
#    lora_dropout=0.1,       # Slightly higher dropout for regularization
#    bias="lora_only",       # Train biases only in LoRA layers (not all biases)
#    target_modules=[
#        "q_proj", "v_proj",
#        "k_proj",          # Add key projections for broader attention adaptation
#        "cross_attention"  # Target cross-attention layers if present in BLIP-2
#    ],
    # modules_to_save=["classifier"],  # Optional: Unfreeze final layer(s)
#)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "v_proj",
        "k_proj",          # Add key projections for broader attention adaptation
        "cross_attention"  # Target cross-attention layers if present in BLIP-2
    ],
    # modules_to_save=["lm_head"],
)


model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 3,932,160 || all params: 3,748,694,016 || trainable%: 0.1049


In [4]:
from torch.utils.data import Dataset, DataLoader
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], padding="max_length", return_tensors="pt")
        # remove batch dimension
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = item["text"]
        return encoding

def collate_fn(batch):
    # pad the input_ids and attention_mask
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            questions = []
            answers = []
            for example in batch:
                full_text = example["text"]
                # Split at the first "Answer: " occurrence
                split_text = full_text.split("Answer: ", 1)
                if len(split_text) == 2:
                    questions.append(split_text[0] + "Answer: ")  # Keep prompt in input
                    answers.append(split_text[1])
                else:
                    print(f"Warning: format is not fine")
                    questions.append(full_text)  # Fallback if format is missing
                    answers.append("")

            text_inputs = processor.tokenizer(
                questions,
                padding="max_length",
                return_tensors="pt",
                truncation=True,  # Important for long questions
                max_length=128  # Match your model's max context
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]

            # Process answers as labels
            with processor.tokenizer.as_target_tokenizer():
                label_inputs = processor.tokenizer(
                    full_text,
                    padding="max_length",
                    return_tensors="pt",
                    truncation=True,
                    max_length=128
                )
                labels = label_inputs["input_ids"]
                # Mask question parts and padding tokens
                # labels[labels == processor.tokenizer.pad_token_id] = -100
            processed_batch["labels"] = labels

    return processed_batch

In [5]:

from datasets import load_dataset
# from sklearn.model_selection import train_test_split

dataset = load_dataset("Baran657/SnapGarden_v0.6", split="train")

#limited_dataset = []
#for i, example in enumerate(dataset):
#    if i < 100:
#        limited_dataset.append(example)
#    else:
#        break

In [None]:
#first_example = next(iter(dataset))

# Extract only the text
#first_text = first_example['text']
#print(first_text)

In [6]:
split_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_data = split_dataset["train"]
eval_data = split_dataset["test"]

In [7]:
train_dataset = ImageCaptioningDataset(train_data, processor)
eval_dataset = ImageCaptioningDataset(eval_data, processor)

train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, collate_fn=collate_fn, batch_size=1)

In [8]:
batchidx, batch = next(enumerate(train_dataloader))
#print(batch)
print("Input Text:", processor.decode(batch["input_ids"][0], skip_special_tokens=True))
print("Label Text:", processor.decode(batch["labels"][0], skip_special_tokens=True))

Input Text: Question: What type of soil does it prefer? Answer: 
Label Text: Question: What type of soil does it prefer? Answer: fast-draining succulent/cactus mix




In [9]:
def token_unlikelihood(logits, labels, gamma=0.5):
    probs = torch.nn.functional.softmax(logits, dim=-1)
    preds = torch.argmax(probs, dim=-1)

    # Identify repeated tokens (current == previous)
    repeat_mask = (preds[:, 1:] == preds[:, :-1]).float()
    repeat_mask = torch.cat([torch.zeros_like(repeat_mask[:, :1]), repeat_mask], dim=1)

    # Penalize probabilities of repeated tokens
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    ul_loss = -gamma * (repeat_mask * log_probs).mean()

    return ul_loss

def compute_loss_with_rep_penalty(outputs, labels, logits, penalty_weight=0.1):
    # 1. Standard cross-entropy loss
    ce_loss = outputs.loss

    # 2. Repetition penalty (penalize repeated consecutive tokens)
    shifted_logits = logits[:, :-1, :]  # Exclude last token
    shifted_labels = labels[:, 1:]      # Exclude first token

    # Calculate token repetitions in predictions
    preds = torch.argmax(shifted_logits, dim=-1)
    repeats = (preds[:, 1:] == preds[:, :-1]).float().mean()  # Avg repeats

    # 3. Combine losses
    total_loss = ce_loss + penalty_weight * repeats
    return total_loss, ce_loss, repeats

In [10]:
from torch.optim import SGD
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup

if device == "cpu":
    model = torch.compile(model, dynamic=False)
    

optimizer = AdamW(
    model.parameters(),
    lr=1e-4,  # Drastically reduce from 5e-4
    betas=(0.9, 0.999),
    weight_decay=0.01  # Increase regularization
)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

scaler = GradScaler()  # For mixed precision
model = model.to(device)

# 2. Enhanced Training Loop
best_val_loss = float('inf')
patience = 5
no_improvement = 0
accumulation_steps = 4

max_epochs = 100
total_steps = len(train_dataloader) * min(max_epochs, patience + 5)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=50,
    num_training_steps=total_steps
)

for epoch in range(max_epochs):  # Reduced epochs + early stopping
    model.train()
    total_loss = 0
    total_ce = 0
    total_repeats = 0

    for batch_idx, batch in enumerate(train_dataloader):
        # Mixed precision context
        with autocast(dtype):
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                labels=labels
            )
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs['logits']
            loss, ce_loss, repeats = compute_loss_with_rep_penalty(outputs, labels, logits)

            loss = loss / accumulation_steps  # Normalize loss

        scaler.scale(loss).backward()

        # Log every 10 batches
        if batch_idx % 10 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            print(f"Epoch {epoch} | Batch {batch_idx} | Avg Loss: {avg_loss:.4f}")

        if (batch_idx+1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()  # Also tie scheduler to accumulation
        total_loss += loss.item() * accumulation_steps
        total_ce += ce_loss.item()
        total_repeats += repeats.item()

    # 3. Validation Phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
      for val_batch in eval_dataloader:
          pixel_values = val_batch["pixel_values"].to(device)
          input_ids = val_batch["input_ids"].to(device)
          labels = val_batch["labels"].to(device)

          outputs = model(
              pixel_values=pixel_values,
              input_ids=input_ids,
              labels=labels
          )
          val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(eval_dataloader)
    print(f"Epoch {epoch} | Train Loss: {total_loss/len(train_dataloader):.4f} | Val Loss: {avg_val_loss:.4f}")
    print(f"Current LR: {scheduler.get_last_lr()[0]:.2e}")
    print(f"CE Loss: {total_ce/len(train_dataloader):.4f} | Repeat Penalty: {total_repeats/len(train_dataloader):.4f}")

    image = Image.open("1.jpg").convert("RGB")

    question = "Question: How often should I water it? Answer:"

    inputs = processor(image, question, return_tensors="pt").to("cuda")

    out = model.generate(**inputs,
                      max_length=80,
                      repetition_penalty=1.5,
                      length_penalty=1.0)
    print(processor.decode(out[0], skip_special_tokens=True).strip())

    print("-" * 50)

    # 4. Early Stopping & Checkpointing
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improvement = 0
        torch.save(model.state_dict(), f"best_model_epoch{epoch}.pt")
    else:
        no_improvement += 1

    if no_improvement >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

  scaler = GradScaler()  # For mixed precision
  with autocast(dtype):


Epoch 0 | Batch 0 | Avg Loss: 0.0000
Epoch 0 | Batch 10 | Avg Loss: 5.9400


KeyboardInterrupt: 

In [None]:
#from huggingface_hub import notebook_login
#notebook_login()  # Opens a widget to input token

model.eval()

image = Image.open("1.jpg").convert("RGB")

question = "Question: What is this plant? Answer:"

inputs = processor(image, question, return_tensors="pt").to("cuda")

out = model.generate(**inputs,
                  max_length=80,
                  repetition_penalty=1.5,
                  length_penalty=1.0)
print(processor.decode(out[0], skip_special_tokens=True).strip())



In [None]:
merged_model = model.merge_and_unload()

# Push model with your username
merged_model.push_to_hub("Baran657/blip_2_snapgarden")

# Push processor (optional but recommended)
processor.push_to_hub("Baran657/blip_2_snapgarden")

In [None]:
#model.merge_and_unload()

#model.eval()

#image = Image.open("1.jpg").convert("RGB")

#question = "Question: How often should I water it? Answer:"

#inputs = processor(image, question, return_tensors="pt").to("cuda")

#out = model.generate(**inputs,
#                      max_length=80,
#                      repetition_penalty=1.5,
#                      length_penalty=1.0)
#print(processor.decode(out[0], skip_special_tokens=True).strip())

#out = model.generate(pixel_values=pixel_values, input_ids=input_ids, return_tensors="pt")

#print(processor.decode(out[0], skip_special_tokens=True).strip())