In [4]:
import gc
from PIL import Image
import requests
import transformers
from transformers import AutoTokenizer, AutoProcessor, LlavaForConditionalGeneration, Blip2Model, Blip2Processor, Blip2ForConditionalGeneration, Blip2Config
import time
import torch
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss
from typing import Any, Optional, Tuple, Union
from transformers import TrainingArguments
import numpy as np
import evaluate
import wandb

from dataclasses import dataclass
import pandas as pd
from transformers import GPT2TokenizerFast

print(transformers.__version__)
model_id = "llava-hf/llava-1.5-7b-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"

4.37.1


# Download Weights

In [None]:
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    low_cpu_mem_usage=True,
    # load_in_4bit=True
).to(device)
print(f'Memory Allocated after instantiating models: {torch.cuda.memory_allocated(0)/1e9:.4g} GB')

# Hyperparameters

In [None]:
training_args = TrainingArguments(
    output_dir='./test_trainer',     # output directory
    num_train_epochs=2,              # total # of training epochs
    per_device_train_batch_size=5,   # batch size per device during training
    per_device_eval_batch_size=5,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

wandb.init(
    # set the wandb project where this run will be logged
    project="coffee-bot",
    
    # track hyperparameters and run metadata
    config={
        "learning_rate": 3e-4,
        "architecture": model.__class__.__name__,
        "dataset": "ROASTME-9540",
        "num_train_epoch": training_args.num_train_epochs,
        "warmup_steps": training_args.warmup_steps,
        "weight_decay": training_args.weight_decay,
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "per_device_eval_batch_size": training_args.per_device_eval_batch_size,        
    }
)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, eps=10e-4)

In [16]:
prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
# url = "https://www.ilankelman.org/stopsigns/australia.jpg"
url = "https://i.redd.it/0o673i8z5bkb1.jpg"
label = "I failed a drug test just by looking at this picture."
raw_image = Image.open(requests.get(url, stream=True).raw)


inputs = processor(prompt, raw_image, return_tensors='pt')
labels = processor.tokenizer(label, return_tensors="pt")

# output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
# print(processor.decode(output[0][2:], skip_special_tokens=True))


pixel_values = inputs.pixel_values.to(device, dtype=torch.float16)
input_ids = inputs.input_ids.to(device, dtype=torch.long)
attention_mask = inputs.attention_mask.to(device, dtype=torch.long)
label_input_ids = labels.input_ids.to(device, dtype=torch.long)

output = model(
    pixel_values=pixel_values,
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=label_input_ids,
)
output.loss

../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [12,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [13,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [14,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [15,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [16,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [17,0,0] Assertion `-sizes[i] <

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# Read in Dataset

In [None]:
filename = '../datasets/full_ds.csv'
df = pd.read_csv(filename)
df.head()

# Test

In [None]:
def run_inference_on_row(index):
    row = df.iloc[index]
    label = row['comment']
    prompt = "USER: <image>\nHow would you describe this person?\nASSISTANT:"
    image_url = row['image_url']
    image = Image.open(requests.get(image_url, stream=True).raw)

    display(image)
    inputs = processor(prompt, image, return_tensors="pt", padding=True, max_length=256, truncation=True).to(device, torch.float16)

    # ids = reference_model.generate(**inputs)
    # print(f'REFERENCE: {processor.batch_decode(ids, skip_special_tokens=True)[0].strip()}')
    print(f'LABEL: {label}')

    generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
    return processor.decode(generated_ids[0][2:], skip_special_tokens=True)

In [None]:
run_inference_on_row(2000)

# Training Loop

In [None]:
# Prepare model for training
model.train()

# Dataset variables
prompt = "USER: <image>\nHow would you describe this person?\nASSISTANT:"
labels = []
image = None
submission_id = None
out = None
losses = []


with torch.autograd.detect_anomaly():
    for epoch in range(training_args.num_train_epochs):
        print(f'EPOCH #{epoch}')
        for index, row in df.iterrows():    
            comment = row['comment']
            image_url = row['image_url']
            
            if submission_id is None:
                submission_id = row['submission_id']
        
            if image is None:
                image = Image.open(requests.get(image_url, stream=True).raw)
        
            # We reached a new submission
            if row['submission_id'] != submission_id or len(labels) >= training_args.per_device_train_batch_size:        
                # Expand the inputs
                prompts = [prompt] * len(labels)
                image_inputs = [image] * len(labels)
    
                try:
                    
                    # Forward Pass 
                    print(f'Forward pass {len(prompts)} prompts and {len(labels)} labels ({row["submission_id"]})')
                    label_input_ids = processor.tokenizer(labels, return_tensors="pt", padding=True, max_length=256, truncation=True).input_ids.to(device, torch.float16)

                    print(inputs)
                    print(label_input_ids)
                    inputs = processor(text=prompt, images=image, return_tensors="pt", padding=True, max_length=256, truncation=True).to(device)
                
                    out = model(**inputs, labels=label_input_ids)
                    print(out.loss)
        
                    # Backprop (calculate gradients)
                    out.loss.backward()
        
                    # Update weights using gradients
                    optimizer.step()
                    
                    # Reset the gradients
                    optimizer.zero_grad()
                    print(f"Loss at step {index} = {out.loss.item()}")
    
                    losses.append(out.loss.item())
                    print(f'Memory Allocated after first pass : {torch.cuda.memory_allocated(0)/1e9:.4g} GB')
                    print("--------------")
                    
                    # Reset
                    submission_id = row['submission_id']
                    image = Image.open(requests.get(row['image_url'], stream=True).raw)
                    labels = [row['comment']]
                except ValueError as ve:
                    # Code to handle the ValueError
                    print(f"Error: {ve}")
                    print(f"INPUTS: {prompts}")
                    print(f"LABELS: {labels}")
                    break
            else:
                labels.append(comment)

model.save_pretrained("./llava_fine_tuned_model", from_pt=True) 

In [None]:
for index, row in df.iterrows():
    print(row['comment'])
    if index > 10:
        break

In [18]:
import torch
from transformers import LlavaForConditionalGeneration
# model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)

pixel_values = torch.randn(
    (2, 3, 336, 336),
    dtype=torch.float16
).to(device)
input_ids = torch.tensor(
    [
        [32001, 32001, 1, 15043,  7084, 32000, 29871,    13, 7900],
        [1, 15043,  7084, 29901, 29871, 32000, 29871,    13, 7900]
    ], dtype=torch.long
).to(device)
attention_mask = torch.tensor(
    [
        [0, 0, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]
    ], dtype=torch.long
).to(device)

print(pixel_values.shape)
print(input_ids.shape)
print(attention_mask.shape)
# model(
#     pixel_values=pixel_values,
#     input_ids=input_ids,
#     attention_mask=attention_mask,
#     labels=input_ids,
# ).loss

model.cpu()
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_cached()

print(f'Memory Allocated after instantiating models: {torch.cuda.memory_allocated(0)/1e9:.4g} GB')

torch.Size([2, 3, 336, 336])
torch.Size([2, 9])
torch.Size([2, 9])


AttributeError: 'NoneType' object has no attribute 'cpu'

In [None]:
output.loss

In [17]:
!nvidia-smi

Tue Feb  6 21:19:58 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   28C    P0    48W / 400W |  26669MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces