## Install & Import dependencies

In [None]:
# Install Libraries
!pip install -U bitsandbytes
!pip install transformers

In [None]:
# Imports
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoProcessor, LlavaForConditionalGeneration

## Load Dataset from HuggingFace

In [None]:
# Subset (Only for testing purpose)
data = load_dataset("AbdulMuqtadir/Doc_VQA_subset")
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw', 'ground_truth'],
        num_rows: 10
    })
    valid: Dataset({
        features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw', 'ground_truth'],
        num_rows: 10
    })
    test: Dataset({
        features: ['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw', 'ground_truth'],
        num_rows: 10
    })
})

## Load Processor and Model

In [None]:
# load processor
processor = AutoProcessor.from_pretrained('llava-hf/llava-1.5-7b-hf')

# Load model
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_8bit=True)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

### Preprocess function (Training)

In [None]:
training_dataset = data['train']

In [None]:
def preprocess_function_training(dataset):
    images = dataset["image_raw"]
    questions = dataset["question"]
    answers = dataset["answers"]

    batch_inputs = {"input_ids": [], "attention_mask": [], "pixel_values": [], "labels": []}

    for image, question, answer in zip(images, questions, answers):

        # Ensure answer is a string
        if isinstance(answer, list):
            answer = answer[0]

        # 1. Build training prompt
        prompt = (
            "USER: <image>\n"
            f"Question: {question}\n"
            "ASSISTANT:"
        )

        # 2. Tokenize prompt + image
        inputs = processor(
            text=prompt,
            images=image,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=1024,         # encoder side
        )

        input_ids = inputs["input_ids"]               # [1,1024]
        attention_mask = inputs["attention_mask"]     # [1,1024]
        pixel_values = inputs["pixel_values"]         # [1,3,336,336]

        # 3. Tokenize answer
        answer_ids = processor.tokenizer(
            answer,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )["input_ids"]                                # [1,512]

        # 4. Create labels aligned with input_ids
        labels = torch.full_like(input_ids, -100)     # [1,1024]

        # number of tokens in prompt (excluding padding)
        prompt_len = attention_mask.sum().item()

        # place answer tokens *after* prompt tokens
        end_pos = prompt_len + answer_ids.shape[1]
        if end_pos > labels.shape[1]:
            end_pos = labels.shape[1]

        labels[:, prompt_len:end_pos] = answer_ids[:, : (end_pos - prompt_len)]

        # 5. Append to batch lists
        batch_inputs["input_ids"].append(input_ids[0])
        batch_inputs["attention_mask"].append(attention_mask[0])
        batch_inputs["pixel_values"].append(pixel_values[0])
        batch_inputs["labels"].append(labels[0])

        # Debug Prints
        print(f"input ids shape: {input_ids.shape}")
        print(f"attention mask shape: {attention_mask.shape}")
        print(f"pixel values shape: {pixel_values.shape}")
        print(f"label ids shape: {answer_ids.shape}")
        print(f"labels shape: {labels.shape}")
        print("-" * 50)

    # 6. Convert lists into tensors
    for k in batch_inputs:
        batch_inputs[k] = torch.stack(batch_inputs[k])

    return batch_inputs


In [None]:
processed_training_dataset = preprocess_function_training(training_dataset)

input ids shape: torch.Size([1, 1024])
attention mask shape: torch.Size([1, 1024])
pixel values shape: torch.Size([1, 3, 336, 336])
label ids shape: torch.Size([1, 512])
labels shape: torch.Size([1, 1024])
--------------------------------------------------
input ids shape: torch.Size([1, 1024])
attention mask shape: torch.Size([1, 1024])
pixel values shape: torch.Size([1, 3, 336, 336])
label ids shape: torch.Size([1, 512])
labels shape: torch.Size([1, 1024])
--------------------------------------------------
input ids shape: torch.Size([1, 1024])
attention mask shape: torch.Size([1, 1024])
pixel values shape: torch.Size([1, 3, 336, 336])
label ids shape: torch.Size([1, 512])
labels shape: torch.Size([1, 1024])
--------------------------------------------------
input ids shape: torch.Size([1, 1024])
attention mask shape: torch.Size([1, 1024])
pixel values shape: torch.Size([1, 3, 336, 336])
label ids shape: torch.Size([1, 512])
labels shape: torch.Size([1, 1024])
-----------------------

In [None]:
processed_training_dataset

{'input_ids': tensor([[32001, 32001, 32001,  ...,  9047, 13566, 29901],
         [32001, 32001, 32001,  ...,  9047, 13566, 29901],
         [32001, 32001, 32001,  ...,  9047, 13566, 29901],
         ...,
         [32001, 32001, 32001,  ...,  9047, 13566, 29901],
         [32001, 32001, 32001,  ...,  9047, 13566, 29901],
         [32001, 32001, 32001,  ...,  9047, 13566, 29901]]),
 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]),
 'pixel_values': tensor([[[[ 1.7698,  1.7698,  1.7698,  ...,  1.7698,  1.7552,  1.7552],
           [-0.8434, -0.8726, -0.8872,  ..., -0.8288, -0.8434, -0.8580],
           [-1.1499, -1.1791, -1.1937,  ..., -1.1791, -1.1645, -1.1791],
           ...,
           [-1.1061, -1.0769, -1.1499,  ..., -1.2667, -1.2521, -1.2229],
           [-1.1499, -1.1499, -1.2229,  ..., -1.2

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# 1️⃣ Wrap tensors into a TensorDataset
# Make sure the keys match what your preprocessing function returned
dataset = TensorDataset(
    processed_training_dataset["input_ids"],       # [num_samples, seq_len]
    processed_training_dataset["attention_mask"],  # [num_samples, seq_len]
    processed_training_dataset["pixel_values"],    # [num_samples, 3, 336, 336]
    processed_training_dataset["labels"]           # [num_samples, seq_len]
)

# 2️⃣ Create a DataLoader for batching
dataloader = DataLoader(
    dataset,
    batch_size=4,      # <-- batch size you wanted
    shuffle=True       # shuffle samples each epoch
)

# Check one batch
for batch in dataloader:
    input_ids, attention_mask, pixel_values, labels = batch
    print("Batch input_ids shape:", input_ids.shape)
    print("Batch attention_mask shape:", attention_mask.shape)
    print("Batch pixel_values shape:", pixel_values.shape)
    print("Batch labels shape:", labels.shape)
    break  # only check the first batch


Batch input_ids shape: torch.Size([4, 1024])
Batch attention_mask shape: torch.Size([4, 1024])
Batch pixel_values shape: torch.Size([4, 3, 336, 336])
Batch labels shape: torch.Size([4, 1024])


## Training Loop

In [None]:
# LR can be adjusted based on your GPU & batch size
learning_rate = 5e-5

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [None]:
EPOCHS = 30
for epochs in range(EPOCHS):
    for batch in dataloader:
        input_ids, attention_mask, pixel_values, labels = batch
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f'Epoch: {epochs}, Loss: {loss.item()}')


    break

KeyboardInterrupt: 

## Inference

In [None]:
test_sample1 = data['test'][0]

In [None]:
test_sample1.keys()

dict_keys(['question', 'docId', 'answers', 'data_split', 'bounding_boxes', 'word_list', 'image_raw', 'ground_truth'])

In [None]:
question = test_sample1['question']
image = test_sample1['image_raw']
answer = test_sample1['answers'][0]

In [None]:
def preprocess_function(examples):
    image = examples["image_raw"]
    question = examples["question"]

    # DocVQA-optimized prompt
    prompt = (
        "You are an AI assistant specialized in Document Question Answering. "
        "Analyze the document image and provide a concise answer.\n"
        "USER: <image>\n"
        f"Question: {question}\n"
        "ASSISTANT:"
    )

    inputs = processor(
        text=prompt,
        images=image,
        return_tensors="pt",
        max_length=2048,          # IMPORTANT: fixes your warning
        padding="max_length",
        truncation=True,
    )

    return inputs


In [None]:
preprocessed_sample = preprocess_function(test_sample1)

In [None]:
# Exploring the preprocessed sample

#preprocessed_sample.keys()

for key, value in preprocessed_sample.items():
    print(key, value.shape)

input_ids torch.Size([1, 2048])
attention_mask torch.Size([1, 2048])
pixel_values torch.Size([1, 3, 336, 336])


## Explore/Study the Processor and Model



### 1. Test Inference Code

In [None]:
# Imports
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load Model & Processor
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)

processor = AutoProcessor.from_pretrained(model_id)

# Prepareprompt with question
conversation = [
    {

      "role": "user",
      "content": [
          {"type": "text", "text": "What are these?"},
          {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Load Image
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)

# Process the input (The processor takes image (PIL object) & Prompt (with question inside prompt))
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

# Generate output (model )
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)

# Decode the output of model to natural language
print(processor.decode(output[0][2:], skip_special_tokens=True))


`torch_dtype` is deprecated! Use `dtype` instead!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

ER:  
What are these? ASSISTANT: These are two cats lying on a pink couch.


### 2. Exploring Processor output

1. processor output is a dict with three keys
    
    - input_ids
    - attention_mask
    - pixel_values

2. shapes

In [None]:
print(f'Shape for input_ids: {inputs["input_ids"].shape}')            # (Batch size, tensor length) 2-dimensional
print(f'Shape for attention_mask: {inputs["attention_mask"].shape}')  # (Batch size, tensor length) 2-dimensional
print(f'Shape for pixel_values: {inputs["pixel_values"].shape}')      # (Batch size, channel, w, h) 4-dimensional


Shape for input_ids: torch.Size([1, 592])
Shape for attention_mask: torch.Size([1, 592])
Shape for pixel_values: torch.Size([1, 3, 336, 336])
