# Data

In [1]:
# !wget -c https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json

In [2]:
import json

with open('./llava_instruct_150k.json', 'r') as f:
    data = json.load(f)

In [3]:
data[0]

{'id': '000000033471',
 'image': '000000033471.jpg',
 'conversations': [{'from': 'human',
   'value': '<image>\nWhat are the colors of the bus in the image?'},
  {'from': 'gpt', 'value': 'The bus in the image is white and red.'},
  {'from': 'human',
   'value': 'What feature can be seen on the back of the bus?'},
  {'from': 'gpt', 'value': 'The back of the bus features an advertisement.'},
  {'from': 'human',
   'value': 'Is the bus driving down the street or pulled off to the side?'},
  {'from': 'gpt',
   'value': 'The bus is driving down the street, which is crowded with people and other vehicles.'}]}

In [20]:
import os
import json
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
from transformers import AutoTokenizer

class LLaVADataset(Dataset):
    def __init__(self, json_path, image_dir, tokenizer, max_length=512):
        with open(json_path, 'r') as f:
            self.data = json.load(f)
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_transform = transforms.Compose([
            transforms.Resize((336, 336)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image_path = os.path.join(self.image_dir, item['image'])
        image = Image.open(image_path).convert('RGB')
        image = self.image_transform(image)

        # Combine all conversation texts into a single string
        conversation_text = ""
        for conv in item['conversations']:
            if conv['from'] == 'human':
                conversation_text += f"Human: {conv['value']}\n"
            elif conv['from'] == 'gpt':
                conversation_text += f"Assistant: {conv['value']}\n"

        inputs = self.tokenizer(
            conversation_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True
        )

        return {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze()
        }

# Usage
json_path = './llava_instruct_150k.json'
image_dir = './coco/train2017/'
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')  # Replace with your tokenizer
tokenizer.pad_token = tokenizer.eos_token
tiny_dataset = LLaVADataset(json_path, image_dir, tokenizer)
dataloader = DataLoader(tiny_dataset, batch_size=1, shuffle=True)


In [21]:
for sample in dataloader:
    print(sample.keys())
    for k in sample.keys():
        print(sample[k].shape)
    break

dict_keys(['image', 'input_ids', 'attention_mask'])
torch.Size([1, 3, 336, 336])
torch.Size([1, 512])
torch.Size([1, 512])


# EELLaVA

In [22]:
import torch
import torch.nn as nn
class TinyLLAVA(nn.Module):
    def __init__(self, vision_encoder, projection_head, text_decoder, tokenizer, device="cuda"):
        super(TinyLLAVA, self).__init__()
        self.vision_encoder = vision_encoder
        self.projection_head = projection_head
        self.text_decoder = text_decoder
        self.tokenizer = tokenizer

        
        self.device = device

        self.vision_encoder.to(device)
        self.projection_head.to(device)
        self.text_decoder.to(device)
        for param in self.vision_encoder.parameters():
            param.requires_grad = False

    def forward(self, image, input_ids, attention_mask):
        # Extract visual features
        with torch.no_grad():
            visual_features = self.vision_encoder(image)  # Shape: (batch_size, vision_feature_dim)
    
        # Project visual features to text embedding space
        projected_features = self.projection_head(visual_features).to(device)  # Move to the same device
    
        # Embed input tokens
        token_embeddings = self.text_decoder.transformer.wte(input_ids).to(device)  # Shape: (batch_size, seq_len, embedding_dim)
    
        # Combine visual features with token embeddings
        combined_embeddings = torch.cat(
            [projected_features.unsqueeze(1), token_embeddings], dim=1
        ).to(device)
    
        # Adjust attention mask to include visual tokens
        _ones = torch.ones((attention_mask.size(0), 1)).to(device)
        extended_attention_mask = torch.cat(
            [_ones, attention_mask], dim=1
        ).to(device)
        
        outputs = self.text_decoder(
            inputs_embeds=combined_embeddings,
            attention_mask=extended_attention_mask
        )
    
        return outputs


In [23]:
from torchvision import models
import torch.nn as nn
import torch

# Load pretrained MobileNetV3
vision_encoder = models.mobilenet_v3_small()
num_features = vision_encoder.classifier[-1].in_features
vision_encoder.classifier[-1] = torch.nn.Linear(num_features, 768)

# Load your custom weights
pretrained_weights_path = './mobilenetv3_student_model.pth'
state_dict = torch.load(pretrained_weights_path, map_location=torch.device('cpu'), weights_only=True)
vision_encoder.load_state_dict(state_dict)

# Set to evaluation mode
vision_encoder.eval()

print("Vision Encoder Ready")

Vision Encoder Ready


  return self.fget.__get__(instance, owner)()


In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer

device="cuda"
# Load DistillGPT
llm = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [30]:
# Initialize Tiny-LLaVA
projection_head = nn.Linear(768, 768).to(device)
llm.lm_head = nn.Linear(in_features=768, out_features=32000) # llava out features
tiny_llava = TinyLLAVA(vision_encoder, projection_head, llm, llava_tokenizer).to("cuda")

In [31]:
# Ensure your student model (tiny_llava) is in evaluation mode
tiny_llava.eval()

# Loop through the dataloader
for sample in dataloader:
    # Print shapes of the data in the batch
    print("Image tensor shape:", sample["image"].shape)  # Shape: (batch_size, 3, 224, 224)
    print("Input IDs shape:", sample["input_ids"].shape)  # Shape: (batch_size, seq_len)
    print("Attention Mask shape:", sample["attention_mask"].shape)  # Shape: (batch_size, seq_len)

    # Move tensors to GPU
    images = sample["image"].to("cuda")  # Shape: (batch_size, 3, 224, 224)
    input_ids = sample["input_ids"].to("cuda")  # Shape: (batch_size, seq_len)
    attention_mask = sample["attention_mask"].to("cuda")  # Shape: (batch_size, seq_len)

    # Perform a forward pass
    with torch.no_grad():  # No gradient computation needed for inference
        outputs = tiny_llava(images, input_ids=input_ids, attention_mask=attention_mask)

    # Print output logits shape
    print("Output logits shape:", outputs.logits.shape)  # Shape: (batch_size, seq_len, vocab_size)

    # Optional: Decode the generated text
    generated_ids = outputs.logits.argmax(dim=-1)  # Greedy decoding (most probable tokens)
    generated_text = [
        tokenizer.decode(generated_id, skip_special_tokens=True)
        for generated_id in generated_ids
    ]
    print("Generated text:", generated_text)

    # Break after first batch
    break


Image tensor shape: torch.Size([1, 3, 336, 336])
Input IDs shape: torch.Size([1, 512])
Attention Mask shape: torch.Size([1, 512])
Output logits shape: torch.Size([1, 513, 32000])
Generated text: [' clergyclassified ETF ETF keptMill Opposition ETFMilleon BarkeonMill predominantly activismMillMillMill syntaxNextSamMill clergyNextNext excitementclassified activismMillMillMillNextMill excitementNext Hispanics excitement excitementNextMillNextMill clergyMillMillMill excitementSamMillMillplyNext LewMillMill syntax ETF clergyMill 52MillMillMillMillMillMillMillNextMill HeatherMillMill activismNextMillNextMill clergyNextMill HeatherMill ETFMillMillMillMillMillMillMill Aid Aid afterwardMillMillMillNextMillMillMill AidMillriblyMill Hispanics excitementMillMillMillottenhamMill excitement keptMillMillMillMillNOMillMillMill clergyMillNextMill champMillMillNext Aid Protein AidNextMill activismMillriblyMill clergy clergyNextNext Barry ETFNext excitement clergy cater Aid preseasonMillMill AidMill excit

# LLaVA Model

In [27]:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.eval.run_llava import eval_model

model_path = "liuhaotian/llava-v1.5-7b"

llava_tokenizer, llava_model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,
    model_name=get_model_name_from_path(model_path),
    load_in_8bit=True
)
device = "cuda"
llava_model.model = llava_model.model.to(device)
llava_model.model = llava_model.model.to(torch.float16)

You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
large_dataset = LLaVADataset(json_path, image_dir, llava_tokenizer)
dataloader = DataLoader(large_dataset, batch_size=1, shuffle=True)

In [29]:
from PIL import Image
import torch
from torchvision import transforms

# Image preprocessing pipeline
image_transform = transforms.Compose([
    transforms.Resize((336, 336)),  # Resize to the expected input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Example: Forward pass with image and text
for sample in dataloader:
    # Extract text inputs
    input_ids = sample["input_ids"].to("cuda")  # Tokenized text input
    attention_mask = sample["attention_mask"].to("cuda")  # Attention mask

    # Extract and preprocess the image
    images = sample["image"].to("cuda").to(torch.float16)  # Assuming the dataloader already preprocesses images

    # Perform a forward pass through LLaVA
    llava_model.eval()
    with torch.no_grad():
        # with torch.
        outputs = llava_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            images=images  # Pass images to the model
        )

    # Print logits and their shape
    print("Logits shape:", outputs.logits.shape)  # Shape: (batch_size, seq_len, vocab_size)

    # Decode the generated text (optional, for validation purposes)
    generated_ids = outputs.logits.argmax(dim=-1)  # Greedy decoding
    generated_text = [
        tokenizer.decode(generated_id, skip_special_tokens=True)
        for generated_id in generated_ids
    ]
    print("Generated Text:", generated_text)

    # Break after the first sample for demonstration
    break


Logits shape: torch.Size([1, 142, 32000])
Generated Text: [' stagn implication.\'sing less or "ists\'sing tim especially conversion Bought spent\'s H representative eend gravitational.. masculct.. Marketing grillpic Victresingalause cle Bought spent\'s :\'sks Becauseer an kn opt Dangerous Records wasingcamera Dangerous80 exercise exercise Users eoor cle Boughtas diminish 47 undertcting cle Bought spent AV or holders eFunctionokllred biggest biggest Lib diminish# Boughtas hasher ± Misc35 second ± or (*ersasowersAccording Lilyograping MMA Dangerous Recordsas Culture waser cle Bought spent diminish underodeare Dangerous shut wasingcameraizes LiberalingAP or\'ll\'space eer ep Guys Hy terrifying diminishing cites diminish#.']
