# Setup

In [2]:
import gc
from PIL import Image
import requests
from transformers import AutoTokenizer, AutoProcessor, LlavaForConditionalGeneration, Blip2Model, Blip2Processor, Blip2ForConditionalGeneration, Blip2Config
import time
import torch
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss
from typing import Any, Optional, Tuple, Union

from dataclasses import dataclass
import pandas as pd
from transformers import GPT2TokenizerFast

# Load Model Weights

## LLaVA

In [None]:
# model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True,).to(0)
# processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

## BLIP2

In [None]:
# Download BLIP2 weights
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.float16)

# Inference

In [None]:
prompt = "<image>\nUSER: What is in the image?\nASSISTANT:"
label_text = '</s> The image is of two cats laying on a couch with remotes on the couch\n'
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

In [None]:
gc.collect()
torch.cuda.empty_cache()
!nvidia-smi

## One loss

In [None]:
yb = ['</s> The image is of two cats laying on a couch with remotes on the couch\n', ' The image is of two cats laying on a couch with remotes on the couch\n', '</s>The image is of two cats laying on a couch with remotes on the couch\n', 'Swagger', ' The', '</s> The']

for label_text in yb: 
    inputs = processor(image, prompt, return_tensors="pt").to(0, torch.float16)
    label_input_ids = processor.tokenizer.encode(label_text, return_tensors="pt")
    print(inputs.input_ids.shape)
    generated_ids = model(pixel_values=inputs.pixel_values, input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, labels=label_input_ids)

    final_logit_layer = generated_ids.logits[:,-1,:]
    max_token_prob = F.softmax(final_logit_layer).argmax()
    prediction = processor.batch_decode(max_token_prob.unsqueeze(0), skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(f'Prediction {prediction} vs. {label_text} has loss: {generated_ids.loss}')

## Matrix loss

In [None]:
prompt = "Question: What is in the image? Answer:"
label_texts = [
    '</s> The image is of two cats laying on a couch with remotes on the couch\n',
    ' The image is of two cats laying on a couch with remotes on the couch\n',
    'Swagger',
    ' The',
    '</s> The'
][:3]

tokens = [processor.tokenizer.encode(s, return_tensors="pt", padding="max_length", max_length=512) for s in label_texts]
print(label_texts)
print(type(label_texts))
# processor.tokenizer.encode(label_texts, return_tensors="pt", padding="max_length", max_length=512)
processor.tokenizer(label_texts, return_tensors="pt", padding="max_length", max_length=512).input_ids.shape

In [None]:
!nvidia-smi
# !nvidia-smi --gpu-reset

In [None]:
# Clear GPU
torch.cuda.empty_cache()
del generated_ids
del model
del processor
del forward_ids
del final_layer_logits
torch.cuda.empty_cache()

# Test Section

In [None]:
inputs = processor(image, prompt, return_tensors="pt").to(0, torch.float16)
forward_ids = model(pixel_values=inputs.pixel_values, input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
# generated_ids = model.generate(pixel_values=inputs.pixel_values, input_ids=inputs.input_ids, max_length=200, do_sample=False)

In [None]:
final_layer_logits = forward_ids.logits[:, -1, :]
probs = F.softmax(final_layer_logits)
idx = torch.argmax(probs)
processor.decode(idx)

print(idx)

# Training?

In [None]:
from transformers import TrainingArguments
import numpy as np
import evaluate
import wandb

In [None]:
training_args = TrainingArguments(
    output_dir='./test_trainer',     # output directory
    num_train_epochs=2,              # total # of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

In [None]:
# training_args.num_train_epochs
model.__class__.__name__

# TO-DOs

- [] Compute Metrics. Are we just using NTP loss? AKA cross entropy
- [] Tune/expose hyperparameters for configuration
- [] Setup with W&Bs
- [] Build training and eval datasets.

# W & B

In [None]:
# Already handled
# !wandb login

wandb.init(
    # set the wandb project where this run will be logged
    project="coffee-bot",
    
    # track hyperparameters and run metadata
    config={
        "learning_rate": 0.02,
        "architecture": model.__class__.__name__,
        "dataset": "CIFAR-100",
        "num_train_epoch": training_args.num_train_epochs,
        "warmup_steps": training_args.warmup_steps,
        "weight_decay": training_args.weight_decay,
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "per_device_eval_batch_size": training_args.per_device_eval_batch_size,        
    }
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)

# I'M NOT CRAZY

In [None]:
inputs = processor(image, prompt, return_tensors="pt").to(0, torch.float16)
print(inputs.input_ids)

test = torch.tensor([[81.0]]).to(0, torch.float16)

inputs.input_ids = inputs.input_ids.to(torch.float16)

print(test)
print(inputs.input_ids)
torch.cat((inputs.input_ids, test), dim=1)
# torch.cat((inputs.input_ids, test, dim=1)

# NEW

In [None]:
pred = None
idx = None
curr_prompt = prompt
input_ids = None
res_ids = None

# While pred != '\n'

start_time = time.time()

# Get image_embedding and project
input_ids, language_model_inputs, vision_outputs = image_encoding(model=model, processor=processor, curr_prompt=curr_prompt)

while idx != 50118:

    # Forward pass
    forward_ids = forward_from_image_query_output(model=model, language_model_inputs=language_model_inputs, input_ids=input_ids)

    # Get the final layer of logits
    final_layer_logits = forward_ids['logits'][:, -1, :]

    # Get prediction
    probs = F.softmax(final_layer_logits)
    idx = torch.argmax(probs)

    # Convert to shape
    idx = torch.tensor([[idx.item()]]).to('cuda')

    if res_ids is None:
        res_ids = idx
    else:
        res_ids = torch.cat([res_ids, idx], dim = 1)
        
    input_ids = torch.cat([input_ids, idx], dim = 1)
    # print(processor.batch_decode(res_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])

end_time = time.time()
print(res_ids)
print(f'Took {end_time - start_time} seconds')
processor.batch_decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

# Utils

In [None]:
def forward_from_image_query_output(
    model,
    language_model_inputs: torch.FloatTensor,
    input_ids: torch.FloatTensor,
    attention_mask: Optional[torch.LongTensor] = None,
    labels: Optional[torch.LongTensor] = None,
):
    output_attentions = None
    output_hidden_states = None
    return_dict = model.config.use_return_dict
    
    # step 3: use the language model, conditioned on the query outputs and the prompt
    language_model_attention_mask = torch.ones(
        language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
    )
    inputs_embeds = model.language_model.get_input_embeddings()(input_ids)
    inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)

    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    expected_device = language_model_attention_mask.device
    attention_mask = torch.cat([language_model_attention_mask, attention_mask.to(expected_device)], dim=1)

    outputs = model.language_model(
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    logits = outputs.logits if return_dict else outputs[0]
    loss = None
    # we compute the loss here since we need to take into account the sequence length of the query embeds
    if labels is not None:
        labels = labels.to(logits.device)
        logits = logits[:, -labels.size(1) :, :]
        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().to(logits.device)

        # Flatten the tokens
        loss_fct = CrossEntropyLoss(reduction="mean")

        loss = loss_fct(shift_logits.view(-1, model.config.text_config.vocab_size), shift_labels.view(-1))

    if not return_dict:
        output = (logits, outputs)
        return ((loss,) + output) if loss is not None else output

    return {
        'loss': loss,
        'logits': logits,
        'language_model_outputs': outputs,
    }

def image_encoding(
    model,
    processor,
    image,
    curr_prompt
):  
    inputs = processor(image, curr_prompt, return_tensors="pt").to(0, torch.float16)
    print(inputs)
    pixel_values = inputs.pixel_values
    input_ids = inputs.input_ids
    
    decoder_input_ids = None
    decoder_attention_mask = None
    output_attentions = None
    output_hidden_states = None
    labels = None
    return_dict = model.config.use_return_dict

    # step 1: forward the images through the vision encoder,
    # to get image embeddings of shape (batch_size, seq_len, hidden_size)
    vision_outputs = model.vision_model(
        pixel_values=pixel_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    image_embeds = vision_outputs[0]
    print(f'Image Embedds: {image_embeds.shape}')

    # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
    image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

    query_tokens = model.query_tokens.expand(image_embeds.shape[0], -1, -1)
    query_outputs = model.qformer(
        query_embeds=query_tokens,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    query_output = query_outputs[0]

    # step 3: use the language model, conditioned on the query outputs and the prompt
    return input_ids, model.language_projection(query_output), vision_outputs

def custom_forward(
    model,
    pixel_values: torch.FloatTensor,
    input_ids: torch.FloatTensor,
    attention_mask: Optional[torch.LongTensor] = None,
    decoder_input_ids: Optional[torch.LongTensor] = None,
    decoder_attention_mask: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    labels: Optional[torch.LongTensor] = None,
    return_dict: Optional[bool] = None,
):
    vision_outputs = model.vision_model(
        pixel_values=pixel_values,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    image_embeds = vision_outputs[0]
    print(f'Image Embeds: {image_embeds.shape}')

    # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
    image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

    query_tokens = model.query_tokens.expand(image_embeds.shape[0], -1, -1)
    query_outputs = model.qformer(
        query_embeds=query_tokens,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    query_output = query_outputs[0]

    print(f'Query Output: {query_output.shape}')

    # step 3: use the language model, conditioned on the query outputs and the prompt
    language_model_inputs = model.language_projection(query_output)
    language_model_attention_mask = torch.ones(
        language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
    )
    print(f'Input Ids: {input_ids.shape}')
    inputs_embeds = model.language_model.get_input_embeddings()(input_ids)
    print(f'Input Embeds: {inputs_embeds.shape}')
    print(f'language_model_inputs: {language_model_inputs.shape}')
    inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)

    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    expected_device = language_model_attention_mask.device
    attention_mask = torch.cat([language_model_attention_mask, attention_mask.to(expected_device)], dim=1)

    if model.config.use_decoder_only_language_model:
        outputs = model.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = outputs.logits if return_dict else outputs[0]
        loss = None
        # we compute the loss here since we need to take into account the sequence length of the query embeds
        if labels is not None:
            labels = labels.to(logits.device)
            logits = logits[:, -labels.size(1) :, :]
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous().to(logits.device)

            # Flatten the tokens
            loss_fct = CrossEntropyLoss(reduction="mean")
            print(f'Calculating loss with shift_logits {shift_logits.shape} and {shift_labels.shape}')
            loss = loss_fct(shift_logits.view(-1, model.config.text_config.vocab_size), shift_labels.view(-1))
    else:
        outputs = model.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            labels=labels,
        )
        loss = outputs.loss if return_dict else outputs[0]
        logits = outputs.logits if return_dict else outputs[1]

    # if not return_dict:
    #     output = (logits, vision_outputs, query_outputs, outputs)
    #     return ((loss,) + output) if loss is not None else output

    return {
        'loss': loss,
        'logits': logits,
        'vision_outputs': vision_outputs,
        'qformer_outputs': query_outputs,
        'language_model_outputs': outputs,
    }

# Need loss
- How to
Read in 

In [None]:
df = pd.read_csv('../datasets/full_ds.csv')
df.head()

In [None]:
tokenized_inputs = df['comment'].astype(str).apply(lambda str: str[:500]).apply(lambda str: processor.tokenizer.encode(str, padding="max_length",max_length=512))
input_ids = torch.tensor(tokenized_inputs)

In [None]:
input_ids.shape

In [None]:
def build_batch(comments):
    xb = [] # B,T,C --> [1, 512, ?]
    yb = [] # B,T,1 

    for comment in comments:
        context = []
        
        # tokenize the comment
        print(comment)
        tokenized = processor.tokenizer.encode(comment, padding="max_length", max_length=512)
        print(tokenized)
        
        # for token
        for idx, t in enumerate(tokenized):
            if idx == 0:
                continue
                
            context = tokenized[0:idx]
            label = tokenized[idx:idx + 1]

            xb.append(context)
            yb.append(label)
            
    return xb, yb

# JUST DO IT

In [3]:
# Download BLIP2 weights
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.float16)
print(f'Memory Allocated after instantiating model: {torch.cuda.memory_allocated(0)/1e9:.4g} GB')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Memory Allocated after instantiating model: 4 GB


In [4]:
optimizer = torch.optim.AdamW(model.parameters())

In [5]:
filename = '../datasets/full_ds.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,submission_id,comment_id,comment,width,height,image_url,image_path
0,amnyr9,efnhj58,Sometimes the line between life and suicide is...,1960,4032,https://i.redd.it/s2fi2wr7ibe21.jpg,./images/amnyr9.jpg
1,amnyr9,efngbk8,"Your support is amazing, I feel like I can act...",1960,4032,https://i.redd.it/s2fi2wr7ibe21.jpg,./images/amnyr9.jpg
2,amnyr9,efnhf5p,As a Russian are you obligated to dress as the...,1960,4032,https://i.redd.it/s2fi2wr7ibe21.jpg,./images/amnyr9.jpg
3,amnyr9,efngwcp,You're a perfectly valid person and we care ab...,1960,4032,https://i.redd.it/s2fi2wr7ibe21.jpg,./images/amnyr9.jpg
4,amnyr9,efngp00,"I had the same haircut once, I was about 1s old",1960,4032,https://i.redd.it/s2fi2wr7ibe21.jpg,./images/amnyr9.jpg


In [6]:
epochs = 1
prompt = "Question: How would you describe this person? Answer:"
labels = []
image = None
submission_id = None
output = None

for index, row in df.iterrows():
    comment = row['comment']
    image_url = row['image_url']
    
    if submission_id is None:
        submission_id = row['submission_id']

    if image is None:
        image = Image.open(requests.get(image_url, stream=True).raw)

    # We reached a new submission
    if row['submission_id'] != submission_id or len(labels) >5:
        # Expand the inputs
        prompts = [prompt] * len(labels)
        image_inputs = [image] * len(labels)
        
        # Forward Pass 
        print(f'Forward pass {len(prompts)} prompts and {len(labels)} labels')
        label_input_ids = processor.tokenizer(labels, return_tensors="pt", padding="max_length", max_length=256).input_ids
        inputs = processor(image_inputs, prompts, return_tensors="pt", padding="max_length", max_length=256).to(0, torch.float16)
        print(label_input_ids.shape)
        print(inputs.input_ids.shape)
        print(f'Memory Allocated after processing input: {torch.cuda.memory_allocated(0)/1e9:.4g} GB')
        out = model(pixel_values=inputs.pixel_values, input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, labels=label_input_ids)
        print(f'Memory Allocated after foward pass: {torch.cuda.memory_allocated(0)/1e9:.4g} GB')

        # TODO: Backprop
        out.loss.backward()
        print(f'Memory Allocated after calculating gradients: {torch.cuda.memory_allocated(0)/1e9:.4g} GB')
        optimizer.step()
        print(f'Memory Allocated after updating gradients: {torch.cuda.memory_allocated(0)/1e9:.4g} GB')
        print(f"Loss at step {index} = {loss}")
        break

        # Reset
        submission_id = row['submission_id']
        image = Image.open(requests.get(row['image_url'], stream=True).raw)
        labels = [row['comment']]
    else:
        labels.append(comment)

Forward pass 6 prompts and 6 labels
torch.Size([6, 256])
torch.Size([6, 256])
Memory Allocated after processing input: 4.001 GB
Memory Allocated after foward pass: 17.3 GB
Memory Allocated after calculating gradients: 10.29 GB
Memory Allocated after updating gradients: 10.84 GB


NameError: name 'idx' is not defined

In [4]:
print(f'Torch Version: {torch.__version__}')
print(f'Torch available: {torch.cuda.is_available()}')
print(f'CUDA version: {torch.version.cuda}')
print(f'CUDNN Version: {torch.backends.cudnn.version()}')
print(f'CUDNN Available: {torch.backends.cudnn.is_available()}')

Torch Version: 2.2.0+cu121
Torch available: True
CUDA version: 12.1
CUDNN Version: 8900
CUDNN Available: True


In [13]:
# This Works
x = torch.randn(1, 3, 224, 224).cuda()
conv = torch.nn.Conv2d(3, 3, 3).cuda()
out = conv(x)
print(out)
torch.backends.cudnn.version()

tensor([[[[-0.2258,  0.5908,  0.5395,  ..., -0.4635, -0.0105,  1.0169],
          [-0.0187,  0.9595,  0.0972,  ...,  0.1228,  0.2397,  0.9744],
          [ 1.2488,  0.2947, -0.5481,  ..., -0.9220, -0.3484, -0.1615],
          ...,
          [-0.5578,  0.1989, -0.7338,  ..., -0.4295, -0.6536,  0.0914],
          [ 0.0763,  0.5592, -0.6916,  ...,  0.6397,  0.8285,  0.4860],
          [ 0.1623, -0.1973, -1.0511,  ...,  0.6243,  0.3905,  0.0339]],

         [[ 0.1889, -0.2638,  0.1338,  ...,  0.5188, -0.2363,  0.5212],
          [ 0.1145, -0.3007,  0.5670,  ...,  0.4922,  0.7508, -1.2253],
          [ 0.6015,  0.1288, -0.6906,  ..., -0.1090, -0.0872, -0.2802],
          ...,
          [-0.2399,  0.3944,  1.0016,  ...,  0.4502,  0.2785,  0.2913],
          [ 0.5198, -1.0184,  0.0961,  ..., -0.1031,  1.1517, -0.1195],
          [-0.5453, -0.1497,  1.2029,  ..., -0.1827,  0.6232,  0.2847]],

         [[ 0.6258,  0.2314,  0.1132,  ...,  0.2032, -1.3933, -1.0281],
          [-0.3209, -1.3799, -

8900

In [4]:
torch.__path__

['/home/devonperoutky/.cache/pypoetry/virtualenvs/coffeebot-p3lKt8zM-py3.10/lib/python3.10/site-packages/torch']