In [1]:
import os 
from PIL import Image
import requests
import torch
import numpy as np
from transformers import (
    Blip2Processor,
    Blip2VisionConfig,
    Blip2QFormerConfig,
    OPTConfig,
    Blip2Config,
    Blip2ForConditionalGeneration,
)

# Initializing a Blip2Config with Salesforce/blip2-opt-2.7b style configuration
#configuration = Blip2Config()

# Initializing a Blip2ForConditionalGeneration (with random weights) from the Salesforce/blip2-opt-2.7b style configuration
#model = Blip2ForConditionalGeneration(configuration)

# Accessing the model configuration
#configuration = model.config

# We can also initialize a Blip2Config from a Blip2VisionConfig, Blip2QFormerConfig and any PretrainedConfig

# Initializing BLIP-2 vision, BLIP-2 Q-Former and language model configurations

#vision_config = Blip2VisionConfig()
#qformer_config = Blip2QFormerConfig()
#text_config = OPTConfig()

#config = Blip2Config.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)

In [2]:
# Common for visual question answering and all
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
)

model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0): Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
        )
        (1): Blip2EncoderLayer(
          (self_attn): 

In [49]:
# Image captioning without prompt
img = Image.open("franka.jpg")
image = img

inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)

generated_ids = model.generate(**inputs,max_new_tokens = 200, min_new_tokens=0)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

print(generated_text)

a robot is working on a table with wires and other equipment


In [90]:
# VQA answering (with prompt obviously)
import time

start_t = time.time()
img = Image.open("lab.jpg")
image = img

prompt = "Question: Is the cup half empty or half full? Answer:"
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)

generated_ids = model.generate(**inputs, max_new_tokens = 100, do_sample = False)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
end_t = time.time()
print(end_t-start_t)
print(generated_text)

0.26520681381225586
it depends on the cup


In [97]:
# Just using the LLM (only prompt, no image)
prompt = "Continue the story: The macedonian girl was drinking coffe nex "

input_ids = processor.tokenizer(prompt, return_tensors="pt").input_ids.to(device)
out_ids = model.language_model.generate(input_ids, max_new_tokens = 1000,min_length = 500, do_sample = True)

generated_text = processor.tokenizer.batch_decode(out_ids)[0].strip()
print(generated_text)

</s>Continue the story: The macedonian girl was drinking coffe nex  Her: what are you doing, i want to have a sleep?   Him: i wanted to talk to you. And that is why I woke up at 2 am and didn't go back to sleep.   And here's the twist. He woke up at 2 am so you sleep and not your friend (or macedonian girl) who is still asleep.  It'll be your last dream. And they'll be waking up at 6am for school. Hehe   Sleep tight boy. Good night. 👋😂😊😍😘😭😥😕😬😩💜💙😘😢💚😢💜💙💙💙💙💙😞😞😞😞😞😞😞😞😞😞😞😞😞😔😔😔😢😢😢😢😢😢😢😢😢😢😢👊👏👏👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍👍🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐀🐝🐐🐘😢🐐</s>Does anyone know of any ways in order to help keep me cool down during the summer?
