Dans ce notebook, nous testons les permissions et les possibilités d'accès aux différentes composantes du modèle blip-image-captioning-base, et nous testons aussi les RBM (machines de Boltzmann restreintes).

In [1]:
from transformers import BlipProcessor, BlipModel
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests

In [None]:
# Local directory that holds a saved copy of the BLIP captioning checkpoint.
save_directory = "./models_saved/blip-image-captioning-base_local"

# Restore the processor and the captioning model from that directory.
processor = BlipProcessor.from_pretrained(save_directory)
model = BlipForConditionalGeneration.from_pretrained(save_directory)

Encodeur d’image ViT (Vision Transformer)

In [4]:
# Load the local image. The context manager releases the file handle as soon as
# the pixel data has been copied into the converted RGB image.
image_path = "images/sunset.jpg"
with Image.open(image_path) as raw_image:
    image = raw_image.convert("RGB")

# Preprocess the image into model-ready PyTorch tensors.
inputs = processor(images=image, return_tensors="pt")

# Run only the vision encoder (ViT) and keep its token-level output.
# Gradients are not needed for this inspection, so autograd is disabled.
with torch.no_grad():
    outputs = model.vision_model(**inputs)
    # Shape is (batch_size, sequence_length, hidden_dim). The recorded run gives
    # sequence_length = 577, presumably 1 class token + 24 x 24 patches -- TODO confirm.
    vit_embeddings = outputs.last_hidden_state

print(vit_embeddings.shape)  # recorded output for this checkpoint: torch.Size([1, 577, 768])

torch.Size([1, 577, 768])


In [5]:
# Print the module tree of the text decoder to see which submodules can be accessed.
print(model.text_decoder)

BlipTextLMHeadModel(
  (bert): BlipTextModel(
    (embeddings): BlipTextEmbeddings(
      (word_embeddings): Embedding(30524, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BlipTextEncoder(
      (layer): ModuleList(
        (0-11): 12 x BlipTextLayer(
          (attention): BlipTextAttention(
            (self): BlipTextSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BlipTextSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     

Entraînement de la RBM

In [None]:
# Smoke-test the RBM on synthetic embeddings (no real image features involved).
# Hyperparameters are named here so they can be tuned in one place.
SEED = 42
N_SAMPLES = 100        # number of simulated images
N_EPOCHS = 10          # full passes over the simulated embeddings
CD_STEPS = 1           # value forwarded as `k` to contrastive_divergence
LEARNING_RATE = 0.01   # value forwarded as `lr` to contrastive_divergence

visible_dim = 768      # same width as the ViT embeddings shown earlier (hidden size 768)
hidden_dim = 512

# Seed PyTorch's global RNG before anything random happens, so that the fake
# data (and any random initialisation or sampling done with that RNG) is
# reproducible after "Restart Kernel -> Run All".
torch.manual_seed(SEED)

# NOTE(review): `RBM` is neither defined nor imported in the cells visible here;
# it must be brought into scope before this cell can run on a fresh kernel.
rbm = RBM(visible_dim, hidden_dim)

embeddings = torch.randn(N_SAMPLES, visible_dim)  # fake embeddings, shape (N_SAMPLES, visible_dim)

# Train sample by sample: each 1-D embedding gets a leading batch axis of size 1.
for epoch in range(N_EPOCHS):
    for v in embeddings:
        rbm.contrastive_divergence(v.unsqueeze(0), k=CD_STEPS, lr=LEARNING_RATE)
    print(f"Epoch {epoch + 1} terminée")


Insertion dans BLIP

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load BLIP: processor and captioning model come from the same checkpoint identifier.
# NOTE(review): this rebinds the `processor` and `model` that were loaded earlier
# from the local directory -- confirm that reloading from the hub is intended.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor, model = (
    BlipProcessor.from_pretrained(blip_checkpoint),
    BlipForConditionalGeneration.from_pretrained(blip_checkpoint),
)

def refine_embeddings(image_tensor):
    """Encode an image batch with the BLIP vision model, then pass it through the RBM.

    Args:
        image_tensor: Preprocessed pixel values accepted by ``model.vision_model``
            (presumably the ``pixel_values`` tensor produced by the processor --
            verify against the caller).

    Returns:
        Whatever ``rbm(...)`` returns when called on the encoder's
        ``last_hidden_state``. The RBM is defined outside this function, so the
        output shape depends on its implementation.
    """
    # Inference only: disable autograd so no graph is kept for the ViT forward
    # pass, consistent with the embedding-extraction cell earlier in the notebook.
    with torch.no_grad():
        encoder_outputs = model.vision_model(pixel_values=image_tensor)  # vision features
        embeddings = encoder_outputs.last_hidden_state  # raw token-level embeddings
        refined_embeddings = rbm(embeddings)  # RBM pass
    return refined_embeddings

def generate_caption(image_tensor):
    """Generate a caption for an image batch from RBM-refined visual embeddings.

    The refined embeddings are handed to the text decoder as cross-attention
    context (``encoder_hidden_states``), not as ``inputs_embeds``: they are image
    features, not token embeddings. Decoding starts from a single BOS token per
    image, mirroring what ``BlipForConditionalGeneration.generate`` does with the
    raw vision features.

    Args:
        image_tensor: Preprocessed pixel values forwarded to ``refine_embeddings``.

    Returns:
        The decoded caption (str) of the first image in the batch.

    Note:
        Assumes ``refine_embeddings`` returns a tensor of shape
        (batch, sequence, width) whose width matches what the decoder's
        cross-attention expects (768 for this checkpoint, presumably) --
        TODO confirm against the RBM's output.
    """
    refined_embeddings = refine_embeddings(image_tensor)

    text_config = model.config.text_config
    batch_size = refined_embeddings.shape[0]
    device = refined_embeddings.device

    # Decoder prompt: one BOS token per image.
    input_ids = torch.full(
        (batch_size, 1), text_config.bos_token_id, dtype=torch.long, device=device
    )
    # Let the decoder attend to every visual token.
    encoder_attention_mask = torch.ones(
        refined_embeddings.shape[:-1], dtype=torch.long, device=device
    )

    # Generate the caption conditioned on the refined embeddings.
    generated_ids = model.text_decoder.generate(
        input_ids=input_ids,
        eos_token_id=text_config.sep_token_id,
        pad_token_id=text_config.pad_token_id,
        encoder_hidden_states=refined_embeddings,
        encoder_attention_mask=encoder_attention_mask,
    )
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return caption
