In [374]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities


In [55]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

# inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

# outputs = model(**inputs)
# logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
# probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
    return trainable_params, all_param
    
_,basic_model_params_num =  print_trainable_parameters(model)

trainable params: 151277313 || all params: 151277313 || trainable%: 100.00


In [56]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=4,
    lora_alpha=16, # alpha/r 이다. 가중치 조정하는 하이퍼 파라미터라고 생각하면된다. 
    # 내생각에는 rank가 2처럼 작은경우 조금더 높은값으로 조정하기 위해있는게 아닐까 추측한다.
    
    # transformer의 clip에서는 아래와 같이 이름이 붙여져있다.
    target_modules=["q_proj", "v_proj","k_proj"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["encoder"],
)

lora_model = get_peft_model(model, config)
_,lora_model_params_num=print_trainable_parameters(lora_model)

print(lora_model_params_num/basic_model_params_num)

trainable params: 246503424 || all params: 274897665 || trainable%: 89.67
1.8171770740005146


In [50]:
for name,params in lora_model.named_parameters():
    print(name)
    print(params.shape)
    break

base_model.model.logit_scale
torch.Size([])


In [57]:
for name,params in lora_model.named_parameters():
    if params.requires_grad==True:
        print(name)


base_model.model.text_model.encoder.original_module.layers.0.self_attn.k_proj.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.k_proj.bias
base_model.model.text_model.encoder.original_module.layers.0.self_attn.k_proj.lora_A.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.k_proj.lora_B.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.v_proj.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.v_proj.bias
base_model.model.text_model.encoder.original_module.layers.0.self_attn.v_proj.lora_A.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.v_proj.lora_B.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.q_proj.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.q_proj.bias
base_model.model.text_model.encoder.original_module.layers.0.self_attn.q_proj.lora_A.default.wei

In [59]:
import loralib as lora
# This sets requires_grad to False for all parameters without the string "lora_" in their names
lora.mark_only_lora_as_trainable(lora_model)

for name,params in lora_model.named_parameters():
    if params.requires_grad==True:
        print(name)



base_model.model.text_model.encoder.original_module.layers.0.self_attn.k_proj.lora_A.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.k_proj.lora_B.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.v_proj.lora_A.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.v_proj.lora_B.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.q_proj.lora_A.default.weight
base_model.model.text_model.encoder.original_module.layers.0.self_attn.q_proj.lora_B.default.weight
base_model.model.text_model.encoder.original_module.layers.1.self_attn.k_proj.lora_A.default.weight
base_model.model.text_model.encoder.original_module.layers.1.self_attn.k_proj.lora_B.default.weight
base_model.model.text_model.encoder.original_module.layers.1.self_attn.v_proj.lora_A.default.weight
base_model.model.text_model.encoder.original_module.layers.1.self_attn.v_proj.lora_B.default.weight


In [60]:
_,lora_model_params_num=print_trainable_parameters(lora_model)

print(lora_model_params_num/basic_model_params_num)

trainable params: 737280 || all params: 274897665 || trainable%: 0.27
1.8171770740005146


In [64]:
import loralib as lora
# This sets requires_grad to False for all parameters without the string "lora_" in their names
lora.mark_only_lora_as_trainable(lora_model)

for name,params in lora_model.named_parameters():
    if params.requires_grad==True:
        print(name)
        print(params)



base_model.model.text_model.encoder.original_module.layers.0.self_attn.k_proj.lora_A.default.weight
Parameter containing:
tensor([[-0.0146, -0.0266, -0.0164,  ...,  0.0242, -0.0045,  0.0321],
        [ 0.0251,  0.0359,  0.0075,  ..., -0.0364,  0.0329, -0.0271],
        [ 0.0325, -0.0404, -0.0305,  ...,  0.0002,  0.0399, -0.0355],
        [ 0.0407,  0.0359, -0.0138,  ..., -0.0386,  0.0224,  0.0214]],
       requires_grad=True)
base_model.model.text_model.encoder.original_module.layers.0.self_attn.k_proj.lora_B.default.weight
Parameter containing:
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]], requires_grad=True)
base_model.model.text_model.encoder.original_module.layers.0.self_attn.v_proj.lora_A.default.weight
Parameter containing:
tensor([[ 0.0178, -0.0020,  0.0006,  ..., -0.0027,  0.0026, -0.0029],
        [ 0.0242, -0.0399,  0.0102,  ...,  0.0096, -0.0311, -0.027

In [297]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

In [323]:
from transformers import AutoTokenizer, CLIPTextModelWithProjection

model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

outputs = model(**inputs)
text_embeds = outputs.text_embeds
text_embeds.shape,outputs.last_hidden_state.shape

(torch.Size([2, 512]), torch.Size([2, 7, 512]))

In [351]:
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPVisionModel

model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# inputs = processor(['dog lives in a house which lives cats'],images=image, return_tensors="pt")
inputs = processor( images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state
pooled_output = outputs.pooler_output  # pooled CLS states

pooled_output.shape


torch.Size([1, 768])

In [312]:
# example = torch.rand(1, 3, 224, 224)
import torch
# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
traced_script_module = torch.jit.trace(model, **inputs)
# model(**inputs)

TypeError: trace() got an unexpected keyword argument 'input_ids'

In [79]:
inputs = processor(text=["a photo of a cat and a photo of a dog is in my house in the morning"], images=image, return_tensors="pt", padding=True)
inputs['input_ids'].shape,inputs['attention_mask'].shape,inputs['pixel_values'].shape

(torch.Size([1, 20]), torch.Size([1, 20]), torch.Size([1, 3, 224, 224]))

In [243]:
from transformers import AutoTokenizer, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
text_features = model.get_text_features(**inputs)

text_features.shape

torch.Size([2, 512])

In [282]:
x = torch.randn(1,3,224,224)
x = torch.randn(1,3,337,337)
model.vision_model.embeddings.patch_embedding(x).shape
model.vision_model.embeddings.patch_embedding
model.vision_model.embeddings.position_embedding

Embedding(50, 768)

In [259]:
x = inputs['input_ids'][0]
x


model.text_model.embeddings.token_embedding(x).shape,model.text_model.embeddings.position_embedding.weight.shape



(torch.Size([7, 512]), torch.Size([77, 512]))

In [206]:
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")

image_features = model.get_image_features(**inputs)

##  model encapsulation test

In [199]:
import torch
import torch.nn as nn
class Layer(nn.Module):
    def __init__(self) -> None:
        super(Layer,self).__init__()
        self.l1 = nn.Linear(4,16)
        self.l2 = nn.Linear(16,4)
    
    def forward(self,x):
        x = self.l1(x)
        x = self.l2(x)
        return x

class Mymodel(Layer):
    def __init__(self) -> None:
        # super(Mymodel,self).__init__()
        super(Layer,self).__init__()
    
        self.block1 = Layer()
        # self.fc2 = nn.Linear(4,2)
    
    def forward(self, x):
        # return super().forward(x)
        # x = self.l1(x)
        # x = self.l2(x)
        x = self.block1(x)
        # x = self.fc2(x)
        return x

class clcl(Mymodel):
    def __init__(self):
        super(Mymodel,self).__init__()
        # super(clcl,self).__init__()
        self.visual_encoder0 = Mymodel()
        
    def froward(self,x):
        x = self.visual_encoder0(x)
        return x

x = torch.randn(4)
mode = Layer()

y = mode(x)

mode2 = Mymodel()

y2 = mode2(x)

mode3 = clcl()
# y3 = mode3(x)
for name,params in mode3.named_parameters():
    print(name)
    print(params.shape)

l1.weight
torch.Size([16, 4])
l1.bias
torch.Size([16])
l2.weight
torch.Size([4, 16])
l2.bias
torch.Size([4])
visual_encoder0.block1.l1.weight
torch.Size([16, 4])
visual_encoder0.block1.l1.bias
torch.Size([16])
visual_encoder0.block1.l2.weight
torch.Size([4, 16])
visual_encoder0.block1.l2.bias
torch.Size([4])


In [230]:
x = torch.randn(49408)

x = torch.randint(0,3,(1,49408))
print(x.shape)
model.text_model.embeddings.token_embedding(x).shape

torch.Size([1, 49408])


torch.Size([1, 49408, 512])

In [368]:
model.vision_model.encoder(y[1]).last_hidden_state.shape



torch.Size([13, 50, 768])

In [369]:
x = torch.randn(13,3,224,224) #, 128 * 6
y = model.vision_model(x)
y[0].shape,y[1].shape

# xx = torch.randn(1,512)
# y = model.text_model(xx)

# model.text_model.encoder
# model.vision_model.encoder


(torch.Size([13, 50, 768]), torch.Size([13, 768]))

In [101]:
# 구조가 어떻게되어있는지 파헤쳐볼 필요가 있다. 유튜브 블로그 찾아보자.

for name,_ in model.named_parameters():
    print(name)


logit_scale
text_model.embeddings.token_embedding.weight
text_model.embeddings.position_embedding.weight
text_model.encoder.layers.0.self_attn.k_proj.weight
text_model.encoder.layers.0.self_attn.k_proj.bias
text_model.encoder.layers.0.self_attn.v_proj.weight
text_model.encoder.layers.0.self_attn.v_proj.bias
text_model.encoder.layers.0.self_attn.q_proj.weight
text_model.encoder.layers.0.self_attn.q_proj.bias
text_model.encoder.layers.0.self_attn.out_proj.weight
text_model.encoder.layers.0.self_attn.out_proj.bias
text_model.encoder.layers.0.layer_norm1.weight
text_model.encoder.layers.0.layer_norm1.bias
text_model.encoder.layers.0.mlp.fc1.weight
text_model.encoder.layers.0.mlp.fc1.bias
text_model.encoder.layers.0.mlp.fc2.weight
text_model.encoder.layers.0.mlp.fc2.bias
text_model.encoder.layers.0.layer_norm2.weight
text_model.encoder.layers.0.layer_norm2.bias
text_model.encoder.layers.1.self_attn.k_proj.weight
text_model.encoder.layers.1.self_attn.k_proj.bias
text_model.encoder.layers.1.s

In [381]:


# model.get_image_features(**inputs)
# model.get__features(**inputs)

In [27]:
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

logits_per_image

tensor([[24.5701, 19.3049]], grad_fn=<TBackward0>)