In [67]:
from __future__ import print_function

import os
import time
import glob
import random
import zipfile
from itertools import chain

import timm
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torchvision import models
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from torchvision.transforms import functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset

from LATransformer.model import ClassBlock, LATransformer, LATransformerTest
from LATransformer.utils import save_network, update_summary, get_id
from LATransformer.metrics import rank1, rank5, rank10, calc_map

os.environ['CUDA_VISIBLE_DEVICES']='0'
device = 'cuda'

## Config Parameters

In [68]:
batch_size = 8
gamma = 0.7
seed = 42

## Load Model

In [72]:
# Load ViT
vit_base = timm.create_model('vit_small_patch16_224', pretrained=True, num_classes=751)
vit_base= vit_base.to(device)

# Create La-Transformer
model = LATransformerTest(vit_base, lmbd=8).to(device)

# Load LA-Transformer
name = "occluded_duke_wo_augm"
save_path = os.path.join('./weights',name,'model_state_dict_small.pth')
model.load_state_dict(torch.load(save_path), strict=False)
model.eval()

LATransformerTest(
  (model): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (patch_drop): Identity()
    (norm_pre): Identity()
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (q_norm): Identity()
          (k_norm): Identity()
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU()
          (d

# Calculation of GFLOPs

In [77]:
import torch
import torchvision.models as models
from thop import profile

# Load the pretrained ResNet-50 model
#model = models.resnet101(pretrained=True)

# Set the input size (224x224) and the batch size
input_size = 224
batch_size = 1

# Create a random input tensor with the appropriate size
input_tensor = torch.randn(batch_size, 3, input_size, input_size).to(device)

# Calculate the number of FLOPs and parameters in the model
flops, params = profile(model, inputs=(input_tensor,))
print("Parameters: ",params)
# Convert FLOPs to GFLOPS (billions of floating-point operations per second)
gflops = flops / 1e9

print(f"Total FLOPs: {flops:.2f} FLOPs")
print(f"GFLOPS: {gflops:.2f} GFLOPS")

[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.dropout.Dropout'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.normalization.LayerNorm'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.container.Sequential'>.
[INFO] Register count_adap_avgpool() for <class 'torch.nn.modules.pooling.AdaptiveAvgPool2d'>.
Parameters:  21589632.0
Total FLOPs: 12745222272.00 FLOPs
GFLOPS: 12.75 GFLOPS


In [56]:
import torch
import torchvision.transforms as transforms
from torchvision.models import vision_transformer

In [59]:
from PIL import Image

# Load and preprocess image
image_path = 'input.jpg'
img = Image.open(image_path).convert('RGB')

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

input_img = transform(img).unsqueeze(0).to(device)  # Add batch dimension


In [60]:
import torch.nn.functional as F

with torch.no_grad():
    # Get attention map
    outputs = model(input_img)
    #print(outputs)
    attention_weights = outputs[0]  # List of attention weights from different layers


In [61]:
import matplotlib.pyplot as plt

layer_index = 5  # Choose the layer index
head_index = 0  # Choose the head index

attention_map = attention_weights[layer_index][0][head_index]  # Extract attention map

plt.imshow(attention_map, cmap='hot', interpolation='nearest')
plt.title(f'Attention Map - Layer {layer_index}, Head {head_index}')
plt.colorbar()
plt.show()


IndexError: invalid index of a 0-dim tensor. Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number

In [86]:
from vit_pytorch import visualize_attention_map

ImportError: cannot import name 'visualize_attention_map' from 'vit_pytorch' (C:\Users\KMU\anaconda3\envs\reid\lib\site-packages\vit_pytorch\__init__.py)

In [83]:
import torch
import torchvision.transforms as transforms
from transformers import ViTImageProcessor, ViTForImageClassification
from captum.attr import LayerGradCam, visualization
import matplotlib.pyplot as plt

In [85]:
# Load pre-trained ViT model
model_name = "google/vit-base-patch16-224-in21k"
model = ViTForImageClassification.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

# Load and preprocess image
image_path = 'input.jpg'
img = Image.open(image_path).convert('RGB')

input_img = feature_extractor(images=img, return_tensors="pt")

# Forward Pass
with torch.no_grad():
    outputs = model(**input_img)
    logits = outputs.logits

# Choose a layer for GradCAM visualization
layer_index = 5  # Choose the layer index

# GradCAM
gradcam = LayerGradCam(model, model.base_model.encoder.blocks[layer_index])
attr = gradcam.attribute(input_img, target=0)

# Visualize GradCAM attribute as heatmap
heatmap = visualization.ImageGrayscale(overlay=True)(attr)
plt.imshow(heatmap.squeeze(), cmap='viridis')
plt.axis('off')
plt.title(f'Attention Map - Layer {layer_index}')
plt.show()


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'ViTEncoder' object has no attribute 'blocks'

In [66]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
import cv2
from PIL import Image
import requests
#from functools import reduce

feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
pretrained_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', output_attentions=True)

# white shark (reference : https://kids.nationalgeographic.com/animals/fish/facts/great-white-shark)
url1 = 'https://i.natgeofe.com/k/d21630fa-3ab9-4e37-adea-c503629e49d4/great_white_smile.jpg'
# pelican (reference : https://www.dw.com/en/theres-more-to-the-pelican-than-a-pouch/g-50613921)
url2 = 'https://static.dw.com/image/50486701_303.jpg'
# tiger (reference : https://www.worldwildlife.org/species/tiger)
url3 = 'https://c402277.ssl.cf1.rackcdn.com/photos/18127/images/story_full_width/Medium_WW251528.jpg?1574444517' 


for url in [url1, url2, url3]:

  image = Image.open(requests.get(url, stream=True).raw)

  inputs = feature_extractor(images=image, return_tensors="pt")
  outputs = pretrained_model(**inputs)
  logits = outputs.logits
  attentions = outputs.attentions
  
  # model predicts one of the 1000 ImageNet classes
  predicted_class_idx = logits.argmax(-1).item()
  print("Predicted class:", pretrained_model.config.id2label[predicted_class_idx])

  # To account for residual connections, we add an identity matrix to the
  # attention matrix and re-normalize the weights.

  att_mat = torch.stack(attentions).squeeze(1)

  # attention 평균
  att_mat = reduce(att_mat, 'b h len1 len2 -> b len1 len2', 'mean')
  im = np.array(image)

  residual_att = torch.eye(att_mat.size(1))
  aug_att_mat = att_mat + residual_att
  aug_att_mat = aug_att_mat / aug_att_mat.sum(dim=-1).unsqueeze(-1)

  # Recursively multiply the weight matrices
  joint_attentions = torch.zeros(aug_att_mat.size())
  joint_attentions[0] = aug_att_mat[0]

  for n in range(1, aug_att_mat.size(0)):
      joint_attentions[n] = torch.matmul(aug_att_mat[n], joint_attentions[n-1])
      
  # Attention from the output token to the input space.
  v = joint_attentions[-1]
  grid_size = int(np.sqrt(aug_att_mat.size(-1)))
  mask = v[0, 1:].reshape(grid_size, grid_size).detach().numpy()
  mask = cv2.resize(mask / mask.max(), (im.shape[1], im.shape[0]))[..., np.newaxis]
  result = (mask * im).astype("uint8")

  fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(16, 16))

  ax1.set_title('Original')
  ax2.set_title('Attention Mask')
  ax3.set_title('Attention Map')
  _ = ax1.imshow(im)
  _ = ax2.imshow(mask.squeeze())
  _ = ax3.imshow(result)

Predicted class: great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias


TypeError: 'Tensor' object is not callable

# Attention map

In [8]:
# Load image
img = Image.open('input.jpg').to(device)

# Preprocess image
img = F.resize(img, (224, 224))
img = F.to_tensor(img)
img = F.normalize(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Forward pass
output = model(img.unsqueeze(0))

# Visualize attention map
attention_map = output[-1]
attention_map = torch.mean(attention_map, dim=1)
attention_map = torch.nn.functional.interpolate(attention_map.unsqueeze(1), size=(224, 224), mode='bilinear', align_corners=False)
attention_map = attention_map.squeeze(1).cpu().detach().numpy()

plt.imshow(attention_map)

AttributeError: to



### DataLoader

In [4]:
transform_query_list = [
    transforms.Resize((224,224), interpolation=3),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.RandomErasing(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
transform_gallery_list = [
    transforms.Resize(size=(224,224),interpolation=3), #Image.BICUBIC
    transforms.ToTensor(),
    transforms.RandomErasing(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
data_transforms = {
'query': transforms.Compose( transform_query_list ),
'gallery': transforms.Compose(transform_gallery_list),
}



In [5]:
image_datasets = {}
data_dir = "C:/Users/KMU/tensor/data/market/clean_data/"

image_datasets['query'] = datasets.ImageFolder(os.path.join(data_dir, 'query'),
                                          data_transforms['query'])
image_datasets['gallery'] = datasets.ImageFolder(os.path.join(data_dir, 'gallery'),
                                          data_transforms['gallery'])
query_loader = DataLoader(dataset = image_datasets['query'], batch_size=batch_size, shuffle=False )
gallery_loader = DataLoader(dataset = image_datasets['gallery'], batch_size=batch_size, shuffle=False)

class_names = image_datasets['query'].classes
print(len(class_names))

750


###  Extract Features

In [6]:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

In [7]:
def extract_feature(model,dataloaders):
    
    features =  torch.FloatTensor()
    count = 0
    idx = 0
    for data in tqdm(dataloaders):
        img, label = data
        img, label = img.to(device), label.to(device)

        output = model(img)

        n, c, h, w = img.size()
        
        count += n
        features = torch.cat((features, output.detach().cpu()), 0)
        idx += 1
    return features

In [8]:
# Extract Query Features
query_feature= extract_feature(model, query_loader)

# Extract Gallery Features
gallery_feature = extract_feature(model, gallery_loader)

  0%|          | 0/421 [00:00<?, ?it/s]

  0%|          | 0/1990 [00:00<?, ?it/s]

In [9]:
# Retrieve labels
gallery_path = image_datasets['gallery'].imgs
query_path = image_datasets['query'].imgs

gallery_cam,gallery_label = get_id(gallery_path)
query_cam,query_label = get_id(query_path)

## Concat Averaged GELTs

In [10]:
concatenated_query_vectors = []
for query in tqdm(query_feature):
   
    fnorm = torch.norm(query, p=2, dim=1, keepdim=True)*np.sqrt(14)
   
    query_norm = query.div(fnorm.expand_as(query))
    
    concatenated_query_vectors.append(query_norm.view((-1))) # 14*768 -> 10752

concatenated_gallery_vectors = []
for gallery in tqdm(gallery_feature):
   
    fnorm = torch.norm(gallery, p=2, dim=1, keepdim=True) *np.sqrt(14)
   
    gallery_norm = gallery.div(fnorm.expand_as(gallery))
    
    concatenated_gallery_vectors.append(gallery_norm.view((-1))) # 14*768 -> 10752
  

  0%|          | 0/3368 [00:00<?, ?it/s]

  0%|          | 0/15913 [00:00<?, ?it/s]

## Calculate Similarity using FAISS

In [11]:
import faiss
import numpy as np

In [12]:
#index = faiss.IndexIDMap(faiss.IndexFlatIP(10752))

index = faiss.IndexIDMap(faiss.IndexFlatIP(10752))
index2 = faiss.IndexIDMap(index)


encoded_data = np.asarray([t.numpy() for t in concatenated_gallery_vectors]).astype('float32')
print(encoded_data.shape)
ids = np.array(gallery_label)
print(ids.shape)
ids = np.asarray(ids.astype('int64'))
print(ids.shape)
#"IDMap,Flat"
index.add_with_ids(encoded_data,ids)
#print("encoded_data:", ids)

(15913, 2688)
(15913,)
(15913,)


AssertionError: 

In [14]:
#np.asarray(encoded_data).astype('float32')
def search(query: str, k=1):
    encoded_query = query.unsqueeze(dim=0).numpy()
    top_k = index.search(encoded_query, k)
    return top_k

In [15]:
rank1_score = 0
rank5_score = 0
rank10_score = 0
ap = 0
count = 0
for query, label in zip(concatenated_query_vectors, query_label):
    count += 1
    label = label
    output = search(query, k=10)
    rank1_score += rank1(label, output) 
    rank5_score += rank5(label, output) 
    rank10_score += rank10(label, output) 
    print("Correct: {}, Total: {}, Incorrect: {}".format(rank1_score, count, count-rank1_score), end="\r")
    ap += calc_map(label, output)

print("Rank1: {}, Rank5: {}, Rank10: {}, mAP: {}".format(rank1_score/len(query_feature), 
                                                         rank5_score/len(query_feature), 
                                                         rank10_score/len(query_feature), ap/len(query_feature)))    

AssertionError: 