In [1]:
import os

# Environment variables are exported BEFORE torch / transformers are imported.
# The Hugging Face libraries resolve their cache locations when they are first
# imported, so setting HF_HOME / TRANSFORMERS_CACHE afterwards has no effect.
# The HSA_* / HIP_* values are GPU-runtime settings (presumably for a ROCm/AMD
# setup -- confirm for your machine); exporting them first guarantees they are
# visible before the GPU runtime is initialised.
os.environ['HF_HOME'] = os.getcwd()
os.environ['TRANSFORMERS_CACHE'] = os.getcwd()
os.environ['HSA_OVERRIDE_GFX_VERSION'] = '10.3.0'
os.environ['HIP_VISIBLE_DEVICES'] = '0'

import numpy as np
import pandas as pd
import torch
from PIL import Image
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import AutoImageProcessor, Swinv2ForImageClassification

In [2]:
# Seed every random number generator used in this notebook with the same value.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Request deterministic cuDNN behaviour and switch off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [3]:
class WaterbirdDataset(Dataset):
    """Dataset that reads images listed in a metadata DataFrame.

    Each row of ``df`` must provide the columns ``img_filename`` (path relative
    to ``root_dir``), ``y`` and ``place``.

    Args:
        df: Metadata table; rows are addressed positionally via ``iloc``.
        root_dir: Directory that ``img_filename`` values are joined onto.
        image_processor: Callable invoked as
            ``image_processor(img, return_tensors="pt")``; its result must
            expose a ``pixel_values`` attribute.
    """

    def __init__(self, df, root_dir, image_processor):
        self.root_dir = root_dir
        self.df = df
        self.image_processor = image_processor

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and preprocess the sample at positional index ``idx``.

        Returns:
            A tuple ``(img_filename, pixel_values, label, place)`` where
            ``pixel_values`` is whatever the image processor produced
            (presumably a batched tensor of shape (1, C, H, W) -- confirm
            against the processor in use).
        """
        # Fetch the row once instead of re-indexing the frame for every column.
        row = self.df.iloc[idx]
        img_filename = row['img_filename']

        # The context manager closes the underlying file as soon as the RGB
        # copy exists, so iterating the whole dataset does not leak handles.
        with Image.open(os.path.join(self.root_dir, img_filename)) as raw_img:
            img = raw_img.convert("RGB")

        inputs = self.image_processor(img, return_tensors="pt")

        label = row['y']
        place = row['place']

        return img_filename, inputs.pixel_values, label, place

In [4]:
# Pretrained SwinV2 backbone. The checkpoint ships a 1000-class head, so a new
# 2-class head is created instead (hence ignore_mismatched_sizes=True).
model_name = "microsoft/swinv2-base-patch4-window16-256"
image_processor = AutoImageProcessor.from_pretrained(model_name)
model = Swinv2ForImageClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
).to(DEVICE)

Some weights of Swinv2ForImageClassification were not initialized from the model checkpoint at microsoft/swinv2-base-patch4-window16-256 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Replace the model weights with the ones stored in a local checkpoint file.
# weights_only=True restricts unpickling to tensor data; map_location places
# the loaded tensors directly on DEVICE.
checkpoint_path = "/media/atiqur/Extra/Download/Waterbird/Swin-base/200_sample_1/swin_base_patch4_window16_256_augmented_200_sample_1.pth"
model.load_state_dict(
    torch.load(checkpoint_path, weights_only=True, map_location=DEVICE)
)

<All keys matched successfully>

In [6]:
class HuggingfaceToTensorModelWrapper(torch.nn.Module):
    """Adapter whose forward pass returns only the ``logits`` of the wrapped model.

    The wrapped model is expected to return an object with a ``logits``
    attribute when called with a single positional input.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        """Run the wrapped model on ``x`` and return its ``logits``."""
        outputs = self.model(x)
        return outputs.logits

In [7]:
# Read the metadata table and keep the rows whose `split` equals 2
# (presumably the test partition -- confirm against the dataset description).
root_dir = './datasets/waterbird'
df = pd.read_csv('./datasets/waterbird/metadata.csv')

# reset_index() renumbers the rows from 0 and keeps the old index as a column.
test_df = df.loc[df['split'] == 2].reset_index()

test_set = WaterbirdDataset(
    df=test_df,
    root_dir=root_dir,
    image_processor=image_processor,
)

In [8]:
# Layer whose activations and gradients Grad-CAM will use: the backbone's layernorm.
target_layers = [model.swinv2.layernorm]

def reshape_transform(tensor, height=8, width=8):
    """Turn a token sequence into a channels-first spatial map.

    Args:
        tensor: 3-D tensor laid out as (batch, height * width, channels).
        height: Number of rows in the token grid.
        width: Number of columns in the token grid.

    Returns:
        A view shaped (batch, channels, height, width).
    """
    batch_size, channels = tensor.size(0), tensor.size(2)
    grid = tensor.reshape(batch_size, height, width, channels)

    # Move channels from the last axis to axis 1, the layout CNN feature maps use.
    return grid.permute(0, 3, 1, 2)


# Wrap the Hugging Face model so Grad-CAM receives a plain logits tensor
# rather than the model's structured output object.
wrapped_model = HuggingfaceToTensorModelWrapper(model=model)
cam = GradCAM(
    model=wrapped_model,
    target_layers=target_layers,
    reshape_transform=reshape_transform,
)

In [9]:
# One list per output column; each processed sample appends one value to every list.
metadata = {
    'img_filename': [],
    'cam_filename': [],
    'y': [],
    'place': [],
    'prediction': []
}

saving_dir = '/media/atiqur/Extra/Download/Waterbird/Swin-base/Grad-Cam'
# Create the output folder up front so the first save cannot fail on a missing directory.
os.makedirs(saving_dir, exist_ok=True)

for i in tqdm(range(len(test_set))):
    torch.cuda.empty_cache()
    img_filename, inp, label, place = test_set[i]
    # Move the input to the device once; it is reused for prediction and for Grad-CAM.
    inp = inp.to(DEVICE)

    # Plain forward pass (no gradients) to record the model's predicted class.
    with torch.no_grad():
        logits = model(inp).logits
        pred = torch.argmax(logits, dim=1).cpu().item()

    # The CAM is computed for the ground-truth label, not for the prediction.
    target = [ClassifierOutputTarget(label)]

    # Background image for the overlay: RGB, 256x256, float32 scaled to [0, 1].
    # The context manager releases the file handle once the resized copy exists.
    with Image.open(os.path.join(root_dir, img_filename)) as raw_img:
        rgb_img = raw_img.convert('RGB').resize((256, 256))
    rgb_img = np.float32(rgb_img) / 255

    cam_out = cam(input_tensor=inp, targets=target)
    cam_out = cam_out[0, :]  # single-image batch -> take its only heat map
    visualization = show_cam_on_image(rgb_img, cam_out, use_rgb=True)

    # Prefix with the running index so files with equal base names do not collide.
    # os.path.basename avoids reusing the f-string's own quote character inside
    # the replacement field, which is a SyntaxError before Python 3.12.
    cam_filename = f'{i}_{os.path.basename(img_filename)}'
    metadata['img_filename'].append(img_filename)
    metadata['cam_filename'].append(cam_filename)
    metadata['y'].append(label)
    metadata['place'].append(place)
    metadata['prediction'].append(pred)

    img = Image.fromarray(visualization)
    img.save(os.path.join(saving_dir, cam_filename))

  0%|          | 0/5794 [00:00<?, ?it/s]

In [10]:
# Persist the per-image records next to the saved Grad-CAM overlays.
metadata_df = pd.DataFrame(metadata)
metadata_csv_path = os.path.join(saving_dir, 'metadata.csv')
metadata_df.to_csv(metadata_csv_path, index=False)