# Image Feature Pair Extraction - CLIP, ResNet18
conda activate clip


clip_image_features_list (118287, 512)
target_image_features_list (118287, 512)
clip_image_features_list (5000, 512)
target_image_features_list (5000, 512)

Feature extraction complete in 6m 16s

In [1]:
import numpy as np
import torch
import pickle
import time
print("Torch version:", torch.__version__)

assert torch.__version__.split(".") >= ["1", "7", "1"], "PyTorch 1.7.1 or later is required"

import os
import matplotlib.pyplot as plt
from collections import OrderedDict
import torch

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Torch version: 1.7.1


# Load CLIP

In [2]:
import clip

# Show the model names the installed `clip` package reports as available.
clip.available_models()

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']

In [3]:
# Load the ViT-B/32 CLIP checkpoint together with its image preprocessing transform.
model, preprocess = clip.load("ViT-B/32")
model.cuda().eval()  # move to the GPU and switch to evaluation mode

# Architecture constants exposed by the loaded model.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Report the total number of scalar parameters, then the constants above.
parameter_count = sum(p.numel() for p in model.parameters())
print("Model parameters:", f"{parameter_count:,}")
for label, value in (
    ("Input resolution:", input_resolution),
    ("Context length:", context_length),
    ("Vocab size:", vocab_size),
):
    print(label, value)

# Alias under which the later cells refer to the model.
clip_model = model

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [4]:
# Recover the final learned temperature tau from the model's logit scale:
#   logit_scale = log(1 / tau)
#   1 / tau     = exp(logit_scale)
#   tau         = 1 / exp(logit_scale)
logit_scale = clip_model.logit_scale.detach().cpu().item()
inverse_temperature = np.exp(logit_scale)

print('logit_scale', logit_scale)
print('temperature', 1/inverse_temperature)
print('1/temperature', inverse_temperature)

logit_scale 4.605170249938965
temperature 0.009999999360491285
1/temperature 100.00000639508755


In [5]:
# Inspect the type of the preprocessing object returned by clip.load.
type(preprocess)

torchvision.transforms.transforms.Compose

# Load Data

In [6]:
import torchvision
from torch.utils.data import DataLoader


# Image-folder dataset rooted at ./dummy_val; every image is passed through
# the `preprocess` transform that came with the CLIP model.
coco_val_dataset = torchvision.datasets.ImageFolder(root='./dummy_val', transform=preprocess)

In [7]:
# Unshuffled batches of 64, so extracted feature rows follow the dataset's order.
coco_val_dataloader = DataLoader(
    coco_val_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
)

# Extractor loop


In [8]:
since = time.time()
dataloaders = {
    'val': coco_val_dataloader,
}

# Create the output directory up front so a missing folder cannot make the
# final pickle dump fail after the whole extraction has already run.
os.makedirs('features', exist_ok=True)

clip_model.eval()   # Set model to evaluate mode, for extraction

# Extract CLIP image features for every split registered in `dataloaders`
# and dump them to features/feature_dump_<phase>.pkl.
for phase, dataloader in dataloaders.items():

    ##################################
    # Fields to be stored for postprocessing
    ##################################
    clip_image_features_list = []

    # Iterate over data. The second element of each batch is not used for
    # image-feature extraction, so it is not copied to the GPU.
    # TODO: add text here
    for inputs, _labels in dataloader:
        image_input = inputs.cuda(non_blocking=True)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            clip_image_features = clip_model.encode_image(image_input).float()

        ##################################
        # Evaluation book-keeping Field
        ##################################
        clip_image_features_list.append(clip_image_features.cpu().numpy())

    ##################################
    # Evaluation book-keeping Field
    ##################################
    # Stack the per-batch arrays into a single array along the first axis.
    clip_image_features_list = np.concatenate(clip_image_features_list, axis=0)
    print('clip_image_features_list', clip_image_features_list.shape)

    dump_result_dict = {
        "clip_image_features_list": clip_image_features_list,
    }
    with open(os.path.join('features', 'feature_dump_{}.pkl'.format(phase)), "wb") as pkl_file:
        pickle.dump(dump_result_dict, pkl_file)

print()

time_elapsed = time.time() - since
print('Feature Extraction completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

clip_image_features_list (10954, 512)

Feature Extraction completed in 0m 12s
