In [7]:
import os
import requests
import json

import clip
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers.optimization import get_linear_schedule_with_warmup

import numpy as np
import pandas as pd
from PIL import Image
import skimage.io as io
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from pycocotools.coco import COCO

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
# Show the model names known to the installed `clip` package.
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [4]:
# CLIP checkpoint used for every embedding in this notebook (output dim of 768).
model_name = "ViT-L/14"

# `clip.load` returns the network together with its matching image preprocessing callable.
clip_model, preprocess = clip.load(model_name, device=device)

# Place the model on the selected device and switch it to inference mode.
clip_model.to(device)
clip_model.eval();

In [8]:
def batch_encode(image_dir: str, output_dir: str = "../coco_embs", batch_size: int = 64) -> None:
    """Encode every image in ``image_dir`` with CLIP and save the embeddings to one ``.npy`` file.

    Relies on the notebook-level ``clip_model``, ``preprocess``, ``device`` and
    ``model_name`` defined in earlier cells.

    Args:
        image_dir: Directory whose entries are all opened as images.
        output_dir: Directory the ``.npy`` file is written to; created if missing.
        batch_size: Number of images pushed through ``clip_model.encode_image`` at once.

    Raises:
        ValueError: If ``image_dir`` contains no entries.

    Notes:
        Rows of the saved array follow the order returned by ``os.listdir``,
        which is arbitrary; no filename index is stored alongside the embeddings.
        All batch outputs are held in CPU memory until the final save.
    """
    # List the directory once: re-listing it for every batch is wasted work and
    # could yield inconsistent slices if the directory changes mid-run.
    filenames = os.listdir(image_dir)
    if not filenames:
        raise ValueError(f"No files found in {image_dir!r}; nothing to encode.")

    # Create the destination first so a bad path fails before the long encoding pass.
    os.makedirs(output_dir, exist_ok=True)

    image_tensors = []
    with torch.no_grad():
        for start in tqdm(range(0, len(filenames), batch_size)):
            images = []
            for filename in filenames[start:start + batch_size]:
                # The context manager closes the file handle once preprocessing
                # has produced the tensor.
                with Image.open(os.path.join(image_dir, filename)) as image:
                    images.append(preprocess(image))

            batch = torch.stack(images, dim=0).to(device)
            # Move each result to the CPU right away so GPU memory only holds one batch.
            image_tensors.append(clip_model.encode_image(batch).cpu())

    np.save(
        f"{output_dir}/coco_train2017_clip_{model_name.replace('/', '-')}_embs.npy",
        torch.cat(image_tensors, dim=0).numpy(),
        allow_pickle=True,
    )

In [9]:
# Encode the images under ../train2017 in batches of 512; the result is saved
# to the function's default output directory (../coco_embs).
batch_encode(image_dir="../train2017", batch_size=512)

  0%|          | 0/232 [00:00<?, ?it/s]

In [12]:
# Release cached GPU memory held by PyTorch's allocator now that encoding is done.
# No autograd work happens here, so a `torch.no_grad()` wrapper is unnecessary.
torch.cuda.empty_cache()