# Interacting with CLIP

This is a self-contained notebook that shows how to download and run CLIP models, calculate the similarity between arbitrary image and text inputs, and perform zero-shot image classifications.

In [None]:
import os

def get_free_path(path):
  """Return a filesystem path that does not exist yet.

  If nothing exists at `path`, it is returned unchanged. Otherwise the
  first free name of the form "<path>_01", "<path>_02", ... is returned.

  Args:
    path: Desired path as a string.

  Returns:
    `path` itself or the first numbered variant that is not taken.
  """
  candidate = path
  suffix = 0
  while os.path.exists(candidate):
    suffix += 1
    # Always derive the candidate from the original path; appending to the
    # previous candidate would stack suffixes ("x_01_02", "x_01_02_03", ...).
    candidate = "%s_%02d" % (path, suffix)
  return candidate

In [None]:
# Quick look at the zero-padded suffix format used by get_free_path.
print("_{:02d}".format(2))


# Parameters

In [None]:
# Root folder of the experiment data and the name of this run.
ipfs_path = "/content/ipfs"
experiment_name = "tiergarten_3"

# get_free_path returns a path that does not exist yet, so an earlier run
# with the same experiment name keeps its own folder.
experiment_path = get_free_path("/".join((ipfs_path, "video_to_clip", experiment_name)))
results_path = experiment_path + "/results"

# Number of classes requested from topk further down.
top_classes = 7

# The words only take effect if cifar is False (otherwise the 100 CIFAR categories will be used)
cifar = False
#words = ["Happiness", "Excitement", "Boredom", "Sadness","Anger", "Disgust", "Fear"] 
#words = ["baby","bed","bicycle","bottle","bowl","boy","bridge","bus","can","castle","chair","clock","cloud","couch","cup","girl","house","keyboard","lamp","man","motorcycle","mountain","pickup_truck","plate","road","rocket","skyscraper","streetcar","table","tank","telephone","television","tractor","train","wardrobe","woman"]
words = [
    "compassionate", "sympathetic", "dreamy", "bittersweet",
    "calm", "relaxing", "serene", "entrancing",
    "triumphant", "heroic", "energizing", "romantic",
    "loving", "transcendent", "mystical", "awe-inspiring",
    "amazing", "eerie", "mysterious", "joyful",
    "cheerful", "erotic", "euphoric", "ecstatic",
    "indignant", "defiant", "proud", "strong",
    "sad", "depressing", "tender", "longing",
]
print("Experiment Path:", experiment_path)

# Preparation for Colab


Make sure you're running a GPU runtime; if not, select "GPU" as the hardware accelerator in Runtime > Change Runtime Type in the menu. The next cells will print the CUDA version of the runtime if it has a GPU, and install PyTorch 1.7.1.

In [None]:
# Create the results folder for this experiment (including missing parents).
!mkdir -p $results_path
# Clone the pollinations repository into /content and switch its app folder to the dev branch.
%cd /content
!git clone https://github.com/voodoohop/pollinations.git
%cd /content/pollinations/app
!git pull
!git checkout dev
!git pull
# Install the app's npm dependencies, then install the package itself globally.
# NOTE(review): presumably the global install is what provides the `pollinate` command used below — confirm.
!npm install
!npm install -g
# Return to the directory we were in before entering the app folder.
%cd -
#!npm install forever -g

In [None]:
# Root of the IPFS folder (same value as `ipfs_path` in the Parameters cell)
# and the folder holding the extracted video frames.
IPFS_ROOT = "/content/ipfs"
DATA_PATH = IPFS_ROOT+"/tiergarten/frames_8fps"
# Count the .jpg and .png files in DATA_PATH; ls error messages (e.g. no file
# with one of the extensions) are discarded via 2>/dev/null.
!echo "Num input images:" `ls -l {DATA_PATH}/*.jpg {DATA_PATH}/*.png 2>/dev/null | wc -l`

In [None]:
from glob import glob
for file in glob(DATA_PATH+"/*.png"):
  print(file)
  !mogrify -resize 512 $file 

In [None]:
# Run the pollinate CLI a single time (--once) against the IPFS root folder, with
# debug logging enabled for every namespace (DEBUG=*).
# NOTE(review): the exact meaning of -s / --ipns is not visible in this notebook — confirm against the pollinate CLI help.
!DEBUG=* pollinate -p $IPFS_ROOT -s --once --ipns
#!echo /ipns/pollinations.ai | DEBUG=* pollinate -p $IPFS_ROOT -r --ipns--once


  [31;1mcontentCache [0mPersisting 1793 cached blocks. [31m+51ms[0m
  [32;1mipfsConnector [0madded QmVoRFWU7zpg8mQubTknugGpBCA36h4vMrogKbkVTELmC6 size Object [AsyncGenerator] {} [32m+128ms[0m
  [32;1mipfsConnector [0mcopying to /thomashmac5968/tiergarten/frames_8fps/frame_0813.png [32m+0ms[0m
  [31;1mcontentCache [0mAdding QmTNhKvmtEaCRRT5Yebr4pfhqignxAtpSizKd6gpcVXPJy to cache. [31m+42ms[0m
  [31;1mcontentCache [0mAdding QmabkzLEQUuY6UdhhWarqkCiozdMVKijjzNHxpekw3rqaV to cache. [31m+5ms[0m
  [31;1mcontentCache [0mAdding QmYaxGZabNni8PRgBMwDrduqDnrPxQvfQic3EieD9xy8xi to cache. [31m+18ms[0m
  [32;1mipfsConnector [0madded QmQG1EVRmaCHQdqfGAEDbPeMdANQsAFdPfys1WmfNdHGGQ size Object [AsyncGenerator] {} [32m+42ms[0m
  [32;1mipfsConnector [0mcopying to /thomashmac5968/tiergarten/frames_8fps/frame_0812.png [32m+0ms[0m
  [32;1mipfsConnector [0madded QmXd6ELiqKugk5xcJ78UbCn82Ku27561Tcj2myzGyzBYxp size Object [AsyncGenerator] {} [32m+5ms[0m
  [32;1mipf

In [None]:
#!mkdir -p /content/ipfs/tiergarten/frames_8fps/valence_ourasl
#!ffmpeg -i /content/ipfs/tiergarten-long.mp4 -r 8 /content/ipfs/tiergarten/frames_8fps/frame_%04d.png

In [None]:
#!ffmpeg -i "/content/ipfs/tiergarten/frames_8fps/"%*.png /content/valenceArousel.mp4


In [None]:
import subprocess

# Ask nvcc for its version banner and take the last word of the
# comma-separated field that starts with "release".
nvcc_banner = subprocess.check_output(["nvcc", "--version"]).decode("UTF-8")
release_fields = [field for field in nvcc_banner.split(", ") if field.startswith("release")]
CUDA_version = release_fields[0].split(" ")[-1]
print("CUDA version:", CUDA_version)

# Wheel suffix for each known CUDA release; any other release uses "+cu110".
torch_version_suffix = {
    "10.0": "+cu100",
    "10.1": "+cu101",
    "10.2": "",
}.get(CUDA_version, "+cu110")

In [None]:
# Install the pinned torch/torchvision builds for the detected CUDA suffix
# (looked up in the PyTorch wheel index given by -f), plus ftfy, regex and tqdm.
! pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex tqdm

In [None]:
import numpy as np
import torch

print("Torch version:", torch.__version__)

# Downloading the model

CLIP models are distributed as TorchScript modules.

In [None]:
# Download URLs of the available CLIP checkpoints, keyed by model name.
MODELS = dict(
    [
        ("RN50", "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt"),
        ("RN101", "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt"),
        ("RN50x4", "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt"),
        ("ViT-B/32", "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
    ]
)

In [None]:
# Download the ViT-B/32 checkpoint and store it as model.pt in the current directory.
! wget {MODELS["ViT-B/32"]} -O model.pt

In [None]:
# Load the TorchScript checkpoint, move it to the GPU and switch it to eval mode.
model = torch.jit.load("model.pt").cuda().eval()

# Read the model's hyper-parameters as plain Python values via .item().
input_resolution = model.input_resolution.item()
context_length = model.context_length.item()
vocab_size = model.vocab_size.item()

# Total number of elements over all parameter tensors.
parameter_count = np.sum([int(np.prod(param.shape)) for param in model.parameters()])

print("Model parameters:", f"{parameter_count:,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

# Image Preprocessing

We resize the input images and center-crop them to conform with the image resolution that the model expects. The pixel intensities are normalized later, when the image batch is built, using the dataset mean and standard deviation defined in this cell.



In [None]:
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image

# Resize, center-crop to the model's input resolution, and convert to a tensor.
preprocess = Compose(
    [
        Resize(input_resolution, interpolation=Image.BICUBIC),
        CenterCrop(input_resolution),
        ToTensor(),
    ]
)

# Per-channel mean and standard deviation, kept on the GPU for the
# normalization that is applied once the image batch has been built.
image_mean, image_std = (
    torch.tensor(channel_stats).cuda()
    for channel_stats in (
        [0.48145466, 0.4578275, 0.40821073],
        [0.26862954, 0.26130258, 0.27577711],
    )
)

# Text Preprocessing

We use a case-insensitive tokenizer. The tokenizer code is hidden in the second cell below.

In [None]:
# ftfy and regex are imported by the tokenizer cell below.
! pip install ftfy regex
# Download the gzipped BPE vocabulary; its file name is the default bpe_path of SimpleTokenizer.
! wget https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz -O bpe_simple_vocab_16e6.txt.gz

In [None]:
#@title

import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re


@lru_cache()
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by the BPE code.

    The BPE merges operate on unicode strings, so every one of the 256 byte
    values needs its own character. Bytes that are already printable,
    non-whitespace characters map to themselves; every remaining byte
    (whitespace, control characters, ...) is mapped to a code point starting
    at 256, so the BPE code never sees whitespace/control characters.
    Without such a table a significant share of the vocabulary would be
    needed just to cover raw unicode characters and avoid UNKs.

    Returns a dict from byte value (int) to a one-character string. The
    printable bytes come first, followed by the remapped bytes in increasing
    order. The result is cached, so every call returns the same dict object.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in printable}
    shifted = 0
    for byte in range(2**8):
        if byte in table:
            continue
        # Non-printable byte: assign the next unused code point above 255.
        table[byte] = chr(2**8 + shifted)
        shifted += 1
    return table


def get_pairs(word):
    """Collect every pair of adjacent symbols in a word.

    `word` is a sequence of symbols (a tuple of variable-length strings, or a
    plain string). The result is a set of (left, right) tuples; a word with a
    single symbol yields an empty set.
    """
    # Indexing the first symbol keeps the IndexError for an empty word.
    leading = word[0]
    following = word[1:]
    return set(zip((leading, *following), following))


def basic_clean(text):
    """Run ftfy.fix_text on `text`, decode HTML entities, and strip both ends."""
    cleaned = ftfy.fix_text(text)
    # Two unescape passes, so an entity that was escaped twice
    # (e.g. "&amp;amp;") is decoded completely.
    for _ in range(2):
        cleaned = html.unescape(cleaned)
    return cleaned.strip()


def whitespace_clean(text):
    """Collapse each run of whitespace into one space and strip both ends."""
    return re.sub(r'\s+', ' ', text).strip()


class SimpleTokenizer(object):
    """Byte-pair-encoding tokenizer driven by a gzipped file of merge rules.

    Text is cleaned and lower-cased, split into tokens with a regular
    expression, mapped byte-by-byte to unicode characters, and then merged
    into BPE symbols that are looked up in `self.encoder`.
    """

    def __init__(self, bpe_path: str = "bpe_simple_vocab_16e6.txt.gz"):
        """Build the vocabulary and merge ranks from the file at `bpe_path`."""
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Context manager so the gzip handle is closed as soon as the file is read.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split('\n')
        # The first line is skipped and a fixed number of merge rules is kept.
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        # Vocabulary order: byte characters, their end-of-word variants,
        # one entry per merge rule, then the two special tokens.
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        # Lower rank = merge rule that appears earlier in the file = applied first.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # The special tokens are pre-seeded so bpe() returns them unsplit.
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        """Apply the merge rules to one token.

        Returns the resulting symbols joined by single spaces; the last
        symbol carries the '</w>' end-of-word marker. Results are cached
        per token.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # Pick the adjacent pair with the lowest rank; pairs without a
            # rule rank as infinity, so the loop ends when none is mergeable.
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # tuple.index raises ValueError when `first` does not occur
                    # again; copy the remainder of the word unchanged.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Return the list of vocabulary ids for `text` (cleaned and lower-cased first)."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Map the token's UTF-8 bytes to their stand-in unicode characters.
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn vocabulary ids back into text; each '</w>' marker becomes a space."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text


# Setting up input images and texts

We are going to feed the video frames found in the input folder, together with one textual description per word from the Parameters section, to the model and compare the similarity between the corresponding features.

The tokenizer is case-insensitive, and we can freely give any suitable textual descriptions.

In [None]:
import os
import skimage
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

from collections import OrderedDict
import torch

%matplotlib inline
%config InlineBackend.figure_format = 'retina'



In [None]:

from tqdm.notebook import tqdm_notebook
images = []
#texts = [
plt.figure(figsize=(16, 16))
images_path = "/content/ipfs/tiergarten/frames_8fps"

filenames = os.listdir(images_path)
filenames.sort()
cols = 6
filenames = [filename for filename in filenames if filename.endswith(".png") or filename.endswith(".jpg")]
print("Preprocessing images...", len(filenames))
for filename in tqdm_notebook(filenames):
    i = len(images)
    #prop = i / len(images)
    name = os.path.splitext(filename)[0]

    try :
      image = preprocess(Image.open(os.path.join(images_path, filename)).convert("RGB"))
      images.append(image)
    
      #if i < cols*cols:
        
      #print(name)
      if i<cols*cols // 2:
        plt.subplot(6, 6, i+1)
        plt.imshow(image.permute(1, 2, 0))
        plt.title(f"{filename}\n")
        plt.xticks([])
        plt.yticks([])

        plt.tight_layout()
      
    except :
      print("Error reading image", name, "Deleting")
      !rm -v $filename

## Building features

We normalize the images, tokenize each text input, and run the forward pass of the model to get the image and text features.



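*   Images: stack the preprocessed frames into one batch and normalize them with the dataset mean and standard deviation.
*   Texts: build one prompt per word, tokenize it, and pad it to the model's context length.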



In [None]:
# With cifar enabled, the class names of CIFAR-100 replace the word list.
if cifar:
  from torchvision.datasets import CIFAR100
  cifar_dataset = CIFAR100(
      os.path.expanduser("~/.cache"),
      transform=preprocess,
      download=True,
  )
  words = cifar_dataset.classes

# One prompt per (lower-cased) label.
text_descriptions = [
    f"The image is an abstract symbolization of {label.lower()}"
    for label in words
]


In [None]:
# torch.stack joins the preprocessed image tensors directly into one batch,
# avoiding the tensor -> NumPy -> tensor round trip of torch.tensor(np.stack(...)).
image_input = torch.stack(images).cuda()
# Normalize per channel; [:, None, None] broadcasts the three channel values
# over the two spatial axes. In-place is safe because the batch was just created.
image_input -= image_mean[:, None, None]
image_input /= image_std[:, None, None]

In [None]:
# No earlier cell creates the tokenizer instance, so build it here
# (SimpleTokenizer reads the BPE vocabulary file from its default path).
tokenizer = SimpleTokenizer()
text_tokens = [tokenizer.encode(desc) for desc in text_descriptions]

In [None]:
sot_token = tokenizer.encoder['<|startoftext|>']
eot_token = tokenizer.encoder['<|endoftext|>']

# One row per prompt, zero-padded up to the model's context length.
text_input = torch.zeros(len(text_tokens), model.context_length, dtype=torch.long)
for row, prompt_tokens in enumerate(text_tokens):
    # Wrap each prompt in the start/end markers before writing it into its row.
    framed = [sot_token, *prompt_tokens, eot_token]
    text_input[row, :len(framed)] = torch.tensor(framed)

text_input = text_input.cuda()

In [None]:
# Encode the frames in fixed-size chunks, so a long frame sequence does not
# have to pass through the model in a single forward pass.
ENCODE_BATCH_SIZE = 256

with torch.no_grad():
    image_features = torch.cat(
        [
            model.encode_image(frame_batch).float()
            for frame_batch in image_input.split(ENCODE_BATCH_SIZE)
        ]
    )

## Calculating cosine similarity

We normalize the features and calculate the dot product of each pair.

# Zero-Shot Image Classification

You can classify images using the cosine similarity (times 100) as the logits to the softmax operation.

In [None]:
def get_probs(text_input):
  """Return, for every image, a probability distribution over the text prompts.

  Args:
    text_input: Token tensor that is passed to model.encode_text.

  Returns:
    Tensor with one row per image and one column per prompt: the softmax of
    100 times the cosine similarity between image and text features.
  """
  with torch.no_grad():
      text_features = model.encode_text(text_input).float()
      text_features /= text_features.norm(dim=-1, keepdim=True)
      # The image features are not normalized in any cell of this notebook;
      # without unit length the dot product below is not a cosine similarity.
      # A new tensor is used so the global image_features stays untouched.
      image_features_unit = image_features / image_features.norm(dim=-1, keepdim=True)

  text_probs = (100.0 * image_features_unit @ text_features.T).softmax(dim=-1)
  return text_probs

text_probs = get_probs(text_input)

In [None]:
from scipy import ndimage

def moving_average(x, w):
    """Average `x` over a window of `w` rows, separately for each column.

    Implemented as a convolution with a (w, 1) kernel of ones using
    ndimage.convolve's default boundary handling, divided by `w`.
    """
    column_kernel = np.ones((w, 1))
    window_sums = ndimage.convolve(x, column_kernel)
    return window_sums / w

def smooth_text_probs(text_probs, window=100):
  """Smooth the probabilities along the first axis and rescale every column.

  Args:
    text_probs: Tensor of probabilities; it is moved to the CPU before smoothing.
    window: Number of rows averaged by the moving average (default 100, the
      value that used to be hard-coded).

  Returns:
    NumPy array of the same shape in which every column has been divided by
    its own maximum, so each column peaks at 1.
  """
  text_probs_smoothed = moving_average(text_probs.cpu(), window)
  # Column-wise rescaling by the per-column maximum.
  return text_probs_smoothed / np.amax(text_probs_smoothed, axis=0)

# Average every class probability over all images (NumPy mean along axis 0),
# then keep the indices of the top_classes largest averages.
mean_class_probs = torch.from_numpy(text_probs.cpu().numpy().mean(0))
_, top_index = mean_class_probs.cpu().topk(top_classes, dim=-1)



In [None]:
text_probs_top = text_probs[:,top_index]


In [None]:

# Rank the pre-selected classes for every image.
top_probs, top_rank = text_probs_top.cpu().topk(top_classes, dim=-1)
# topk returns column positions inside text_probs_top; translate them through
# top_index so that top_labels holds indices into `words` (used for the tick
# labels when plotting). Without this the labels would always be words[0..k-1].
top_labels = top_index[top_rank]
top_probs = smooth_text_probs(top_probs)



In [None]:

print("\n".join(words))

#for index in top_labels:
#  print(words[index])
#print(top_probs.shape, top_labels.shape)
print(top_probs)

In [None]:
text_probs.shape


In [None]:


# These two arrays only ever existed as locals inside smooth_text_probs, so
# they are computed here; otherwise this cell raises a NameError on a fresh kernel.
text_probs_cpu = text_probs.cpu()
text_probs_smoothed = smooth_text_probs(text_probs)

# Raw versus smoothed probabilities of the first four classes.
# (The plt.clf() that used to precede plt.figure only produced an empty extra figure.)
plt.figure(figsize=(10,5))
plt.plot(text_probs_cpu[:,:4])
plt.plot(text_probs_smoothed[:,:4])
plt.show()
print(results_path)


In [None]:

from IPython.display import clear_output
from tqdm.notebook import tqdm

print(len(images))
image_no = 0

plt.style.use('dark_background')
#!rm -rv $results_path
!mkdir -p $results_path
#!mkdir -p /content/ipfs/tiergarten/frames_8fps/valence_arousal/
def plot_img(index):
  """Save a two-panel figure for image `index`: the image next to its class bars.

  The left panel shows the preprocessed image, the right panel a horizontal
  bar chart of top_probs[index] labelled with the words selected by
  top_labels[index]. The figure is written to
  "<results_path>/<experiment_name>_<index, 4 digits>.png" and then closed.
  """
  clear_output()
  plt.ioff()

  fig, (image_ax, bars_ax) = plt.subplots(1, 2, figsize=(8, 4))

  # Left panel: channels are moved last for imshow.
  image_ax.imshow(images[index].permute(1, 2, 0))
  image_ax.axis("off")

  # Right panel: one bar per class, first class at the top.
  bar_positions = np.arange(top_probs.shape[-1])
  bars_ax.grid()
  bars_ax.barh(bar_positions, top_probs[index])
  bars_ax.invert_yaxis()
  bars_ax.set_axisbelow(True)
  bars_ax.set_yticks(bar_positions)
  bars_ax.set_yticklabels([words[class_id] for class_id in top_labels[index].numpy()])
  bars_ax.set_xlabel("probability")

  fig.tight_layout()
  fig.savefig(f"{results_path}/{experiment_name}_{index:04}.png")
  plt.close(fig)
 
 
for i, image in enumerate(tqdm_notebook(images)):
 
    plot_img(i)
 
 
# plt.subplots_adjust(wspace=0.5)
# plt.show()

In [None]:
top_labels

In [None]:

import matplotlib
import matplotlib.pyplot as plt

import matplotlib.animation as animation


fps = 30
nSeconds = 5
snapshots = [ np.random.rand(5,5) for _ in range( nSeconds * fps ) ]

# First set up the figure, the axis, and the plot element we want to animate
fig = plt.figure( figsize=(8,8) )

a = snapshots[0]
im = plt.imshow(a, interpolation='none', aspect='auto', vmin=0, vmax=1)

def animate_func(i):
    """Show snapshot `i` in the image artist and return the updated artists."""
    starts_new_second = (i % fps == 0)
    if starts_new_second:
        # One progress dot per `fps` frames.
        print( '.', end ='' )

    im.set_array(snapshots[i])
    return [im]

anim = animation.FuncAnimation(
                               fig, 
                               animate_func, 
                               frames = nSeconds * fps,
                               interval = 1000 / fps, # in ms
                               )

anim.save('test_anim.mp4', fps=fps, extra_args=['-vcodec', 'libx264'])

print('Done!')


In [None]:
# Assemble the PNG plots saved in the results folder into output.mp4.
# NOTE(review): "%*.png" is presumably ffmpeg's glob-sequence wildcard, and no frame
# rate is passed, so ffmpeg's default input rate applies — confirm both against the 8 fps source.
!ffmpeg -i $results_path/%*.png $results_path/output.mp4
# Afterwards move the individual PNGs out of the results folder into /tmp.
!mv $results_path/*.png /tmp
