In [None]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

In [1]:
import torch 
import gc

# Release cached CUDA memory and run a garbage-collection pass before starting.
torch.cuda.empty_cache()
gc.collect()
# Last expression of the cell: displays whether a CUDA device is usable.
torch.cuda.is_available()

True

In [2]:
import numpy as np
# NOTE(review): `packaging` is not referenced in the cells visible here, and
# pkg_resources is deprecated by setuptools - confirm whether this import is still needed.
from pkg_resources import packaging

# Record the installed PyTorch version alongside the results.
print("Torch version:", torch.__version__)

Torch version: 2.2.1


In [3]:
import clip

# List the model names this CLIP package can load.
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [4]:
# Load the ViT-B/32 CLIP model together with its image preprocessing pipeline,
# move it to the GPU and switch it to inference mode.
model, preprocess = clip.load("ViT-B/32")
model.cuda()
model.eval()
# CPU alternative: model.cpu().eval()

input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar parameters across all weight tensors.
param_count = sum(int(np.prod(weights.shape)) for weights in model.parameters())

for label, value in (
    ("Model parameters:", f"{param_count:,}"),
    ("Input resolution:", input_resolution),
    ("Context length:", context_length),
    ("Vocab size:", vocab_size),
):
    print(label, value)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [5]:
# Display the image preprocessing pipeline returned by clip.load.
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x000001A838D76040>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

## Setting up input images and texts

In [11]:
import os
import pandas as pd
from PIL import Image,ImageFile
import numpy as np
from torchvision import transforms
import pickle

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Local locations of the sampled WikiArt images and the ArtEmis annotation CSV.
# NOTE(review): these are absolute, machine-specific paths - adjust them before running elsewhere.
images_dir = 'E:/Pitt/Spring 2024/CS 2002/wikiart/random_samples'
descriptions_csv = 'E:/Pitt/Spring 2024/CS 2002/artemis_official_data/official_data/artemis_dataset_release_v0.csv'

# Last path component of images_dir ('random_samples' for the path above).
painting_style = os.path.basename(images_dir)

# Load the textual descriptions from the CSV file into a pandas DataFrame
descriptions_df = pd.read_csv(descriptions_csv)


In [7]:
# Emotion labels, in the order used to build the text prompts below.
ARTEMIS_EMOTIONS = [
    'amusement', 'awe', 'contentment', 'excitement',
    'anger', 'disgust', 'fear', 'sadness',
    'something else',
]

# One natural-language prompt per emotion label, in the same order.
emotions = [f"A picture making me feel {label}" for label in ARTEMIS_EMOTIONS]

In [8]:
# Count the directory entries whose name ends in a recognised image extension.
total_images = len([
    entry for entry in os.listdir(images_dir)
    if entry.endswith(('.png', '.jpg', '.jpeg'))
])
print(total_images)

5235


In [14]:
import matplotlib.pyplot as plt
import random

# Preprocessed image tensors and the matching extension-less file names.
# The two lists are kept index-aligned: entry i of one describes entry i of the other.
processed_images = []
image_filenames_ext = []

# Keep only image files. '.jpeg' is accepted too so that this filter agrees
# with the image count computed earlier in the notebook.
image_filenames = [
    filename for filename in os.listdir(images_dir)
    if filename.endswith(('.png', '.jpg', '.jpeg'))
]

# Allow PIL to load truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

for filename in image_filenames:
    name = os.path.splitext(filename)[0]

    try:
        # Image.open is lazy and keeps the file handle open; the context manager
        # closes it as soon as convert() has produced an in-memory RGB copy.
        with Image.open(os.path.join(images_dir, filename)) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = preprocess(image)
    except IOError as e:
        print("Unable to load image:", e)
    else:
        # Record the name exactly once, and only for images that loaded, so
        # image_filenames_ext stays aligned with processed_images.
        processed_images.append(image_tensor)
        image_filenames_ext.append(name)

# The raw file list is no longer needed; image_filenames_ext holds the names
# of the images that were actually loaded.
del image_filenames


In [15]:
# Sanity check: number of preprocessed images, then number of description rows.
print(len(processed_images), len(descriptions_df), sep="\n")

5235
454684


## Building Features

In [16]:
# preprocess ends in ToTensor/Normalize, so every element of processed_images is
# already a torch tensor. torch.stack builds the batch in a single copy, instead
# of copying the whole dataset into a NumPy array and then again into a tensor.
image_input = torch.stack(processed_images).cuda()
# Tokenize the emotion prompts into fixed-length token id sequences for the text encoder.
text_tokens = clip.tokenize(emotions).cuda()

print(image_input.shape)
print(text_tokens.shape)

torch.Size([5235, 3, 224, 224])
torch.Size([9, 77])


In [17]:
# Work-around for a memory error: the per-image tensors have already been
# stacked into image_input, so drop the list and run the garbage collector.
del processed_images
gc.collect() 

0

In [None]:
# Number of images pushed through the image encoder per forward pass. Encoding
# the whole image set in one call needs activation memory proportional to the
# full dataset and can exhaust GPU memory; lower this value if memory is tight.
ENCODE_BATCH_SIZE = 256

with torch.no_grad():  # inference only, no gradients needed
    # Encode the images chunk by chunk and concatenate the per-chunk features
    # back into one (num_images, feature_dim) tensor, in the original order.
    image_features = torch.cat(
        [
            model.encode_image(image_batch).float()
            for image_batch in image_input.split(ENCODE_BATCH_SIZE)
        ]
    )
    # The handful of text prompts fits comfortably in a single pass.
    text_features = model.encode_text(text_tokens).float()

In [None]:
# The image features have been computed, so drop the reference to the stacked
# image batch and run the garbage collector.
del image_input
gc.collect()

In [None]:
# Scale every feature vector to unit L2 norm (in place), so that the dot
# products computed later are cosine similarities.
for features in (image_features, text_features):
    features /= features.norm(dim=-1, keepdim=True)

## Zero-Shot Classification

In [None]:
# Scaled similarity between every image embedding and every prompt embedding,
# turned into a probability distribution over the prompts for each image.
similarity_logits = (100.0 * image_features) @ text_features.T
text_probs = similarity_logits.softmax(dim=-1)

# Five most probable prompts (and their probabilities) per image, on the CPU.
top_probs, top_labels = text_probs.cpu().topk(5, dim=-1)