<a href="https://colab.research.google.com/github/nikhil9302/supreme-CLIP/blob/development/data/embed_CLIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing the clip module from OpenAI/CLIP repository

In [None]:
! pip install git+https://github.com/openai/CLIP.git

# Extracting and preparing the datasets (the same steps as in datasets.ipynb)

In [None]:
import pandas as pd
import glob
from torchvision.datasets.utils import download_url
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import requests
import urllib.request
from io import BytesIO

In [None]:
# Unsplash "lite" research dataset archive, downloaded into the working directory.
dataset_url = "https://unsplash.com/data/lite/latest"
download_url(url=dataset_url, root='.')

Downloading https://unsplash-datasets.s3.amazonaws.com/lite/latest/unsplash-research-dataset-lite-latest.zip to ./latest


  0%|          | 0/632351052 [00:00<?, ?it/s]

In [None]:
!unzip '/content/latest' -d '/content'

In [None]:
# Directory that holds the extracted TSV files, and the tables to load from it.
path = './'
documents = ['photos', 'keywords']
datasets = {}

for doc in documents:
    # A table may be split over several files ("<doc>.tsv" plus any suffix).
    # glob returns matches in arbitrary order, so sort them to keep the row
    # order of the concatenated frame reproducible between runs.
    files = sorted(glob.glob(path + doc + ".tsv*"))
    if not files:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate".
        raise FileNotFoundError(
            "No files matching '" + path + doc + ".tsv*' were found; "
            "download and unzip the dataset first."
        )

    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]

    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

In [None]:
# Number of photos (taken from the top of the photos table) to embed.
NUM_PHOTOS = 25000

# Take the first NUM_PHOTOS rows by position; a table with fewer rows simply
# yields shorter lists instead of raising a KeyError part-way through a loop.
photos_head = datasets['photos'].iloc[:NUM_PHOTOS]

# Parallel lists: url[k] and photoID[k] describe the same photo.
url = photos_head['photo_image_url'].tolist()
photoID = photos_head['photo_id'].tolist()

### Binary search (`BinSearch`) is used here instead of the built-in `index()` function, because it gives a much better execution time on the sorted list of photo IDs

In [None]:
# takes 28 to 30 sec
from bisect import bisect_left
def BinSearch(a, x):
   """Locate ``x`` in the sorted sequence ``a``.

   Returns the index of the leftmost element equal to ``x``, or -1 when
   ``x`` does not occur in ``a``.
   """
   pos = bisect_left(a, x)
   # bisect_left gives the insertion point; it is a hit only if that slot
   # exists and actually holds x.
   if pos == len(a) or a[pos] != x:
      return -1
   return pos

# Pair every keyword with its photo id and sort the pairs by id. The ids and
# the keywords must be reordered together: an index found in the sorted id
# list is only meaningful for a keyword list that is in the same order.
# sorted() is stable, so keywords of one photo keep their original order.
keyword_rows = sorted(
    zip(datasets['keywords'].photo_id, datasets['keywords'].keyword),
    key=lambda row: row[0],
)
key_photoID = [photo_id for photo_id, _ in keyword_rows]
sorted_keywords = [keyword for _, keyword in keyword_rows]

# photoID_tag[k] is the list of keywords for photoID[k] (empty if it has none).
photoID_tag = {}
for i in range(len(photoID)):
    # BinSearch returns the leftmost row of this photo (or -1); walk forward
    # over the run of rows that share the same id.
    ind = BinSearch(key_photoID, photoID[i])
    tags = []
    while ind != -1 and ind < len(key_photoID) and key_photoID[ind] == photoID[i]:
        tags.append(sorted_keywords[ind])
        ind += 1
    photoID_tag[i] = tags

# Extracting the image-feature tensor from the CLIP model and adding it to a dictionary together with the other attributes extracted from the datasets


In [None]:
import torch
import clip
from torch.utils.data import Dataset, DataLoader

class DatasetFeeder(Dataset):
    """Map-style dataset that downloads an image by URL and encodes it with a model.

    Args:
        url: Sequence of image URLs; item ``idx`` is built from ``url[idx]``.
        preprocess: Callable applied to the opened PIL image; its result must
            support ``.unsqueeze(0).to(device)``.
        model: Object exposing ``encode_image(batch)``.
        photoID: Despite the name, the per-item tag lookup: ``photoID[idx]``
            is stored under ``"tags"`` in the returned item.
        device: Optional device the image batch is moved to. When omitted it
            is taken from the model's first parameter (CPU if there is none).
    """

    # Seconds to wait on the image server before giving up on one download.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, preprocess, model, photoID, device=None):
        super().__init__()
        self.url = url
        self.preprocess = preprocess
        # Keep the collaborators on the instance instead of reading notebook globals.
        self.model = model
        self.tags = photoID
        self.device = device if device is not None else self._infer_device(model)

    @staticmethod
    def _infer_device(model):
        """Return the device of the model's first parameter, or CPU if unavailable."""
        parameters = getattr(model, "parameters", None)
        first = next(iter(parameters()), None) if callable(parameters) else None
        return first.device if first is not None else torch.device("cpu")

    def __len__(self):
        """Number of URLs, i.e. number of items."""
        return len(self.url)

    def __getitem__(self, idx):
        """Return the record for item ``idx``, or ``-1`` if its image could not be prepared.

        Returns:
            A dict with keys ``"image_url"``, ``"image_features"`` (the
            output of ``model.encode_image``) and ``"tags"``; or the
            sentinel ``-1`` when downloading, decoding or preprocessing failed.

        Raises:
            IndexError: if ``idx`` is out of range for the URL sequence.
            ValueError: if no ``preprocess`` callable was supplied.
        """
        # Look the URL up outside the try block: an out-of-range index must
        # surface as IndexError (this is what ends plain iteration over the
        # dataset) rather than being mistaken for a failed download.
        image_url = self.url[idx]

        if self.preprocess is None:
            raise ValueError("DatasetFeeder requires a preprocess callable")

        try:
            response = requests.get(image_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            image = self.preprocess(
                Image.open(BytesIO(response.content))
            ).unsqueeze(0).to(self.device)
        except Exception:
            # Best effort: a broken or unreachable image is skipped by the
            # caller via the -1 sentinel. KeyboardInterrupt is not caught.
            return -1

        # Inference only; no gradients needed.
        with torch.no_grad():
            image_features = self.model.encode_image(image)

        return {
            "image_url": image_url,
            "image_features": image_features,
            "tags": self.tags[idx],
        }

from tqdm import tqdm

# Use the GPU when one is available; CLIP is loaded directly onto that device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# The fourth argument is the per-photo tag lookup built earlier.
dataset_feeder = DatasetFeeder(url, preprocess, model, photoID_tag)

# photo id -> record returned by the dataset for that photo.
parent_dict = {}
# Index explicitly over the dataset's own length so the loop never asks for an
# item past the end and does not depend on a hard-coded photo count.
for j in tqdm(range(len(dataset_feeder))):
    item = dataset_feeder[j]
    # -1 is the dataset's sentinel for an image that could not be fetched/decoded.
    if item != -1:
        parent_dict[photoID[j]] = item




100%|██████████| 25000/25000 [7:59:27<00:00,  1.15s/it]


### Since we want a data.json file for the NoSQL database, we have to convert every object that is not JSON serializable into a serializable one, i.e. the image_features tensor has to be converted to a list. Since NumPy is faster, we first convert the tensor to a NumPy array

In [None]:
import numpy
for record in parent_dict.values():
    features = record['image_features']
    # Convert only values that are still tensors, so re-running this cell
    # (when the values are already NumPy arrays) is a harmless no-op.
    if torch.is_tensor(features):
        # .cpu() first: a tensor on the GPU cannot be turned into a NumPy array directly.
        record['image_features'] = features.cpu().numpy()

### The parent_dict is pickled as "data.p" for later use
Extracting the CLIP embeddings (download, preprocessing and encoding) takes a long time (roughly 8 hours), so to avoid losing the extracted parent_dict (with the CLIP embeddings and the other features), it was pickled and then saved to my Google Drive as a public file

In [None]:
import pickle

# Persist the computed records. parent_dict must not be reassigned before this
# point, otherwise an empty placeholder would be written instead of the data.
# The context manager flushes and closes the file.
with open("data.p", "wb") as pickle_file:
    pickle.dump(parent_dict, pickle_file)

# Downloading the data.p file from Google Drive
### If you only want to produce the data.json file, run just the cells from this point onward and skip the earlier cells of this Colab notebook

In [None]:
!gdown --id 1jqNk4GOD4vGS7kKj03CXCMZpVkFD0Wlp

Downloading...
From: https://drive.google.com/uc?id=1jqNk4GOD4vGS7kKj03CXCMZpVkFD0Wlp
To: /content/data.p
70.3MB [00:00, 151MB/s] 


## Loading the pickle "data.p" back into parent_dict

In [None]:
# Imported here as well so the notebook can be run starting from this section.
import pickle

# NOTE: unpickling can execute arbitrary code; only load a data.p obtained
# from a source you trust.
with open("data.p", "rb") as pickle_file:
    parent_dict = pickle.load(pickle_file)

In [None]:
# Spot-check one record: show the first row of its stored image features.
sample_entry = parent_dict['--2IBUMom1I']
sample_entry['image_features'][0]

## In the extracted dictionary, parent_dict[i]['image_features'][0] is a NumPy array; it has to be converted to a list to be JSON serializable

In [None]:
for record in parent_dict.values():
    # Rebuild image_features as plain nested lists of strings and rebind the
    # key. Assigning into the existing container is not enough: if it is a
    # NumPy array, the assigned values are cast back into the array and the
    # result is still not JSON serializable.
    # Works whether image_features is a 2-D array, a list holding an array,
    # or already a list of lists of strings (so the cell can be re-run).
    record['image_features'] = [
        [str(value) for value in row]
        for row in record['image_features']
    ]

In [None]:
# Preview a single record rather than echoing every entry, which would flood
# the cell output with all feature values.
dict(list(parent_dict.items())[:1])

### Now that the entire parent_dict is JSON serializable, we can use json.dump to write the parent_dict dictionary to the JSON file "data.json"

In [None]:
import json

# Serialise the dictionary once, keep the text in json_object, and write that
# same text to disk.
json_object = json.dumps(parent_dict, indent = 4)
with open("data.json", "w") as outfile:
    outfile.write(json_object)

Now you can see data.json in your Colab root directory; you can download it to your PC or store it in your cloud storage or database