In [2]:
#!pip install git+https://github.com/openai/CLIP.git

In [11]:
import torch
from tqdm import tqdm
import os
import clip
from PIL import Image
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
#from google.colab import drive

# Choose the compute device (CUDA when torch reports it as available, CPU
# otherwise), then load the CLIP "ViT-B/32" model together with the
# preprocessing callable that clip.load returns alongside it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [12]:
# Root directory that contains the `required_dataset/` folder.
# The original location stays the default; set the CLIP_DATA_PATH environment
# variable to run the notebook against a copy of the data somewhere else
# without editing this cell.
DATA_PATH = os.environ.get('CLIP_DATA_PATH', '/home/saurav/Documents')

# Product metadata table; one row per product, read into `df`.
csv_file = os.path.join(DATA_PATH, 'required_dataset', 'styles2.csv')
df = pd.read_csv(csv_file)

In [13]:
# --- Run configuration -------------------------------------------------------
# Folder holding the product images.
image_folder = f"{DATA_PATH}/required_dataset/images"

# Pickle files: the resumable checkpoint and the final result.
checkpoint_file = "clip_embeddings_checkpoint.pkl"
final_embeddings = "clip_embeddings_batch.pkl"

# Number of dataframe rows taken per batch, and the total number of rows.
batch_size = 8
total_rows = df.shape[0]

# In-memory store for the computed embeddings; starts out empty.
embeddings = dict()

In [14]:
#function to combine text features from multiple columns
def create_text_description(row):
  """Build one space-separated text description from a product row.

  Args:
    row: any object indexable by column name (for example a pandas Series
      yielded by ``DataFrame.iterrows()`` or a plain dict) that provides the
      columns listed in ``field_names`` below.

  Returns:
    str: the non-missing field values, in the order of ``field_names``,
    joined by single spaces. An empty string when every field is missing.

  Raises:
    KeyError: if ``row`` lacks one of the expected columns.
  """
  field_names = (
      'gender',
      'masterCategory',
      'subCategory',
      'articleType',
      'baseColour',
      'season',
      'year',
      'usage',
      'productDisplayName',
  )
  parts = []
  for name in field_names:
    value = row[name]
    # A missing cell (NaN/None) would otherwise be stringified into the
    # literal word "nan"/"None" and end up inside the description.
    if pd.isna(value):
      continue
    # A numeric column that contains NaN is float-typed, so whole numbers
    # arrive as e.g. 2011.0; render them without the spurious ".0".
    if isinstance(value, float) and value.is_integer():
      value = int(value)
    # str.join only accepts strings, so every remaining value is converted.
    parts.append(str(value))
  return ' '.join(parts)

In [15]:
# Resume support: when no checkpoint pickle is present start with an empty
# store; otherwise restore the store from the checkpoint. Either way the set of
# already-processed IDs is the set of keys of the store.
# NOTE: pickle.load executes whatever the file contains; only point
# checkpoint_file at files this notebook wrote itself.
if not os.path.exists(checkpoint_file):
    embeddings = {}
else:
    with open(checkpoint_file, 'rb') as checkpoint_handle:
        embeddings = pickle.load(checkpoint_handle)
processed_ids = set(embeddings)

In [16]:
# Write the current embeddings to the checkpoint. The target is taken from the
# checkpoint_file setting instead of repeating the file name as a literal, so
# the reader and the writer of the checkpoint cannot drift apart.
output_file = checkpoint_file

# Dump into a sibling temporary file first and swap it into place with
# os.replace: opening the real checkpoint with 'wb' would truncate it right
# away, so an interruption during the dump would destroy the previous checkpoint.
tmp_checkpoint = f"{output_file}.tmp"
with open(tmp_checkpoint, 'wb') as f:
    pickle.dump(embeddings, f)
os.replace(tmp_checkpoint, output_file)

In [17]:
# Re-read the checkpoint when it is present so that the in-memory store and the
# set of already-processed IDs reflect the file on disk. Nothing changes when
# the file does not exist.
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'rb') as checkpoint_handle:
        embeddings = pickle.load(checkpoint_handle)
    processed_ids = {key for key in embeddings}

In [18]:
#FOR THE EXTRACTION OF TEXT AND IMAGE EMBEDDING

# Number of newly encoded batches between two checkpoint writes. Every write
# pickles the whole `embeddings` dict, so this trades write cost against how
# much work is lost when the run is interrupted.
checkpoint_every = 200


def _write_pickle(obj, path):
    """Pickle `obj` to `path` through a temporary file and os.replace.

    The target is only replaced after the dump has finished, so an
    interruption during the write leaves the previous file at `path` intact
    instead of a truncated pickle.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'wb') as handle:
        pickle.dump(obj, handle)
    os.replace(tmp_path, path)


batches_since_checkpoint = 0

# Process the data in batches
for start_idx in tqdm(range(0, total_rows, batch_size)):
    end_idx = min(start_idx + batch_size, total_rows)
    batch = df.iloc[start_idx:end_idx]

    #filter out rows whose IDs have already been processed
    batch = batch[~batch['id'].astype(str).isin(processed_ids)]
    if batch.empty:
        # Every row of this slice is already in `embeddings` (resumed run):
        # skip tokenizing/encoding an empty batch.
        continue

    # IDs as strings, converted the same way as in the filter above, so the
    # keys stored in `embeddings` always match what the filter looks up.
    image_ids = batch['id'].astype(str).tolist()

    # Batch processing text descriptions
    text_descriptions = [create_text_description(row) for _, row in batch.iterrows()]
    # truncate=True shortens descriptions that exceed CLIP's context length
    # instead of raising and aborting the whole run.
    text_inputs = clip.tokenize(text_descriptions, truncate=True).to(device)

    with torch.no_grad():
        text_embeddings = model.encode_text(text_inputs).cpu().numpy()

    # Batch processing images (one entry per row; None when no embedding
    # could be computed for that row)
    image_embeddings = []
    for image_id in image_ids:
        image_path = os.path.join(image_folder, f"{image_id}.jpg")

        if not os.path.exists(image_path):
            print(f"Image {image_id}.jpg not found, skipping image embedding.")
            image_embeddings.append(None)
            continue

        try:
            # The context manager closes the file handle once the image has
            # been turned into a tensor.
            with Image.open(image_path) as pil_image:
                image = preprocess(pil_image).unsqueeze(0).to(device)
        except OSError as err:
            # Unreadable/corrupt file: treat it like a missing image rather
            # than losing the whole run.
            print(f"Image {image_id}.jpg could not be read ({err}), skipping image embedding.")
            image_embeddings.append(None)
            continue

        with torch.no_grad():
            # The result keeps the leading axis of size 1 added by unsqueeze(0).
            image_embedding = model.encode_image(image).cpu().numpy()
        image_embeddings.append(image_embedding)

    # Store embeddings for the current batch
    for image_id, text_embedding, image_embedding in zip(image_ids, text_embeddings, image_embeddings):
        embeddings[image_id] = {
            "text_embedding": text_embedding,
            "image_embedding": image_embedding
        }
        processed_ids.add(image_id)   #mark the image ID as processed

    # Periodically persist progress so an interrupted run can resume from the
    # checkpoint instead of starting over.
    batches_since_checkpoint += 1
    if batches_since_checkpoint >= checkpoint_every:
        _write_pickle(embeddings, checkpoint_file)
        batches_since_checkpoint = 0

# Bring the checkpoint up to date with anything encoded since the last
# periodic write.
if batches_since_checkpoint:
    _write_pickle(embeddings, checkpoint_file)

# Save embeddings to a pickle file (path taken from the final_embeddings setting)
output_file = final_embeddings
_write_pickle(embeddings, output_file)

print(f"Embeddings have been saved to {output_file}")

100%|████████████████████████████████████████████████████| 5556/5556 [02:12<00:00, 41.87it/s]


Embeddings have been saved to clip_embeddings_batch.pkl
