In [1]:
from google.colab import files
uploaded = files.upload()  # Upload kaggle.json here


Saving kaggle.json to kaggle.json


In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
!mkdir -p /content/data
!kaggle datasets download -d vikashrajluhaniwal/fashion-images -p /content/data
!unzip -q "/content/data/*.zip" -d /content/data/fashion-images


Dataset URL: https://www.kaggle.com/datasets/vikashrajluhaniwal/fashion-images
License(s): CC0-1.0
Downloading fashion-images.zip to /content/data
 85% 283M/335M [00:00<00:00, 876MB/s] 
100% 335M/335M [00:00<00:00, 810MB/s]


In [4]:
!pip install -q kaggle transformers datasets torch torchvision pillow faiss-cpu gradio pandas


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPModel, CLIPProcessor, get_linear_schedule_with_warmup


In [23]:
LR = 1e-5
NUM_EPOCHS = 5
WARMUP_RATIO = 0.05
BATCH_SIZE = 64   # GPU-friendly
BASE_DIR = "/content/data/fashion-images/data"


In [7]:
df = pd.read_csv("/content/data/fashion-images/data/fashion.csv")
# Build absolute paths
df["image_path"] = df.apply(
    lambda row: os.path.join(
        BASE_DIR,
        row["Category"],
        row["Gender"],
        "Images",
        "images_with_product_ids",
        row["Image"]
    ),
    axis=1
)

print(df.head())


   ProductId Gender Category SubCategory ProductType Colour   Usage  \
0      42419  Girls  Apparel     Topwear        Tops  White  Casual   
1      34009  Girls  Apparel     Topwear        Tops  Black  Casual   
2      40143  Girls  Apparel     Topwear        Tops   Blue  Casual   
3      23623  Girls  Apparel     Topwear        Tops   Pink  Casual   
4      47154  Girls  Apparel  Bottomwear      Capris  Black  Casual   

                                  ProductTitle      Image  \
0           Gini and Jony Girls Knit White Top  42419.jpg   
1                Gini and Jony Girls Black Top  34009.jpg   
2  Gini and Jony Girls Pretty Blossom Blue Top  40143.jpg   
3   Doodle Kids Girls Pink I love Shopping Top  23623.jpg   
4             Gini and Jony Girls Black Capris  47154.jpg   

                                            ImageURL  \
0  http://assets.myntassets.com/v1/images/style/p...   
1  http://assets.myntassets.com/v1/images/style/p...   
2  http://assets.myntassets.com/v1/ima

In [8]:
df = df[df["image_path"].apply(os.path.exists)].reset_index(drop=True)
print("Images available:", len(df))


Images available: 2906


In [9]:
df["caption"] = (
    "Gender: " + df["Gender"] + "; "
    "Color: " + df["Colour"] + "; "
    "Category: " + df["Category"] + "; "
    "ProductType: " + df["ProductType"] + "; "
    "SubCategory: " + df["SubCategory"] + "; "
    "Usage: " + df["Usage"]
)


In [10]:
class FashionDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open(row["image_path"]).convert("RGB")
        txt = row["caption"]
        return {"image": img, "text": txt}


In [11]:
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def collate_fn(batch):
    images = [b["image"] for b in batch]
    texts  = [b["text"]  for b in batch]

    encoding = processor(
        images=images,
        text=texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    return encoding


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [12]:
!pip install imagehash


Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imagehash
Successfully installed imagehash-4.3.2


In [13]:
import imagehash
from PIL import Image
def compute_phash(img_path):
    try:
        img = Image.open(img_path).convert("RGB")
        return str(imagehash.phash(img))
    except:
        return None

df["phash"] = df["image_path"].apply(compute_phash)
df = df.drop_duplicates(subset=["phash"]).reset_index(drop=True)
print("Unique images after pHash:", len(df))

Unique images after pHash: 2481


In [14]:
from sklearn.model_selection import train_test_split


# ======================================================
# 80/20 TRAIN–TEST SPLIT
# ======================================================
train_df, test_df = train_test_split(df,test_size=0.2,random_state=42,shuffle=True)

print("Train samples:", len(train_df))
print("Test samples:", len(test_df))

train_dataset = FashionDataset(train_df)
test_dataset  = FashionDataset(test_df)

train_loader = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_fn)
test_loader = DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=False,collate_fn=collate_fn)



Train samples: 1984
Test samples: 497


In [15]:
use_cuda = torch.cuda.is_available()
DEVICE = torch.device("cuda" if use_cuda else "cpu")

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(DEVICE)
for p in model.vision_model.parameters():
    p.requires_grad = False

print("Using:", DEVICE)


pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using: cuda


In [16]:
test = processor(text=["hello"], return_tensors="pt").to(DEVICE)
out = model.get_text_features(**test)
print("NaN in output:", torch.isnan(out).any())
print("Output sample:", out[0][:5])



NaN in output: tensor(False, device='cuda:0')
Output sample: tensor([ 0.2896,  0.0030, -0.0799,  0.2973, -0.1597], device='cuda:0',
       grad_fn=<SliceBackward0>)


In [17]:
for p in model.vision_model.parameters():
    p.requires_grad = False


In [18]:
trainable_params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.AdamW(trainable_params, lr=LR)
num_steps = NUM_EPOCHS * len(train_loader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    int(num_steps * WARMUP_RATIO),
    num_steps
)


In [21]:
from tqdm.auto import tqdm
import time

MAX_GRAD_NORM = 1.0

def train_epoch(epoch):
    model.train()
    total_loss = 0.0
    start = time.time()

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} TRAIN"):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}   # <<< FIX
        out = model(**batch, return_loss=True)
        loss = out.loss
        if torch.isnan(loss):
            print(" NaN detected — abort epoch")
            return
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable_params, MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    avg = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} avg loss: {avg:.4f}")

@torch.no_grad()
def validate():
    model.eval()
    total_loss = 0.0

    for batch in tqdm(test_loader, desc="Test"):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        out = model(**batch, return_loss=True)
        total_loss += out.loss.item()

    avg = total_loss / len(test_loader)
    print(f"Test Loss: {avg:.4f}")
    return avg



In [24]:
for epoch in range(NUM_EPOCHS):
    train_epoch(epoch)
    validate()


Epoch 1 TRAIN:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch 1 avg loss: 1.1019


Test:   0%|          | 0/8 [00:00<?, ?it/s]

Test Loss: 1.2498


Epoch 2 TRAIN:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch 2 avg loss: 1.0561


Test:   0%|          | 0/8 [00:00<?, ?it/s]

Test Loss: 1.2366


Epoch 3 TRAIN:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch 3 avg loss: 0.9972


Test:   0%|          | 0/8 [00:00<?, ?it/s]

Test Loss: 1.2442


Epoch 4 TRAIN:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch 4 avg loss: 0.9397


Test:   0%|          | 0/8 [00:00<?, ?it/s]

Test Loss: 1.2274


Epoch 5 TRAIN:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch 5 avg loss: 0.9196


Test:   0%|          | 0/8 [00:00<?, ?it/s]

Test Loss: 1.2333


In [25]:
from torch.utils.data import DataLoader
@torch.no_grad()
def compute_image_embeddings(model, processor, df):
    model.eval()
    all_embs = []
    paths = df["image_path"].tolist()
    loader = DataLoader(paths, batch_size=32, shuffle=False)
    for batch in loader:
        imgs = [Image.open(p).convert("RGB") for p in batch]
        enc = processor(images=imgs, return_tensors="pt").to(DEVICE)
        # NO .half() here
        feats = model.get_image_features(**enc)
        # ensure FP32 and normalize
        feats = feats.float()
        feats = feats / feats.norm(dim=-1, keepdim=True)
        all_embs.append(feats.cpu().numpy())
    return np.vstack(all_embs)
image_embs = compute_image_embeddings(model, processor, df)
print("image_embs:", image_embs.shape)


image_embs: (2481, 512)


In [26]:
@torch.no_grad()
def compute_text_embeddings(model, processor, df):
    model.eval()
    all_embs = []
    caps = df["caption"].tolist()
    loader = DataLoader(caps, batch_size=32, shuffle=False)
    for batch in loader:
        enc = processor(
            text=batch,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(DEVICE)
        feats = model.get_text_features(
            input_ids=enc["input_ids"],
            attention_mask=enc["attention_mask"])
        feats = feats.float()
        feats = feats / feats.norm(dim=-1, keepdim=True)
        all_embs.append(feats.cpu().numpy())
    return np.vstack(all_embs)
text_embs = compute_text_embeddings(model, processor, df)
print("text_embs:", text_embs.shape)


text_embs: (2481, 512)


In [27]:
print("NaN in image_embs:", np.isnan(image_embs).any())
print("NaN in text_embs :", np.isnan(text_embs).any())


NaN in image_embs: False
NaN in text_embs : False


In [28]:
np.save("image_embs.npy", image_embs)
np.save("text_embs.npy", text_embs)
df.to_csv("metadata.csv", index=False)


In [29]:
!pip install -q faiss-cpu gradio


In [30]:
import faiss
import numpy as np

image_embs_f32 = image_embs.astype("float32")
D = image_embs_f32.shape[1]
index = faiss.IndexFlatIP(D)
index.add(image_embs_f32)
print("FAISS index size:", index.ntotal)




FAISS index size: 2481


In [41]:
from typing import List
from PIL import Image

@torch.no_grad()
@torch.no_grad()
def encode_text_query(query: str) -> np.ndarray:
    model.eval()
    enc = processor(
        text=[query],
        return_tensors="pt",
        truncation=True,
        padding=True
    )
    enc = {k: v.to(DEVICE) for k, v in enc.items()}
    feats = model.get_text_features(
        input_ids=enc["input_ids"],
        attention_mask=enc["attention_mask"])
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.cpu().numpy().astype("float32")



def search_text_to_image(query: str, top_k: int = 5):
    q = encode_text_query(query)
    scores, idxs = index.search(q, top_k * 10)  # fetch more

    seen = set()
    results = []

    for score, idx in zip(scores[0], idxs[0]):
        row = df.iloc[idx]
        img_hash = row["phash"]

        if img_hash in seen:
            continue

        seen.add(img_hash)
        results.append((row["image_path"], row["caption"], float(score)))

        if len(results) == top_k:
            break

    return results


# quick test
results = search_text_to_image("women red sneakers", top_k=5)
for p, t, s in results:
    print(s, "->", t, "|", p)


0.29276740550994873 -> Gender: Men; Color: Red; Category: Footwear; ProductType: Casual Shoes; SubCategory: Shoes; Usage: Casual | /content/data/fashion-images/data/Footwear/Men/Images/images_with_product_ids/39742.jpg
0.2835327982902527 -> Gender: Women; Color: Red; Category: Footwear; ProductType: Casual Shoes; SubCategory: Shoes; Usage: Casual | /content/data/fashion-images/data/Footwear/Women/Images/images_with_product_ids/13555.jpg
0.28059205412864685 -> Gender: Men; Color: Red; Category: Footwear; ProductType: Sports Shoes; SubCategory: Shoes; Usage: Sports | /content/data/fashion-images/data/Footwear/Men/Images/images_with_product_ids/8970.jpg
0.2804362177848816 -> Gender: Men; Color: Orange; Category: Footwear; ProductType: Casual Shoes; SubCategory: Shoes; Usage: Casual | /content/data/fashion-images/data/Footwear/Men/Images/images_with_product_ids/20895.jpg
0.27982744574546814 -> Gender: Men; Color: Red; Category: Footwear; ProductType: Casual Shoes; SubCategory: Shoes; Usage

In [33]:
# Cosine similarity matrix (N x N)
sims = text_embs @ image_embs.T   # both are normalized
print("Similarity matrix:", sims.shape)


Similarity matrix: (2481, 2481)


In [34]:
import numpy as np

# ==========================
# 1. Similarity Matrix
# ==========================
sims = text_embs @ image_embs.T   # (N x N)
print("Similarity matrix computed:", sims.shape)


# ==========================
# 2. Helper Functions
# ==========================
def compute_ranks(sim_matrix):
    N = sim_matrix.shape[0]
    ranks = []

    for i in range(N):
        order = np.argsort(-sim_matrix[i])   # descending
        rank = int(np.where(order == i)[0][0])
        ranks.append(rank)

    return np.array(ranks)


def recall_at_k(ranks, k):
    return np.mean(ranks < k)


def mean_average_precision(ranks):
    # Only one relevant item → AP = 1/(rank+1)
    ap = 1.0 / (ranks + 1)
    return float(ap.mean())


# ==========================
# 3. Compute All Metrics
# ==========================
ranks = compute_ranks(sims)

R1  = recall_at_k(ranks, 1)
R5  = recall_at_k(ranks, 5)
R10 = recall_at_k(ranks, 10)

MedR = np.median(ranks)
MnR  = np.mean(ranks)
mAP  = mean_average_precision(ranks)


# ==========================
# 4. PRINT METRIC REPORT
# ==========================


print(f"Total samples evaluated: {len(ranks)}\n")

print(f" Recall@1   : {R1:.4f}")
print(f" Recall@5   : {R5:.4f}")
print(f" Recall@10  : {R10:.4f}\n")
print(f" Median Rank: {MedR:.2f}")
print(f" Mean Rank  : {MnR:.2f}\n")
print(f" mAP        : {mAP:.4f}")



Similarity matrix computed: (2481, 2481)
Total samples evaluated: 2481

 Recall@1   : 0.0758
 Recall@5   : 0.2507
 Recall@10  : 0.3841

 Median Rank: 15.00
 Mean Rank  : 32.94

 mAP        : 0.1748


In [35]:
def gradio_search(query: str, top_k: int = 5):
    res = search_text_to_image(query, top_k=top_k)
    images = [r[0] for r in res]  # image paths
    captions = [f"{r[1]} (score={r[2]:.3f})" for r in res]
    return images, "\n\n".join(captions)


In [48]:
@torch.no_grad()
def encode_single_image(path: str) -> np.ndarray:
    img = Image.open(path).convert("RGB")
    enc = processor(images=[img], return_tensors="pt").to(DEVICE)
    if use_cuda:
        enc["pixel_values"] = enc["pixel_values"].half()
    feats = model.get_image_features(**enc)
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.cpu().numpy().astype("float32")


def search_text_to_image(query: str, top_k):
    q = encode_text_query(query)
    scores, idxs = index.search(q, top_k * 3)  # fetch more

    seen = set()
    results = []

    for score, idx in zip(scores[0], idxs[0]):
        row = df.iloc[idx]
        img_path = row["image_path"]

        if img_path in seen:
            continue  # skip duplicates

        seen.add(img_path)
        results.append((img_path, row["caption"], float(score)))

        if len(results) >= top_k:
            break

    return results


In [49]:
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("##  Fashion CLIP Search (Fine-tuned)")

    with gr.Row():
        query = gr.Textbox(label="Text query", value="girls pink top", lines=1)
        topk  = gr.Slider(label="Top K",value=5, step=1)

    search_btn = gr.Button("Search")

    # Fixed gallery (no .style())
    gallery = gr.Gallery(
        label="Results",
        columns=5,
        height="auto"
    )

    captions_box = gr.Textbox(
        label="Captions & Scores",
        lines=10
    )

    search_btn.click(
        fn=gradio_search,
        inputs=[query, topk],
        outputs=[gallery, captions_box]
    )

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bb10954a0a42dbcf73.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


AttributeError: module 'gradio' has no attribute 'blocks'