In [5]:
import json
import os

# Load the collection of class ids that should survive the filtering.
# NOTE(review): presumably category.json holds integer class ids — confirm.
with open('../data/category.json', 'r') as category_file:
    classes = json.load(category_file)

labels_dir = 'labels/test'

# Rewrite every .txt label file in place, keeping only the non-blank lines
# whose first token (parsed as an int) is one of the wanted classes.
for label_name in os.listdir(labels_dir):
    if not label_name.endswith('.txt'):
        continue

    label_path = os.path.join(labels_dir, label_name)

    with open(label_path, 'r') as src:
        raw_lines = src.readlines()

    kept_lines = []
    for raw_line in raw_lines:
        if not raw_line.strip():
            # Blank / whitespace-only lines are dropped.
            continue
        if int(raw_line.split()[0]) in classes:
            kept_lines.append(raw_line)

    # The file is overwritten even when nothing was removed.
    with open(label_path, 'w') as dst:
        dst.writelines(kept_lines)


In [4]:
import yaml

# Dataset description (class names plus image locations) kept as inline YAML
# text; it is parsed and re-serialised into data2.yaml below.
# NOTE(review): the class ids listed here are not contiguous (20, 26, 28-37
# and 40-42 are absent) — confirm the consumer of data2.yaml accepts that.
# NOTE(review): `val` points at the same directory as `test` — confirm intended.
data = """
names:
  0: shirt, blouse
  1: top, t-shirt, sweatshirt
  2: sweater
  3: cardigan
  4: jacket
  5: vest
  6: pants
  7: shorts
  8: skirt
  9: coat
  10: dress
  11: jumpsuit
  12: cape
  13: glasses
  14: hat
  15: headband, head covering, hair accessory
  16: tie
  17: glove
  18: watch
  19: belt
  21: tights, stockings
  22: sock
  23: shoe
  24: bag, wallet
  25: scarf
  27: hood
  38: bow
  39: flower
  43: ruffle

path: /kaggle/input/fashionpedia-dataset
test: images/test
train: images/train
val: images/test
"""

with open('data2.yaml', 'w') as yaml_out:
    # Round-trip through safe_load so the file is written from parsed data
    # rather than from the raw text above.
    dataset_config = yaml.safe_load(data)
    yaml.dump(dataset_config, yaml_out)

In [9]:
import os
import json

# Load the class ids whose position in the file defines the new, dense id.
with open('../data/category.json', 'r') as category_file:
    classes = json.load(category_file)

# old class id -> its position in `classes`
remap_dict = dict(zip(classes, range(len(classes))))

labels_dir = 'labels/val'

# Rewrite each .txt label file in place: the leading class id of every line is
# replaced by its remapped id; blank lines and lines whose class id has no
# mapping are dropped.
# NOTE: because files are overwritten with the new ids, running this cell a
# second time would remap (or drop) the already-remapped ids.
for label_name in os.listdir(labels_dir):
    if not label_name.endswith(".txt"):
        continue

    label_path = os.path.join(labels_dir, label_name)

    with open(label_path, "r") as src:
        token_rows = [raw_line.strip().split() for raw_line in src.readlines()]

    remapped_lines = []
    for tokens in token_rows:
        if not tokens:
            continue
        mapped_id = remap_dict.get(int(tokens[0]))
        if mapped_id is None:
            continue
        # Lines are re-joined with single spaces and a trailing newline.
        remapped_lines.append(" ".join([str(mapped_id), *tokens[1:]]) + "\n")

    # Overwrite file with remapped labels
    with open(label_path, "w") as dst:
        dst.writelines(remapped_lines)

In [18]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict

# Rebuild a catalog CSV whose rows line up one-to-one with the saved CLIP
# embeddings: for every entry of `product_ids` (in saved order) pick the next
# not-yet-used catalog row that has the same id.

# Load your data
full_catalog_df = pd.read_csv("/home/pchhalotre/Desktop/flickd-hackathon/backend/data/catalog.csv")  # Original, unfiltered
with open("/home/pchhalotre/Desktop/flickd-hackathon/backend/models/catalog_product_ids.json", "r") as f:
    product_ids = json.load(f)

embeddings = np.load("/home/pchhalotre/Desktop/flickd-hackathon/backend/models/catalog_clip_embeddings.npy")
assert len(product_ids) == len(embeddings)

# Step 1: Group image rows by product_id
# Positions (not index labels) are recorded because the rows are selected with
# `.iloc` below, which is purely positional.
pid_to_rows = defaultdict(list)
for position, catalog_id in enumerate(full_catalog_df['id'].astype(str)):
    pid_to_rows[catalog_id].append(position)

# Step 2: For each product_id in saved order, get first unused image
used_counts = defaultdict(int)
final_indices = []

for pid in product_ids:
    # Normalise the same way as the catalog ids so that ids stored as numbers
    # in the JSON file still match the string keys built above.
    key = str(pid)
    # `.get` avoids silently inserting empty entries for unknown ids.
    idx_list = pid_to_rows.get(key, [])
    count = used_counts[key]
    if count >= len(idx_list):
        raise ValueError(f"Not enough images for product_id {pid}")
    final_indices.append(idx_list[count])
    used_counts[key] += 1

# Step 3: Filter the catalog
filtered_catalog_df = full_catalog_df.iloc[final_indices].reset_index(drop=True)

# Final sanity check
assert len(filtered_catalog_df) == len(embeddings) == len(product_ids)

# Save recovered catalog
filtered_catalog_df.to_csv("filtered_catalog.csv", index=False)
print("✅ filtered_catalog.csv saved and aligned with embeddings.")

# Optionally save indices
with open("recovered_indices.json", "w") as f:
    json.dump(final_indices, f, indent=2)


✅ filtered_catalog.csv saved and aligned with embeddings.


In [11]:
import json
# Read the saved product-id list and show how many entries it contains.
with open("/home/pchhalotre/Desktop/flickd-hackathon/backend/models/catalog_product_ids.json", "r") as ids_file:
    product_ids = json.load(ids_file)

# Last expression of the cell: displayed as the cell output.
len(product_ids)

8135

In [14]:
import numpy as np
# Load the saved CLIP embedding array and display its shape.
embeddings_path = "/home/pchhalotre/Desktop/flickd-hackathon/backend/models/catalog_clip_embeddings.npy"
embeddings = np.load(embeddings_path)

# Last expression of the cell: displayed as the cell output.
embeddings.shape

(8135, 768)

In [20]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import imagehash
from collections import defaultdict

# Drop catalog rows whose image is a perceptual duplicate of an image already
# kept for the same product id. Rows whose image cannot be downloaded or
# decoded are reported and left out of the result.

# Load your filtered catalog
df = pd.read_csv("filtered_catalog.csv")

# Perceptual hashes already seen, keyed by product id
hash_map_per_product = defaultdict(set)
# Positions (into df) of the rows that survive de-duplication
kept_positions = []
# Number of rows skipped because the image could not be fetched or decoded
failed_count = 0

# A single Session reuses connections across the per-row image requests.
with requests.Session() as session:
    for position, (product_id, image_url) in enumerate(zip(df['id'], df['image_url'])):
        try:
            # Download and hash image
            response = session.get(image_url, timeout=10)
            # Report HTTP error statuses as such instead of handing the error
            # body to PIL and getting a less informative decode failure.
            response.raise_for_status()
            with Image.open(BytesIO(response.content)) as img:
                img_hash = imagehash.phash(img.convert("RGB"))  # perceptual hash (can use dhash/ahash too)
        except Exception as e:
            print(f"⚠️ Failed to process {image_url}: {e}")
            failed_count += 1
            continue

        # If hash is not already used for this product_id, keep it
        seen_hashes = hash_map_per_product[product_id]
        if img_hash not in seen_hashes:
            seen_hashes.add(img_hash)
            kept_positions.append(position)

# Create updated DataFrame
# Selecting by position keeps the original columns and dtypes, including when
# no row survives.
deduped_df = df.iloc[kept_positions]

# Save updated catalog
deduped_df.to_csv("deduplicated_catalog.csv", index=False)
print(f"✅ Saved deduplicated catalog with {len(deduped_df)} rows.")
if failed_count:
    print(f"⚠️ Skipped {failed_count} rows whose image could not be processed.")


⚠️ Failed to process https://cdn.shopify.com/s/files/1/0785/1674/8585/files/DSCF1948_1600x.jpg?v=1738757403: cannot identify image file <_io.BytesIO object at 0x7eed168a9440>
⚠️ Failed to process https://cdn.shopify.com/s/files/1/0785/1674/8585/files/DSCF1968_1600x.jpg?v=1738757402: cannot identify image file <_io.BytesIO object at 0x7eed2dc74770>
⚠️ Failed to process https://cdn.shopify.com/s/files/1/0785/1674/8585/files/DSCF1971_1600x.jpg?v=1738757403: cannot identify image file <_io.BytesIO object at 0x7eed2deff560>
⚠️ Failed to process https://cdn.shopify.com/s/files/1/0785/1674/8585/files/DSCF1973_1600x.jpg?v=1738757403: cannot identify image file <_io.BytesIO object at 0x7eed2dc740e0>
⚠️ Failed to process https://cdn.shopify.com/s/files/1/0785/1674/8585/files/DSCF1977_86dfa1dd-099e-4b31-8b42-0e3fda8d204e_1600x.jpg?v=1738757403: cannot identify image file <_io.BytesIO object at 0x7eed40282250>
⚠️ Failed to process https://cdn.shopify.com/s/files/1/0785/1674/8585/files/DSCF1984_160

KeyboardInterrupt: 