# Download and convert dataset

In [None]:
# List of JSON file URLs
urls = [
    "https://s3.amazonaws.com/ifashionist-dataset/annotations/instances_attributes_train2020.json",
    "https://s3.amazonaws.com/ifashionist-dataset/annotations/instances_attributes_val2020.json",
    "https://s3.amazonaws.com/ifashionist-dataset/annotations/info_test2020.json",
    "https://s3.amazonaws.com/ifashionist-dataset/annotations/attributes_train2020.json",
    "https://s3.amazonaws.com/ifashionist-dataset/annotations/attributes_val2020.json"
]

# Download each file into current directory (/content)
for url in urls:
    filename = url.split("/")[-1]
    !wget -O $filename $url

In [None]:
zip_urls = [
    "https://s3.amazonaws.com/ifashionist-dataset/images/train2020.zip",
    "https://s3.amazonaws.com/ifashionist-dataset/images/val_test2020.zip"
]

for url in zip_urls:
    filename = url.split("/")[-1]
    print(f"Downloading {filename}...")
    !wget -q --show-progress -O $filename $url
    
    print(f"Extracting {filename}...")
    !unzip -o $filename > /dev/null 2>&1

In [None]:
%matplotlib inline

import json
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Root folder of the dataset. The download cells save the annotation files and
# unzip the image archives into the current working directory, so the paths
# are kept relative to it instead of pointing at the filesystem root.
# Must end with "/" because the file names below are appended by plain
# string concatenation.
dataset_path = "./"
# NOTE(review): assumes the archives unpack into "train/" and "test/" — confirm.
img_train_root = dataset_path + "train/"
img_val_root = dataset_path + "test/"

# original names
ann_train_file = "instances_attributes_train2020.json"
ann_val_file = "instances_attributes_val2020.json"
# ann_train_file = "attributes_train2020.json"
# ann_val_file = "attributes_val2020.json"

info_file = "info_test2020.json"

# for commercial use names
ann_com_train = "instances_attributes_commercial_train_filtered.csv"
ann_com_val = "instances_attributes_commercial_val_filtered.csv"
# ann_com_train = "attributes_commercial_train.csv"
# ann_com_val = "attributes_commercial_val.csv"

### Remove images whose licenses are unsuitable for commercial use, and drop annotations for categories that are not needed

In [None]:
# Category names behind the ids excluded in filter_categories (author's list):
# ['glasses', 'hat', 'headband, head covering, hair accessory', 'tie', 'glove', 'watch', 'leg warmer', 
# 'tights, stockings', 'sock', 'umbrella', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon',
#  'rivet', 'ruffle', 'sequin', 'tassel']

def filter_categories(df):
    """Return a copy of ``df`` without the rows of unwanted categories.

    Parameters:
        df: DataFrame with a "category_id" column.

    Returns:
        A new DataFrame (original index preserved) holding only the rows whose
        "category_id" is not in the exclusion list. ``df`` is not modified.
    """
    excluded_ids = [13, 14, 15, 16, 17, 18, 20, 21, 22, 26, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]
    keep_mask = ~df["category_id"].isin(excluded_ids)
    return df.loc[keep_mask].copy()

# Read the category and attribute tables from a JSON file
def load_cat_attr(dataset_path, filename):
    """Load the "categories" and "attributes" lists of a JSON file.

    Parameters:
        dataset_path: folder prefix; concatenated directly with ``filename``
        filename: name of the JSON file

    Returns:
        (df_categories, df_attributes): one DataFrame per list
    """
    with open(dataset_path + filename, "r") as fh:
        payload = json.load(fh)

    # Attributes are built first, then categories; the return order is reversed
    attributes_table = pd.DataFrame(payload["attributes"])
    categories_table = pd.DataFrame(payload["categories"])
    return categories_table, attributes_table

# Category / attribute tables from the info file, plus id -> name lookups
df_categories, df_attributes = load_cat_attr(dataset_path, info_file)

id_to_attr_name = dict(zip(df_attributes["id"], df_attributes["name"]))
id_to_cat_name = dict(zip(df_categories["id"], df_categories["name"]))

# Count the frequency of each category
def plot_counts(df):
    """Show a bar chart of the number of annotations per category.

    Parameters:
        df: DataFrame with a "category_id" column (one row per annotation).

    Category ids are translated to names through the module-level
    ``id_to_cat_name`` mapping, so the x-axis ticks are category names.
    Displays the figure; returns nothing.
    """
    category_counts = df["category_id"].value_counts()

    # Map IDs → names
    category_counts.index = category_counts.index.map(id_to_cat_name)

    # Plot
    plt.figure(figsize=(12, 6))
    category_counts.plot(kind="bar")

    plt.title("Distribution of Categories")
    # Ticks show names after the mapping above, so the axis is not labelled as an ID.
    plt.xlabel("Category")
    plt.ylabel("Number of Annotations")
    plt.show()

def convert_json_to_csv(dataset_path, filename, new_filename):
    """Filter an annotation JSON file and store the result as a CSV file.

    Steps:
        1. read the "annotations" and "images" lists from the JSON file;
        2. keep only images whose license id is in the allowed list;
        3. join annotations with the kept images and drop unused columns;
        4. remove unwanted categories (``filter_categories``);
        5. plot the category distribution (``plot_counts``) and write the CSV.

    Parameters:
        dataset_path: folder prefix, concatenated directly with the file names
        filename: input JSON file name
        new_filename: output CSV file name
    """
    with open(dataset_path + filename, "r") as fh:
        payload = json.load(fh)

    annotations = pd.DataFrame(payload["annotations"])
    images = pd.DataFrame(payload["images"])

    # License ids considered safe for commercial use
    commercial_licenses = [0, 1, 6, 7, 8, 9, 10]
    licensed_images = images.loc[images["license"].isin(commercial_licenses)].copy()

    print(f"Original images: {len(images)}")
    print(f"Filtered images: {len(licensed_images)}")

    # Both frames have an "id" column, so the merge yields "id_x" / "id_y";
    # neither is needed afterwards.
    unused_columns = [
        "id_x", "id_y", "time_captured", "segmentation", "area",
        "iscrowd", "original_url", "isstatic", "kaggle_id",
    ]
    merged = annotations.merge(
        licensed_images,
        left_on="image_id",
        right_on="id",
        how="inner",
    )
    merged = merged.drop(columns=unused_columns)

    kept = filter_categories(merged)
    plot_counts(kept)

    # Save to a CSV file
    kept.to_csv(dataset_path + new_filename, index=False)


# Convert and filter both splits; each call writes one filtered CSV file.
# (Comment these calls out to skip the conversion once the CSV files exist.)
for source_json, target_csv in (
    (ann_train_file, ann_com_train),
    (ann_val_file, ann_com_val),
):
    convert_json_to_csv(dataset_path, source_json, target_csv)

Original images: 1158
Filtered images: 330
   image_id  category_id                                     attribute_ids  \
0     17039           31                                        [160, 204]   
1     17039           31                                        [160, 204]   
2     17039            9  [229, 295, 136, 137, 80, 145, 115, 85, 311, 317]   
3     17039           25                                                []   
4     17039           27                                                []   

                           bbox  width  height  \
0   [391.0, 460.0, 75.0, 193.0]   1024     682   
1   [583.0, 452.0, 50.0, 228.0]   1024     682   
2  [392.0, 407.0, 241.0, 274.0]   1024     682   
3  [460.0, 389.0, 136.0, 109.0]   1024     682   
4   [442.0, 407.0, 166.0, 51.0]   1024     682   

                              file_name  license  
0  99601fa457d157b81154d089966c2e3a.jpg        7  
1  99601fa457d157b81154d089966c2e3a.jpg        7  
2  99601fa457d157b81154d089966c2e3

# Load data with DataLoaders

In [None]:
import ast
import os

import cv2
import numpy as np
import torch
from super_gradients.training.utils.collate_fn.detection_collate_fn import DetectionCollateFN
from torch.utils.data import Dataset, DataLoader

In [None]:
# Number of classes handed to the model and to the loss parameters below.
# NOTE(review): the dataset uses the original category ids as labels (no
# remapping after filter_categories), so this presumably spans the full id
# range rather than only the kept categories — confirm.
NUM_CLASSES = 46

#### Create custom Dataset class that loads data from .csv

In [None]:
def letterbox_image(image, max_size=640, color=(114, 114, 114)):
    """Resize an image to fit a square canvas, padding the remainder.

    Parameters:
        image: numpy array (H, W, 3); pasted onto a 3-channel uint8 canvas
        max_size: side length of the square output
        color: fill value for the padded area

    Returns:
        (padded_image, scale, left, top): the (max_size, max_size, 3) canvas,
        the resize factor applied to both axes, and the x / y offset of the
        resized image inside the canvas.
    """
    src_h, src_w = image.shape[:2]

    # One factor for both axes keeps the aspect ratio; the longer side
    # ends up (approximately) max_size long.
    scale = max_size / max(src_h, src_w)
    dst_w = int(src_w * scale)
    dst_h = int(src_h * scale)
    shrunk = cv2.resize(image, (dst_w, dst_h))

    canvas = np.full((max_size, max_size, 3), color, dtype=np.uint8)

    # Centre the resized image on the canvas
    top = (max_size - dst_h) // 2
    left = (max_size - dst_w) // 2
    canvas[top:top + dst_h, left:left + dst_w] = shrunk

    return canvas, scale, left, top

def coco_to_yolo_all(boxes, img_width, img_height):
    """
    Convert a batch of [x_min, y_min, w, h] boxes to normalized centre format.

    Parameters:
        boxes: numpy array of shape (N, 4), [x_min, y_min, w, h] in pixels
        img_width: int, width used to normalize x and w
        img_height: int, height used to normalize y and h

    Returns:
        numpy array of shape (N, 4), [x_center, y_center, w, h], each value
        divided by the corresponding image dimension
    """
    x_min, y_min, box_w, box_h = (boxes[:, col] for col in range(4))

    normalized_columns = (
        (x_min + box_w / 2) / img_width,   # x centre
        (y_min + box_h / 2) / img_height,  # y centre
        box_w / img_width,
        box_h / img_height,
    )
    return np.stack(normalized_columns, axis=1)


def convert_image_to_yolo(image, allboxes, max_size=640):
    """
    Letterbox an image and express its boxes relative to the letterboxed image.

    Parameters:
        image: numpy array of shape (H, W, C)
        allboxes: numpy array of shape (N, 4), [x_min, y_min, width, height]
            in pixels of the original image
        max_size: int, side length of the square output image

    Returns:
        padded_img: numpy array of shape (max_size, max_size, 3)
        new_boxes: numpy array of shape (N, 4),
            [x_center, y_center, width, height] divided by max_size

    NOTE(review): boxes are returned normalized; confirm that the training
    loss expects normalized rather than pixel coordinates.
    """
    letterboxed, ratio, offset_x, offset_y = letterbox_image(image, max_size)

    # Multiplication creates a new array, so the caller's boxes stay untouched.
    # Scale everything, then shift only the top-left corner by the padding.
    shifted = allboxes * ratio
    shifted[:, 0] += offset_x
    shifted[:, 1] += offset_y

    return letterboxed, coco_to_yolo_all(shifted, max_size, max_size)

In [None]:
class CustomCSVDataset(Dataset):
    """Detection dataset backed by a CSV file with one row per annotation.

    Rows sharing an ``image_id`` belong to the same image; one dataset item is
    one image together with all of its boxes and labels.
    """

    def __init__(self, csv_file, images_dir, max_size=640):
        """
        :param csv_file: path to the annotations csv file
        :param images_dir: folder where images are stored
        :param max_size: side length of the square image returned per item
        """
        self.annotations = pd.read_csv(csv_file)
        self.images_dir = images_dir
        self.max_size = max_size

        # Group annotations by image for easier retrieval
        self.image_groups = self.annotations.groupby("image_id")

        # Keep unique image rows
        self.image_infos = self.annotations.drop_duplicates("image_id")

    def __len__(self):
        """Number of distinct images."""
        return len(self.image_infos)

    @staticmethod
    def _parse_bbox(raw):
        """Turn one ``bbox`` CSV cell into a list of numbers.

        The CSV stores each box as the text of a Python list. It is parsed with
        ``ast.literal_eval``, which only accepts literals, instead of ``eval``,
        which would execute arbitrary code found in the file. Values that are
        not strings are returned unchanged.
        """
        if isinstance(raw, str):
            return ast.literal_eval(raw)
        return raw

    def __getitem__(self, idx):
        """Return ``(image_tensor, target_array)`` for the image at ``idx``.

        image_tensor: float torch.Tensor of shape (3, max_size, max_size), RGB
        target_array: numpy array of shape (N, 5), one row per box:
            [category_id, x_center, y_center, width, height], box values
            normalized by max_size

        :raises FileNotFoundError: if the image file does not exist
        :raises OSError: if the file exists but cannot be decoded
        """
        # Get one image info row
        img_info = self.image_infos.iloc[idx]

        img_path = os.path.join(self.images_dir, img_info["file_name"])
        if not os.path.isfile(img_path):
            raise FileNotFoundError(f"Image file not found: {img_path}")

        # Load image with OpenCV (BGR); imread signals failure by returning
        # None rather than raising, so check before converting.
        image = cv2.imread(img_path)
        if image is None:
            raise OSError(f"Could not decode image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Get annotations for this image
        annots = self.image_groups.get_group(img_info["image_id"])
        boxes = np.stack(annots["bbox"].apply(self._parse_bbox).values)  # [x-top-left,y-top-left,w,h]
        labels = annots["category_id"].values

        # Letterbox to max_size x max_size and convert the boxes to
        # [normalized_x_center, normalized_y_center, normalized_w, normalized_h]
        padded_img, padded_boxes = convert_image_to_yolo(image, boxes, self.max_size)

        # HWC -> CHW
        image_tensor = torch.from_numpy(padded_img).permute(2, 0, 1).float()
        target_array = np.hstack((labels.reshape(-1, 1), padded_boxes))

        return image_tensor, target_array



In [None]:
# One dataset per split, each reading the filtered CSV file written by
# convert_json_to_csv and the matching image folder.
train_dataset = CustomCSVDataset(dataset_path+ann_com_train, img_train_root)
val_dataset = CustomCSVDataset(dataset_path+ann_com_val, img_val_root)

##### Examples of dataset

In [None]:
# Image tensor of the validation item at index 1
val_dataset[1][0]

In [None]:
# Shape of the image tensor of validation item 1 (channels first)
val_dataset[1][0].shape

In [None]:
# Target array of validation item 1: one row per box,
# [category_id, x_center, y_center, width, height]
val_dataset[1][1]

In [None]:
# Shape of the target array of validation item 1: (number of boxes, 5)
val_dataset[1][1].shape

#### Dataloaders

In [None]:
# Batches of 4 samples loaded by 2 worker processes; only the training loader shuffles.
# NOTE(review): DetectionCollateFN presumably merges the per-image target
# arrays (which differ in row count) into one batch tensor — confirm.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=DetectionCollateFN())
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=2, collate_fn=DetectionCollateFN())

In [None]:
# Second element (index 1) of the first validation batch — presumably the collated targets
next(iter(val_dataloader))[1]

# Model and its params

In [None]:
import super_gradients
from super_gradients.training import models
from super_gradients import Trainer
from super_gradients.training import training_hyperparams

In [None]:
# Build the "yolox_n" architecture with pretrained_weights="coco" and
# NUM_CLASSES output classes, then print the class count reported by the model.
model = models.get("yolox_n", num_classes=NUM_CLASSES, pretrained_weights="coco")
print('Num classes in the model:', model.num_classes)

In [None]:
# Start from the 'coco2017_yolox' training hyper-parameters; the bare
# expression on the last line displays them as the cell output.
train_params = training_hyperparams.get('coco2017_yolox')
train_params

In [None]:
# edit params
# Override selected entries of the hyper-parameters loaded above before training.
train_params['max_epochs'] = 10
# No warm-up / cool-down epochs
train_params['lr_warmup_epochs'] = 0
train_params['lr_cooldown_epochs'] = 0
# Use the same class count as the model (presumably consumed by the loss — confirm)
train_params['criterion_params']['num_classes'] = NUM_CLASSES
train_params['average_best_models'] = False
train_params['initial_lr'] = 0.0005
# NOTE(review): presumably the final LR as a fraction of initial_lr under the cosine schedule — confirm
train_params['cosine_final_lr_ratio'] = 0.9
train_params['mixed_precision'] = False

# Training

In [None]:
# Directory passed to the Trainer as ckpt_root_dir.
# NOTE(review): this is an absolute path at the filesystem root, which may not
# be writable in every environment — confirm it is intended.
CHECKPOINT_DIR = '/ckpt_folder' # Local path
trainer = Trainer(experiment_name='transfer_learning_object_detection_yolox', ckpt_root_dir=CHECKPOINT_DIR)

In [None]:
# Train the model with the edited hyper-parameters, using the training and
# validation loaders defined earlier.
trainer.train(model=model, training_params=train_params, train_loader=train_dataloader, valid_loader=val_dataloader)

# Prediction

In [None]:
# url = "https://github.com/KMnP/fashionpedia-api/blob/1ef732050e15d446c38d58ef945ccadc28c59328/images/000000010223.jpg"
# url = "https://github.com/KMnP/fashionpedia-api/blob/1ef732050e15d446c38d58ef945ccadc28c59328/images/000000009813.jpg"
# url = "https://github.com/KMnP/fashionpedia-api/blob/1ef732050e15d446c38d58ef945ccadc28c59328/data/demo/input.jpg"
# prediction = model.predict(url)
# prediction.show()

In [None]:
# compare results:
# "https://github.com/KMnP/fashionpedia-api/blob/1ef732050e15d446c38d58ef945ccadc28c59328/data/demo/result.pdf"

# Convert to ONNX

In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# !pip install onnx_graphsurgeon==0.3.27 --extra-index-url https://pypi.ngc.nvidia.com

In [None]:
# export_result = model.export(output="myexport.onnx")

In [None]:
# export_result