In [1]:
# import sys
# sys.path.append("../")

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 101)

import os
import json
import matplotlib.pyplot as plt
# from imaterialist.utils.paths import root_dir
# from notebooks.helpers.dataset_analysis_notebook import *
# from notebooks.helpers.category_overlaps_analysis_notebook import *
from utils_data import *
from glob import glob
import PIL.Image

%load_ext autoreload
%autoreload 2

# Load Taxonomy

In [2]:
# Parse the taxonomy file; it is indexed below by its 'categories' and
# 'attributes' entries.
with open('imaterialist-fashion-2021-fgvc8/label_descriptions.json') as label_file:
    label_desc = json.load(label_file)

# Build both lookup tables the same way: one row per entry, indexed by its
# 'id' field and sorted by that id.
cat_df, attr_df = (
    pd.DataFrame(label_desc[section]).set_index('id').sort_index()
    for section in ('categories', 'attributes')
)

In [3]:
# Display the attribute table (indexed by attribute id, sorted by id).
attr_df

Unnamed: 0_level_0,name,supercategory,level
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,classic (t-shirt),nickname,1
1,polo (shirt),nickname,1
2,undershirt,nickname,1
3,henley (shirt),nickname,1
4,ringer (t-shirt),nickname,1
...,...,...,...
336,peacock,animal,2
337,zebra,animal,2
338,giraffe,animal,2
339,toile de jouy,textile pattern,1


In [3]:
# Preview the first rows of the category table (indexed by category id).
cat_df.head()

Unnamed: 0_level_0,name,supercategory,level
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"shirt, blouse",upperbody,2
1,"top, t-shirt, sweatshirt",upperbody,2
2,sweater,upperbody,2
3,cardigan,upperbody,2
4,jacket,upperbody,2


In [4]:
# Number of categories defined in the taxonomy.
len(cat_df)

46

# Load Training, Validation and Test Datasets

In [None]:
# Load the training split. Missing AttributesIds are replaced by '' so that
# every row holds a string in that column.
# NOTE(review): `root_dir` is not imported in the visible cells (its import is
# commented out above); presumably it comes from `utils_data` -- confirm.
train_df = pd.read_csv(
    root_dir('data/imaterialist-fashion-2020-fgvc7/split/train.csv')
).assign(AttributesIds=lambda frame: frame['AttributesIds'].fillna(''))
print(len(train_df))
train_df.head()

In [5]:
# Load the validation split. Missing AttributesIds are replaced by '' so that
# every row holds a string in that column.
# NOTE(review): `root_dir` is not imported in the visible cells (its import is
# commented out above); presumably it comes from `utils_data` -- confirm.
valid_df = pd.read_csv(
    root_dir('data/imaterialist-fashion-2020-fgvc7/split/valid.csv')
).assign(AttributesIds=lambda frame: frame['AttributesIds'].fillna(''))
print(len(valid_df))
valid_df.head()

44509


Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds
0,0000fe7c9191fba733c8a69cfaf962b7,2201176 1 2203623 3 2206071 5 2208518 8 221096...,2448,2448,33,190
1,0000fe7c9191fba733c8a69cfaf962b7,1343707 9 1346138 27 1348569 44 1351000 62 135...,2448,2448,1,0115145146295316317
2,00048c3a2fb9c29340473c4cfc06424a,239016 3 239989 6 240963 7 241938 8 242913 9 2...,975,650,13,
3,00048c3a2fb9c29340473c4cfc06424a,257702 10 258670 26 259640 36 260572 6 260610 ...,975,650,1,11115135145146295316317
4,00048c3a2fb9c29340473c4cfc06424a,277152 1 278126 3 279102 2 280078 2 281053 3 2...,975,650,33,182


In [5]:
# Build a frame with one row per test image, in the same column layout as the
# train/valid splits. EncodedPixels, ClassId and AttributesIds are the same
# constant for every row (presumably placeholders, since no labels are read
# for these images -- confirm).
rows = []
num_limit = 300  # stop after this many images
# NOTE: glob() returns files in filesystem order, so which images end up in
# the subset may differ between machines.
for file_path in glob('test/*.jpg'):
    image_id = os.path.basename(file_path).split('.')[0]

    # Image.open keeps the file open until it is closed; the context manager
    # releases the handle right after the size is read, so handles do not
    # accumulate across the loop.
    with PIL.Image.open(file_path) as image:
        width, height = image.size

    rows.append({
        'ImageId': image_id,
        'EncodedPixels': '1 1',
        'Height': height,
        'Width': width,
        'ClassId': 0,
        'AttributesIds': '',
    })
    if len(rows) == num_limit:
        break

test_df = pd.DataFrame(rows, columns=['ImageId', 'EncodedPixels', 'Height', 'Width', 'ClassId', 'AttributesIds'])
print(len(test_df))
test_df.tail()

300


Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId,AttributesIds
295,a192b4188273b1c78e629e65e607f7a6,1 1,1024,682,0,
296,a19dbf63dbbcb28c0bcc4fa15ae975e2,1 1,682,1024,0,
297,634b8e06e0d3bc3b8258cdef7ea56251,1 1,683,1024,0,
298,634fe6afefee27498551381955f39dba,1 1,1024,682,0,
299,6350070620d5693afb9a6d58943381f2,1 1,1024,490,0,


In [6]:
def get_resized_image_info(image_width: int, image_height: int, new_image_size: int):
    """Compute the dimensions of an image resized so its longer side is ``new_image_size``.

    Args:
        image_width: original width.
        image_height: original height.
        new_image_size: target length of the longer side.

    Returns:
        ``(new_width, new_height, scale)`` where ``scale`` is the original
        longer side divided by ``new_image_size``. The shorter side is
        divided by ``scale`` and truncated to an int. When width and height
        are equal, the height is treated as the longer side.
    """
    landscape = image_width > image_height
    longer_side = image_width if landscape else image_height
    scale = longer_side / new_image_size

    if landscape:
        return new_image_size, int(image_height / scale), scale
    return int(image_width / scale), new_image_size, scale


def generate_coco_annotations(df: pd.DataFrame, image_size: int = None):
    """Build a COCO-style annotation dictionary from a per-mask DataFrame.

    Relies on the notebook-level ``cat_df`` / ``attr_df`` tables and on the
    ``rleToMask``, ``mask_to_bbox`` and ``binary_mask_to_coco_rle`` helpers,
    which are not defined in the visible cells (presumably they come from
    ``utils_data`` -- confirm).

    Args:
        df: one row per mask with the columns ``ImageId``, ``EncodedPixels``,
            ``Height``, ``Width``, ``ClassId`` and ``AttributesIds`` (a
            comma-separated string of attribute ids, ``''`` when there are
            none). The row labels of ``df`` are used, plus one, as
            annotation ids.
        image_size: when given, the width/height written to the ``images``
            entries are rescaled with ``get_resized_image_info``. Masks,
            bounding boxes and areas are still computed at the original
            resolution.

    Returns:
        A dict with the keys ``info``, ``images``, ``categories`` and
        ``annotations``. Image, category and annotation ids are shifted by
        one relative to their source index; attribute ids are remapped to
        their position in ``attr_df.index``.
    """
    # One row per image; Height/Width come from the image's first mask row.
    per_image = df.groupby(['ImageId'], as_index=False).agg({
        'Height': 'first',
        'Width': 'first',
    })
    per_image.columns = ['ImageId', 'Height', 'Width']

    # Original attribute id -> position of that id in attr_df's index.
    attr_position = {attr_id: pos for pos, attr_id in enumerate(attr_df.index)}

    info = {
        'num_attributes': len(attr_position),
    }

    categories = [
        {
            'id': cat_id + 1,
            'name': cat_row['name'],
            'supercategory': cat_row['supercategory'],
        }
        for cat_id, cat_row in cat_df.iterrows()
    ]

    images = []
    coco_id_by_image = {}
    for position, image_row in per_image.iterrows():
        coco_image_id = position + 1
        width, height = image_row['Width'], image_row['Height']
        if image_size is not None:
            width, height, _ = get_resized_image_info(width, height, image_size)

        images.append({
            'id': coco_image_id,
            'width': width,
            'height': height,
            'file_name': image_row['ImageId'] + '.jpg',
        })
        coco_id_by_image[image_row['ImageId']] = coco_image_id

    annotations = []
    for counter, (row_label, record) in enumerate(df.iterrows()):
        mask = rleToMask(record['EncodedPixels'], record['Height'], record['Width'])
        bbox = mask_to_bbox(mask)
        # NOTE: the mask and bbox are not rescaled when image_size is set;
        # only the image entries above carry the resized dimensions.
        rle = binary_mask_to_coco_rle(mask)

        entry = {
            'id': row_label + 1,
            'image_id': coco_id_by_image[record['ImageId']],
            'category_id': int(record['ClassId']) + 1,
            'segmentation': rle,
            'area': int(mask.sum()),
            'bbox': bbox,
            'iscrowd': 0,
        }

        raw_attrs = record['AttributesIds']
        if raw_attrs != '':
            entry['attribute_ids'] = [attr_position[int(token)]
                                      for token in raw_attrs.split(',')]
        else:
            entry['attribute_ids'] = []
        annotations.append(entry)

        # Progress indicator: print the running index every 1000 rows.
        if not counter % 1000:
            print(counter)

    return {
        'info': info,
        'images': images,
        'categories': categories,
        'annotations': annotations,
    }

In [7]:
# Convert the test rows to the COCO-style dictionary and write it out as JSON.
# NOTE(review): `NumpyEncoder` is not defined in the visible cells; presumably
# it comes from `utils_data` -- confirm.
test_annotations = generate_coco_annotations(test_df)
with open('data_processed/test_coco.json', 'w') as coco_file:
    json.dump(test_annotations, coco_file, cls=NumpyEncoder)

0


In [16]:
# valid_annotations = generate_coco_annotations(valid_df)
# with open(root_dir('data/imaterialist-fashion-2020-fgvc7/split/valid_coco.json'), 'w') as f:
#     json.dump(valid_annotations, f)

# train_annotations = generate_coco_annotations(train_df)
# with open(root_dir('data/imaterialist-fashion-2020-fgvc7/split/train_coco.json'), 'w') as f:
#     json.dump(train_annotations, f)

# test_annotations = generate_coco_annotations(test_df)
# with open(root_dir('data/imaterialist-fashion-2020-fgvc7/split/test_coco.json'), 'w') as f:
#     json.dump(test_annotations, f)

0


In [29]:
# image_size = 1024
# valid_annotations = generate_coco_annotations(valid_df, image_size)
# with open(root_dir('data/imaterialist-fashion-2020-fgvc7/split/valid_coco_%d.json' % image_size), 'w') as f:
#     json.dump(valid_annotations, f)

In [5]:
# with open(root_dir('data/imaterialist-fashion-2020-fgvc7/split/valid_coco.json'), 'r') as f:
#     valid_annotations = json.load(f)

# with open(root_dir('data/imaterialist-fashion-2020-fgvc7/split/train_coco.json'), 'r') as f:
#     train_annotations = json.load(f)