In [1]:
import json
from pathlib import Path
import os
import shutil
import cv2
import itertools
import numpy as np
from typing import List, Dict
from sklearn.model_selection import train_test_split

In [15]:
# Root folder of the competition data; tile images are read from DATA_DIR / 'train'.
DATA_DIR = Path('data/')

In [4]:
# Show which files and folders are present in the data directory.
os.listdir("data")

['sample_submission.csv',
 'test',
 'train',
 'polygons.jsonl',
 'tile_meta.csv',
 'wsi_meta.csv']

In [6]:
# Parse polygons.jsonl: one JSON document per line, one document per tile
# (later cells read the 'id' and 'annotations' keys of each record).
# Lines are decoded while the file is being read and whitespace-only lines are
# skipped, so a trailing blank line does not raise json.JSONDecodeError.
with open(DATA_DIR / 'polygons.jsonl', 'r', encoding='utf-8') as json_file:
    tiles_dicts = [json.loads(json_str) for json_str in json_file if json_str.strip()]

In [17]:
# Integer class index for every annotation type. The same numbering is listed
# under `names:` in the dataset YAML written at the end of this notebook.
id_dict = {
    label: index
    for index, label in enumerate(('blood_vessel', 'glomerulus', 'unsure'))
}

In [9]:
def tile_to_coco(tile: Dict, output_folder: Path, image_size: float = 512.) -> None:
    """Copy one tile's image and write its polygon labels to a text file.

    The label file contains one line per annotation: the integer class id
    followed by the polygon's x/y values divided by ``image_size``, all
    separated by single spaces. Despite the function's name this is not COCO
    JSON; it is the normalised one-polygon-per-line text layout used by
    YOLO-style segmentation trainers (the notebook's YAML targets YOLOv7).

    Args:
        tile: One parsed record from polygons.jsonl. The keys read here are
            ``'id'`` and ``'annotations'``; each annotation must provide
            ``'type'`` and ``'coordinates'``.
        output_folder: Existing directory that receives ``<id>.tif`` and
            ``<id>.txt``.
        image_size: Value the pixel coordinates are divided by so they land
            in the 0..1 range. Defaults to 512, the value that used to be
            hard-coded here.

    Raises:
        KeyError: If an annotation's ``'type'`` is not a key of ``id_dict``.
        FileNotFoundError: If the source image or ``output_folder`` is missing.
    """
    tile_id = tile['id']

    # Copy the image next to its label file.
    shutil.copyfile(DATA_DIR / f'train/{tile_id}.tif', output_folder / f'{tile_id}.tif')

    # Create the text file and write one formatted label line per annotation.
    with open(output_folder / f'{tile_id}.txt', 'w') as text_file:
        for annotation in tile['annotations']:
            class_id = id_dict[annotation['type']]
            # Only the first entry of 'coordinates' is used; its [x, y] pairs
            # are flattened to x1 y1 x2 y2 ... and scaled from pixel indices
            # to relative positions.
            polygon = np.asarray(annotation['coordinates'][0], dtype=float).ravel()
            scaled = polygon / image_size
            text_file.write(f'{class_id} {" ".join(map(str, scaled))}\n')

In [10]:
# Hold out 20% of the tiles for validation; the fixed random_state makes the
# split reproducible from run to run.
train_dicts, valid_dicts = train_test_split(
    tiles_dicts,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Create the output folders for both splits. exist_ok=True lets this cell be
# re-run without raising FileExistsError when the folders already exist.
os.makedirs('data/train_coco', exist_ok=True)
os.makedirs('data/valid_coco', exist_ok=True)

In [18]:
# Export each split into its own folder: tile_to_coco copies the image and
# writes the matching label file for every tile.
for split_tiles, split_folder in (
    (train_dicts, Path('data/train_coco')),
    (valid_dicts, Path('data/valid_coco')),
):
    for tile_record in split_tiles:
        tile_to_coco(tile_record, split_folder)

In [19]:
# Dataset description file in the YAML layout read by YOLOv7 (and similar trainers).
# NOTE(review): train/val point at /kaggle/input/hubmap-hhv-coco/{train,valid}/ while
# this notebook writes to data/train_coco and data/valid_coco -- presumably the folders
# are uploaded as a Kaggle dataset under those names; verify before training.
# NOTE(review): no `nc` (class count) key is written -- confirm whether the target
# trainer requires it.
yaml_text = """
# HuBMAP - Hacking the Human Vasculature dataset 
# https://www.kaggle.com/competitions/hubmap-hacking-the-human-vasculature


# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
train: /kaggle/input/hubmap-hhv-coco/train/
val: /kaggle/input/hubmap-hhv-coco/valid/

# class names
names: 
  0: blood_vessel
  1: glomerulus
  2: unsure
"""

# Persist the description next to the exported splits.
Path('data/hubmap-coco.yaml').write_text(yaml_text)