## CQ500 Preprocessing to COCO format

Use this notebook to convert the CQ500 dataset and its bounding-box annotations into COCO format, split into a training and a validation set.

In [None]:
import pandas as pd
import os

In [None]:
# Load the CQ500 bounding-box CSV; the later cells read its
# 'SOPInstanceUID' and 'data' columns.
boxes_csv = 'qureai-cq500-boxes-updated.csv'
df = pd.read_csv(boxes_csv)
df.head()  # preview the first rows

In [None]:
# Skeleton of the COCO document: dataset info, one category (id 1) and
# empty lists that the rest of the notebook fills in.
coco = dict(
    info=dict(
        description="Stroke detection dataset",
        version="1.0",
        year=2023,
        contributor="Marc Becker",
        date_created="2017/09/01",
    ),
    licenses=[],
    images=[],
    annotations=[],
    categories=[
        dict(id=1, name="Hemmoraghe", supercategory="Hemmoraghe"),
    ],
)

import random

# Fraction of the images assigned to the training split; the remainder
# becomes the validation split (0.8 -> 80% train, 20% validation).
split_ratio = 0.8

# Every entry of the image directory, put into a random order.
# No seed is set in this cell, so the order differs between runs.
image_filenames = os.listdir('./data/cq500')
random.shuffle(image_filenames)

# The first `train_size` shuffled names form the training set,
# everything after them the validation set.
train_size = int(split_ratio * len(image_filenames))
train_images, val_images = image_filenames[:train_size], image_filenames[train_size:]

def _image_entry(file_name):
    """Return the COCO image record for one file name.

    The id is the file name up to its first '.dcm'. Width and height are
    recorded as 512 for every image; the file itself is not read.
    """
    return {
        "id": file_name.split('.dcm')[0],
        "width": 512,
        "height": 512,
        "file_name": file_name,
    }


# Image records of the training split
coco['images'] = [_image_entry(name) for name in train_images]
print(f"Added metadata for {len(coco['images'])} training images")

# Image records of the validation split, kept under a separate key
coco['images_val'] = [_image_entry(name) for name in val_images]
print(f"Added metadata for {len(coco['images_val'])} validation images")

import ast

# Build the annotation lists of both splits in a single pass over the CSV.
# Sets give constant-time membership tests instead of scanning the
# file-name lists once per row.
train_names = set(train_images)
val_names = set(val_images)

coco['annotations'] = []      # boxes whose image is in the training split
coco['annotations_val'] = []  # boxes whose image is in the validation split

for index, row in df.iterrows():
    # Rows whose 'data' cell is not a string carry no box and are skipped.
    if not isinstance(row['data'], str):
        continue

    # SECURITY: this used to be eval(row['data']), which executes whatever
    # expression the CSV cell contains. ast.literal_eval only accepts Python
    # literals. NOTE(review): assumes every cell is a literal dict with
    # 'x', 'y', 'width' and 'height' keys -- confirm against the CSV.
    bbox = ast.literal_eval(row['data'])

    # Images are matched by file name: '<SOPInstanceUID>.dcm'.
    file_name = row['SOPInstanceUID'] + '.dcm'
    if file_name in train_names:
        target = coco['annotations']
    elif file_name in val_names:
        target = coco['annotations_val']
    else:
        # The box refers to an image that is in neither split.
        continue

    target.append({
        "id": index,
        "image_id": row['SOPInstanceUID'],
        "category_id": 1,
        "bbox": [bbox['x'], bbox['y'], bbox['width'], bbox['height']],
        # 'area' and 'iscrowd' are part of the COCO annotation format;
        # for a plain box the area is width * height.
        "area": bbox['width'] * bbox['height'],
        "iscrowd": 0,
    })

print(f"Added {len(coco['annotations'])} annotations for training set")
print(f"Added {len(coco['annotations_val'])} annotations for validation set")
import json

# Keep only annotations whose image_id belongs to an image of the same split.
# Image ids are the file names without '.dcm', the same form as the
# annotations' image_id. Comparing image_id against the file-name lists
# (which include '.dcm') never matched and left both "annotations" lists
# empty, so the comparison is made against the image ids instead.
train_ids = {image["id"] for image in coco["images"]}
val_ids = {image["id"] for image in coco["images_val"]}

coco_train = {
    "info": coco["info"],
    "licenses": coco["licenses"],
    "images": coco["images"],
    "annotations": [anno for anno in coco["annotations"] if anno["image_id"] in train_ids],
    "categories": coco["categories"],
}

coco_val = {
    "info": coco["info"],
    "licenses": coco["licenses"],
    "images": coco["images_val"],
    "annotations": [anno for anno in coco["annotations_val"] if anno["image_id"] in val_ids],
    "categories": coco["categories"],
}

# Save the training and validation data to separate JSON files
with open("annotations_train.json", "w") as f:
    json.dump(coco_train, f, indent=2)

with open("annotations_val.json", "w") as f:
    json.dump(coco_val, f, indent=2)