In [15]:
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import json
import glob
import pycocotools
from pycocotools import mask
import random
import cv2
import re
import torch

In [16]:
def random_seed(SEED):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and configures
    cuDNN for reproducible kernel selection.

    Args:
        SEED: Integer seed applied to all generators.
    """
    random.seed(SEED)
    # Only exported to the environment here; the hash seed of the running
    # interpreter is fixed at start-up, so this affects child processes.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different algorithms from run to run,
    # which defeats the deterministic flag above unless it is switched off.
    torch.backends.cudnn.benchmark = False

In [17]:
def rle_decode(mask_rle, shape):
    '''
    Decode a run-length-encoded mask string into a binary array.

    mask_rle: run-length string formatted as "start length start length ...",
              with 1-based starts counted over the flattened (row-major) image
    shape: (height, width) of the array to return
    Returns a numpy uint8 array, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions hold the starts, odd positions the run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(run_starts, run_ends):
        flat_mask[begin:stop] = 1
    return flat_mask.reshape(shape)

In [18]:
def rle_encode(img):
    '''
    Encode a binary mask as a run-length string.

    img: numpy array, 1 - mask, 0 - background
    Returns the run lengths as a string formatted "start length start length ...",
    with 1-based starts counted over the flattened image
    '''
    # Zero-pad both ends so runs touching the borders still produce a transition.
    padded = np.concatenate([[0], img.flatten(), [0]])
    # 1-based positions where the value changes: run start, run end, run start, ...
    transitions = np.where(padded[1:] != padded[:-1])[0] + 1
    # Turn every "end" position into a length by subtracting its "start".
    transitions[1::2] -= transitions[::2]
    return ' '.join(map(str, transitions))

In [19]:
def flatten_l_o_l(nested_list):
    """ Concatenate the items of every sub-list into one flat list (one level deep) """
    flat_items = []
    for inner in nested_list:
        flat_items.extend(inner)
    return flat_items

In [20]:
def load_json_to_dict(json_path):
    """ Read a JSON file and return its parsed content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (a dict when the top-level JSON
        value is an object).
    """
    # JSON text is UTF-8; decoding explicitly avoids depending on the
    # platform's default encoding, which breaks non-ASCII content.
    with open(json_path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [21]:
def get_img_and_mask(img_path, annotation, width, height):
    """ Load one image channel together with its instance-label mask.

    Args:
        img_path: Path of the image file, read with ``cv2.imread``.
        annotation: Iterable of run-length strings, one per instance, in the
            format accepted by ``rle_decode``.
        width: Mask width in pixels.
        height: Mask height in pixels.

    Returns:
        Tuple ``(img, img_mask)``. ``img`` is channel 0 of the image after
        reversing the channel axis. ``img_mask`` has shape (height, width);
        0 is background and k marks the k-th annotation (1-based). Where
        instances overlap, the later annotation wins.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``img_path``.
    """
    annotation = list(annotation)
    # uint8 only holds labels up to 255; widen the mask for crowded images
    # instead of overflowing.
    mask_dtype = np.uint8 if len(annotation) <= 255 else np.uint16
    img_mask = np.zeros((height, width), dtype=mask_dtype)
    # Labels start at 1: label 0 is reserved for the background, so starting
    # at 0 would silently erase the first instance.
    for label, annot in enumerate(annotation, start=1):
        img_mask[rle_decode(annot, (height, width)) != 0] = label

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"cv2 could not read image: {img_path}")
    img = img[..., ::-1]
    return img[..., 0], img_mask

In [22]:
# Notebook-wide settings: mask size in pixels, the RNG seed and the number
# of cross-validation folds.
config = dict(width=704, height=520, seed=42, folds=5)

In [23]:
# Annotation table; an image id can appear on several rows (they are grouped
# into one list per image below).
train_df = pd.read_csv('/home/data1/hanfeng/code/kaggle/kaggle-Sartorius-main/swin-mmdet/dataset/train.csv')

# Image path derived from the image id. It is appended as the last column
# because later cells read the path by position.
train_df = train_df.assign(
    img_path='../input/sartorius-cell-instance-segmentation/train/' + train_df['id'] + '.png'
)

# Collapse to one row per image, keeping the first row's metadata.
tmp_df = train_df.drop_duplicates(subset=["id", "img_path"]).reset_index(drop=True)
# Attach the annotation lists by id, not by row position: groupby orders its
# result by sorted id while drop_duplicates keeps first-appearance order, so
# a positional assignment is only correct when the CSV is sorted by id.
annotations_per_image = train_df.groupby("id")["annotation"].agg(list)
tmp_df["annotation"] = tmp_df["id"].map(annotations_per_image)
train_df = tmp_df.copy()

In [24]:
from sklearn.model_selection import KFold,StratifiedKFold

# Seed every RNG before splitting so the fold assignment is reproducible.
random_seed(config['seed'])

# Shuffled K-fold over images; the seed comes from config so it is defined
# in exactly one place.
Fold = KFold(n_splits=config['folds'], shuffle=True, random_state=config['seed'])

# -1 marks "not assigned yet"; every row is overwritten below because each
# row is in the validation part of exactly one split.
train_df['fold'] = -1
# Address the column by name rather than assuming it is the last one.
fold_col = train_df.columns.get_loc('fold')
for n, (train_index, val_index) in enumerate(Fold.split(train_df)):
    train_df.iloc[val_index, fold_col] = int(n)

In [25]:
def get_img_and_annot_info(df,annot_id_start=1):
    """Append COCO image and annotation records for every row of ``df``.

    Records are appended to the module-level ``output_json_dict`` (its
    ``'images'`` and ``'annotations'`` lists), which must exist when this is
    called. Category ids are looked up in the module-level ``categories``.

    NOTE(review): a later cell defines ``get_img_and_annot_info`` again with
    an explicit ``output_json_dict`` parameter; after a top-to-bottom run that
    later definition is the one in effect.

    Args:
        df: DataFrame whose rows are read by position: column 1 is the image
            id, 2 the list of run-length strings, 3 the width, 4 the height,
            5 the key into ``categories`` and the second-to-last column the
            image file path.
        annot_id_start: Id given to the first annotation; ids increase by one
            per annotation across all rows.
    """
    for f in df.itertuples():
        image_id = f[1]
        file_path = f[-2]
        width = f[3]
        height = f[4]
        category = categories[f[5]]
        image_info = {
            "id": image_id,
            "width": width,
            "height": height,
            "file_name": file_path,
        }
        output_json_dict['images'].append(image_info)
        # np.unique drops repeated annotation strings for the same image.
        for annot in np.unique(f[2]):
            # Decode with this image's own size so the mask always agrees
            # with the width/height written to the image record.
            annotation = rle_decode(annot, (height, width))
            # Builtin bool: the np.bool alias was deprecated in NumPy 1.20
            # and later removed.
            annot_mask = np.asfortranarray(annotation.astype(bool))
            Rs = mask.encode(annot_mask)
            # pycocotools returns bytes; JSON needs text.
            Rs['counts'] = Rs['counts'].decode('utf-8')
            bbox_list = [int(element) for element in mask.toBbox(Rs)]
            annot_dict = {
                "category_id": category,
                "segmentation": Rs,
                "area": int(mask.area(Rs)),
                "bbox": bbox_list,
                "id": annot_id_start,
                "image_id": image_id,
                "iscrowd": 0}
            output_json_dict["annotations"].append(annot_dict)
            annot_id_start += 1

In [26]:
# Maps the category key found in the annotation table to its COCO category id.
categories = dict(cort=2, shsy5y=1, astro=3)

In [27]:
def get_img_and_annot_info(df,output_json_dict,annot_id_start=1):
    """Append COCO image and annotation records for every row of ``df``.

    Category ids are looked up in the module-level ``categories`` mapping.

    Args:
        df: DataFrame whose rows are read by position: the index becomes the
            image id, column 2 is the list of run-length strings, 3 the
            width, 4 the height, 5 the key into ``categories`` and the
            second-to-last column the image file path.
        output_json_dict: Dict with ``'images'`` and ``'annotations'`` lists;
            new records are appended to them in place.
        annot_id_start: Id given to the first annotation; ids increase by one
            per annotation across all rows.
    """
    for f in df.itertuples():
        image_id = f[0]
        file_path = f[-2]
        width = f[3]
        height = f[4]
        category = categories[f[5]]
        image_info = {
            "id": image_id,
            "width": width,
            "height": height,
            "file_name": file_path,
        }
        output_json_dict['images'].append(image_info)
        # np.unique drops repeated annotation strings for the same image.
        for annot in np.unique(f[2]):
            # Decode with this image's own size so the mask always agrees
            # with the width/height written to the image record.
            annotation = rle_decode(annot, (height, width))
            # Builtin bool: the np.bool alias was deprecated in NumPy 1.20
            # and later removed.
            annot_mask = np.asfortranarray(annotation.astype(bool))
            Rs = mask.encode(annot_mask)
            # pycocotools returns bytes; JSON needs text.
            Rs['counts'] = Rs['counts'].decode('utf-8')
            bbox_list = [int(element) for element in mask.toBbox(Rs)]
            annot_dict = {
                "category_id": category,
                "segmentation": Rs,
                "area": int(mask.area(Rs)),
                "bbox": bbox_list,
                "id": annot_id_start,
                "image_id": image_id,
                "iscrowd": 0}
            output_json_dict["annotations"].append(annot_dict)
            annot_id_start += 1

In [28]:
# deepcopy is no longer used in this cell (boolean indexing followed by
# reset_index already yields new frames); the import is kept in case cells
# outside this view rely on it.
from copy import deepcopy


def _build_coco_dict(df):
    """Return a new COCO-style dict (images, annotations, categories) for ``df``."""
    coco_dict = {
        "images": [],
        "annotations": [],
        # Derived from `categories`, ordered by id, so the category list can
        # never disagree with the ids written into the annotations.
        "categories": [
            {"id": category_id, "name": name, "supercategory": "none"}
            for name, category_id in sorted(categories.items(), key=lambda item: item[1])
        ],
    }
    get_img_and_annot_info(df, coco_dict)
    return coco_dict


def _write_coco_json(df, json_path):
    """Serialize the COCO dict built from ``df`` to ``json_path``."""
    with open(json_path, 'w') as f:
        json.dump(_build_coco_dict(df), f)


# One train/val pair of COCO files per fold. The index is reset because
# get_img_and_annot_info uses the row index as the image id.
for fold in range(config['folds']):
    in_val_fold = train_df['fold'] == fold
    train_folds = train_df[~in_val_fold].reset_index(drop=True)
    valid_folds = train_df[in_val_fold].reset_index(drop=True)

    _write_coco_json(train_folds, f'{fold}_new_train_dataset.json')
    _write_coco_json(valid_folds, f'{fold}_new_val_dataset.json')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
