In [5]:
from __future__ import annotations
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
# import numpy as np
from collections import namedtuple
import pandas as pd
import copy
import matplotlib.pyplot as plt
import cv2
import math
import os
import rasterio
from matplotlib.path import Path
# from tqdm import tqdm

### Reading TRAIN, VAL, TEST JSON

In [2]:
def read_json_annotations(train_path: str, val_path: str, test_path: str) -> list[dict]:
    """
    Load the three LIVECell annotation JSON files.

    Args:
        train_path: path to the train annotation JSON file.
        val_path: path to the validation annotation JSON file.
        test_path: path to the test annotation JSON file.

    Returns:
        A three-element list with the parsed JSON contents, always in the
        order [train, val, test].
    """
    # (progress message, file to read) pairs, in the order they are returned.
    sources = (
        ('Reading train', train_path),
        ('Reading val', val_path),
        ('Reading test', test_path),
    )

    loaded = []
    for message, json_path in sources:
        print(message)
        with open(json_path, 'r') as handle:
            loaded.append(json.load(handle))

    return loaded

In [3]:
# Locations of the three LIVECell annotation JSON files (train / val / test).
train_json_path = '/workspace/annotations/LIVECell/livecell_coco_train.json'
val_json_path = '/workspace/annotations/LIVECell/livecell_coco_val.json'
test_json_path = '/workspace/annotations/LIVECell/livecell_coco_test.json'

# annotations is a list ordered as [train_annotations, val_annotations, test_annotations]
annotations = read_json_annotations(train_json_path, val_json_path, test_json_path)

Reading train
Reading val
Reading test


In [6]:
def mask_decode(mask, shape=(520, 704)):
    """
    Decode run-length-encoded labels into a single binary mask.

    Args:
        mask: iterable of RLE strings. Each string is a whitespace separated
            sequence "start length start length ..." where `start` is a
            1-based index into the flattened (row-major) image.
        shape: (height, width) of the image to decode into. Defaults to
            (520, 704), the size this function used to hard-code.

    Returns:
        np.ndarray of dtype float64 and the given shape holding 1.0 for every
        pixel covered by at least one label and 0.0 elsewhere.
    """
    height, width = shape

    # A single flat buffer is shared by all labels. Writing 1 into a pixel that
    # is already 1 is a no-op, so overlapping labels need no clipping afterwards.
    flat = np.zeros(height * width, dtype=np.float64)

    for label in mask:
        tokens = label.split()
        # RLE starts are 1-based; convert them to 0-based offsets.
        starts = [int(token) - 1 for token in tokens[0::2]]
        lengths = [int(token) for token in tokens[1::2]]
        for start, length in zip(starts, lengths):
            flat[start : start + length] = 1

    return flat.reshape((height, width))

def rle_encode(img):
    """
    Run-length encode a binary mask.

    Args:
        img (np.array): mask image where
            - 1 marks mask pixels
            - 0 marks background

    Returns:
        The run-length encoding as a space separated string of
        "start length" pairs; starts are 1-based positions in the
        flattened image.
    """
    flat = img.flatten()
    # A zero on each side makes runs that touch the first or last pixel
    # produce a value change that can be detected.
    padded = np.concatenate(([0], flat, [0]))

    # 1-based positions (relative to the flattened image) at which the value changes.
    boundaries = np.flatnonzero(padded[1:] != padded[:-1]) + 1

    # Every second boundary is the end of a run: turn it into the run length.
    boundaries[1::2] -= boundaries[::2]

    return ' '.join(map(str, boundaries))

# One record per sample id: the id, its cell type and all of its annotation strings.
PatientInfoTuple = namedtuple(
    'PatientInfoTuple',
    'id, cell_type, annotations'
)

def getPatientsInfo(csv_path='../input/sartorius-cell-instance-segmentation/train.csv'):
    """
    Group the rows of a training CSV by sample id.

    The CSV must provide the columns 'id', 'cell_type' and 'annotation'.

    Args:
        csv_path: path of the CSV file to read. Defaults to the location that
            was previously hard-coded in this function.

    Returns:
        list[PatientInfoTuple]: one tuple per distinct id, in order of first
        appearance in the file. `cell_type` is the cell type of the first row
        of that id and `annotations` is the list of all of its 'annotation'
        values in file order.
    """
    df = pd.read_csv(csv_path)
    patientsInfo_list = list()

    # A single groupby pass replaces re-filtering the whole frame twice per id;
    # sort=False keeps the groups in order of first appearance, matching the
    # order df.id.unique() would give.
    for sample_id, sample_rows in df.groupby('id', sort=False):
        patientsInfo_list.append(PatientInfoTuple(
            sample_id,
            sample_rows['cell_type'].iloc[0],
            sample_rows['annotation'].tolist()
        ))

    return patientsInfo_list

In [8]:
# Load a previously exported table from the working directory.
# NOTE(review): no cell visible in this notebook writes 'livecell_base_preprocessing.csv'
# (the final cell writes a '..._rle.csv' file) — confirm where this input comes from.
df = pd.read_csv('livecell_base_preprocessing.csv')

In [16]:
# Scratch check on a single image: rasterise each of its polygons onto the
# 520x704 pixel grid and run-length encode the resulting binary mask.
seg_list = list()
array_help = df[df['image_id']==1564017]["polygons"].to_numpy()

# The pixel grid is identical for every polygon, so the list of (row, col)
# sample points is built once instead of once per loop iteration.
grid_rows, grid_cols = np.mgrid[:520, :704]
points = np.vstack((grid_rows.ravel(), grid_cols.ravel())).T

for polygon_text in array_help:
    # Drop the first and last two characters (presumably the enclosing "[[" and
    # "]]" of the stored string) and split the rest on whitespace, giving the
    # flat coordinate sequence x0 y0 x1 y1 ...
    tokens = polygon_text[2:-2].split()
    xs = tokens[0::2]
    ys = tokens[1::2]

    # Vertices are stored as (y, x) so that they match the (row, col) points.
    vertices = np.array(list(zip(ys, xs)), dtype=float)
    path = Path(vertices)

    # select points included in the path
    mask = path.contains_points(points)

    # Back to image layout. np.int was removed in NumPy 1.24; the builtin int
    # is what that alias pointed to.
    img_mask = mask.reshape(grid_rows.shape).astype(int)

    # ENCODED MASK
    seg_list.append(rle_encode(img_mask))


seg_list[0]

(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)
(520, 704)

'89838 4 90541 6 91245 8 91949 9 92653 11 93357 12 94062 11 94766 11 95471 10 96175 10 96879 9 97583 8 98287 7 98991 7 99695 6 100399 6 101103 6 101807 6 102511 6 103215 6 103919 7 104623 8 105327 8 106031 9 106735 10 107440 9 108144 10 108848 5 108854 5 109552 5 109559 4 110256 5 110263 5 110960 5 110968 4 111664 5 111672 5 112368 5 112377 4 113073 4 113082 3 113780 1 113786 3 114490 3 115195 2 115899 3 116604 2 117308 2 118013 2 118717 2 119421 3'

### Convert JSON to DataFrame

In [133]:
# TODO
# Add info about the date of creation of each sample and other important info

In [11]:
def _polygon_to_rle(polygon, height=520, width=704):
    """
    Rasterise one polygon onto a height x width pixel grid and run-length encode it.

    Args:
        polygon: flat coordinate list [x0, y0, x1, y1, ...].
        height: number of rows of the target grid.
        width: number of columns of the target grid.

    Returns:
        The RLE string produced by `rle_encode` for the binary mask of the
        pixels that `Path.contains_points` reports as inside the polygon.
    """
    xs = polygon[0::2]
    ys = polygon[1::2]

    # Vertices are stored as (y, x) so that they match the (row, col) sample points.
    vertices = np.asarray(list(zip(ys, xs)), dtype=float)
    path = Path(vertices)

    img_mask = np.zeros((height, width), dtype=int)

    # A pixel outside the polygon's bounding box cannot be inside the polygon,
    # so only the bounding box (clipped to the image) is tested instead of all
    # height * width pixels. The result is the same as testing the full grid.
    row_min = max(int(np.floor(vertices[:, 0].min())), 0)
    row_max = min(int(np.ceil(vertices[:, 0].max())), height - 1)
    col_min = max(int(np.floor(vertices[:, 1].min())), 0)
    col_max = min(int(np.ceil(vertices[:, 1].max())), width - 1)

    # The box is empty when the polygon lies completely outside the image.
    if row_min <= row_max and col_min <= col_max:
        rows, cols = np.mgrid[row_min:row_max + 1, col_min:col_max + 1]
        points = np.column_stack((rows.ravel(), cols.ravel()))
        inside = path.contains_points(points)
        img_mask[row_min:row_max + 1, col_min:col_max + 1] = inside.reshape(rows.shape)

    return rle_encode(img_mask)


def create_livecell_df_rows(annotations :list[dict], subsets :list[str]=['train', 'val', 'test']) -> list[dict]:
    """
    Flatten LIVECell annotation dictionaries into one row (dict) per annotated cell.

    Args:
        annotations: one annotation dictionary per subset; each must provide
            the 'images' and 'annotations' lists.
        subsets: subset name for each entry of `annotations`, in the same
            order. The default list is only read, never modified.

    Returns:
        A list of dicts with the keys:

        'image_id': id of the image in the dataset
        'cell_type', 'well', 'location', 'timestamp', 'crop': parts of the
            file name, which is split on '_' (fields 0, 2, 3, 4 and the first
            character of field 5, e.g. 'BT474_Phase_A3_2_00d04h00m_3.tif')
        'well_time', 'well_time_loc', 'well_time_loc_crop': the parts above
            joined with '_'
        'width': width of the image
        'height': height of the image
        'file_name': file name of the image
        'file_path': absolute path to the image file
        'annotation_id': id of the current cell annotation
        'category_id': cell class
        'annotation': run-length encoding of the cell mask, rasterised on a
            fixed 520x704 grid
        'area': area of the polygon
        'bbox': bounding box of the cell
        'original_split': subset the annotation came from

    Raises:
        KeyError: if an annotation refers to an image id that is missing from
            the 'images' list of the same subset.

    NOTE(review): only the first polygon of annotation['segmentation'] is
    rasterised; any further polygons of the same annotation are ignored.
    """
    df_rows = []

    for data, subset in zip(annotations, subsets):
        # Image Id to Image
        image_by_id = {image['id']: image for image in data['images']}

        # The image directory depends only on the subset, not on the annotation.
        if subset in ['train', 'val']:
            image_dir = '/workspace/images/livecell_train_val_images'
        else:
            image_dir = '/workspace/images/livecell_test_images'

        for annotation in tqdm(data['annotations']):
            # Direct indexing: an unknown image id fails here with the id in
            # the message instead of a TypeError on None further down.
            image = image_by_id[annotation['image_id']]

            # Image File Path
            file_name = image['file_name']
            file_name_split = file_name.split('_')
            cell_type = file_name_split[0]
            well = file_name_split[2]
            location = file_name_split[3]
            timestamp = file_name_split[4]
            # First character of the last field, i.e. '3' from '3.tif'.
            crop = file_name_split[5][0]
            file_path = f'{image_dir}/{file_name}'

            # ENCODED MASK
            encoded_img_mask = _polygon_to_rle(annotation['segmentation'][0])

            df_rows.append({
                'image_id': np.int32(image['id']),
                'cell_type': cell_type,
                'well': well,
                'location': location,
                'timestamp': timestamp,
                'crop': crop,
                'well_time': well + '_' + timestamp,
                'well_time_loc': well + '_' + timestamp + '_' + location,
                'well_time_loc_crop': well + '_' + timestamp + '_' + location + '_' + crop,
                'width': np.int16(image['width']),
                'height': np.int16(image['height']),
                'file_name': file_name,
                'file_path': file_path,
                'annotation_id': np.int32(annotation['id']),
                'category_id': np.int8(annotation['category_id']),
                'annotation': encoded_img_mask,
                'area': np.float32(annotation['area']),
                'bbox': np.array(annotation['bbox'], dtype=np.float32),
                'original_split': subset,
            })

    return df_rows

In [None]:
# Build one row per annotation for all three subsets.
# Long-running: the progress bars saved below this cell show several hours for the train subset alone.
df_rows = create_livecell_df_rows(annotations)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1018576/1018576 [9:54:22<00:00, 28.56it/s]
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 164530/181610 [1:33:16<08:36, 33.05it/s]

In [None]:
# Turn the list of row dictionaries into a DataFrame (one column per key)
df = pd.DataFrame(df_rows)

In [137]:
# Display the assembled table.
# NOTE(review): the output saved below shows an 'Unnamed: 0' and a 'polygons' column, while
# create_livecell_df_rows emits an 'annotation' column — the output looks stale; re-run to confirm.
df

Unnamed: 0,image_id,cell_type,well,location,timestamp,crop,well_time,well_time_loc,well_time_loc_crop,width,height,file_name,file_path,annotation_id,category_id,polygons,area,bbox,original_split
0,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,2,1,"[[288.02, 305.63, 286.01, 298.87, 286.01, 295....",307.478607,"[286.01, 287.73, 19.17, 20.27]",train
1,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,3,1,"[[271.22, 323.34, 267.93, 322.61, 266.29, 320....",247.475555,"[263.0, 304.9, 20.45, 18.44]",train
2,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,4,1,"[[284.91, 279.88, 289.85, 281.52, 293.31, 281....",245.229446,"[275.42, 277.14, 23.92, 17.16]",train
3,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,5,1,"[[260.86, 327.64, 258.19, 325.63, 255.25, 324....",574.213074,"[246.96, 280.72, 20.99, 54.0]",train
4,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,6,1,"[[241.75, 324.69, 239.61, 326.97, 236.27, 331....",296.311401,"[229.45, 302.91, 22.59, 32.75]",train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662442,1694938,SHSY5Y,A10,2,00d04h00m,2,A10_00d04h00m,A10_00d04h00m_2,A10_00d04h00m_2_2,704,520,SHSY5Y_Phase_A10_2_00d04h00m_2.tif,/workspace/images/livecell_test_images/SHSY5Y_...,1695077,1,"[[514.64, 409.25, 513.76, 410.13, 513.32, 411....",134.385406,"[512.44, 405.31, 13.17, 15.21]",test
1662443,1694938,SHSY5Y,A10,2,00d04h00m,2,A10_00d04h00m,A10_00d04h00m_2,A10_00d04h00m_2_2,704,520,SHSY5Y_Phase_A10_2_00d04h00m_2.tif,/workspace/images/livecell_test_images/SHSY5Y_...,1695078,1,"[[501.18, 403.55, 500.74, 405.16, 500.3, 406.0...",445.365601,"[468.27, 373.71, 34.96, 43.0]",test
1662444,1694938,SHSY5Y,A10,2,00d04h00m,2,A10_00d04h00m,A10_00d04h00m_2,A10_00d04h00m_2_2,704,520,SHSY5Y_Phase_A10_2_00d04h00m_2.tif,/workspace/images/livecell_test_images/SHSY5Y_...,1695079,1,"[[619.51, 485.02, 619.65, 485.9, 619.94, 487.0...",241.589600,"[608.54, 468.34, 17.69, 24.14]",test
1662445,1694938,SHSY5Y,A10,2,00d04h00m,2,A10_00d04h00m,A10_00d04h00m_2,A10_00d04h00m_2_2,704,520,SHSY5Y_Phase_A10_2_00d04h00m_2.tif,/workspace/images/livecell_test_images/SHSY5Y_...,1695080,1,"[[617.46, 472.59, 616.0, 472.59, 614.39, 472.5...",216.130646,"[595.96, 454.6, 25.01, 17.99]",test


In [138]:
# Total number of annotated images (number of distinct image_id values)
len(df['image_id'].unique())

5387

In [139]:
# Number of distinct well / timestamp / location / crop combinations
len(df['well_time_loc_crop'].unique())

4456

In [140]:
# Per-column count of non-missing values among the rows whose well is 'A3'
df[df['well']=='A3'].count()

image_id              30327
cell_type             30327
well                  30327
location              30327
timestamp             30327
crop                  30327
well_time             30327
well_time_loc         30327
well_time_loc_crop    30327
width                 30327
height                30327
file_name             30327
file_path             30327
annotation_id         30327
category_id           30327
polygons              30327
area                  30327
bbox                  30327
original_split        30327
dtype: int64

In [141]:
# Distinct values of the 'location' column
df['location'].unique()

array(['2', '1'], dtype=object)

In [142]:
# Distinct values of the 'timestamp' column
df['timestamp'].unique()

array(['00d04h00m', '02d12h00m', '01d16h00m', '02d16h00m', '04d00h00m',
       '01d04h00m', '03d00h00m', '01d00h00m', '01d12h00m', '02d04h00m',
       '05d00h00m', '00d12h00m', '02d00h00m', '04d16h00m', '04d12h00m',
       '03d12h00m', '00d00h00m', '03d16h00m', '04d04h00m', '00d16h00m',
       '03d04h00m', '00d20h00m', '02d08h00m', '01d20h00m', '01d08h00m',
       '00d08h00m', '02d20h00m', '03d08h00m', '03d20h00m'], dtype=object)

In [143]:
# Distinct values of the 'crop' column
df['crop'].unique()

array(['3', '4', '1', '2'], dtype=object)

In [None]:
# CELL TYPES: sorted array of the distinct cell type names found in the table
CELL_TYPES = np.sort(df['cell_type'].unique())
print(f'CELL_TYPES: {CELL_TYPES}')

In [None]:
# Cell type name -> integer label. Labels start at 1 since 0 is reserved for background.
CELL_TYPE2LABEL = {name: label for label, name in enumerate(CELL_TYPES, start=1)}
# Adds the 'label' column to df in place.
df['label'] = df['cell_type'].apply(CELL_TYPE2LABEL.get).astype(np.int8)

In [146]:
# Display the table again, now including the 'label' column added in the previous cell
df

Unnamed: 0,image_id,cell_type,well,location,timestamp,crop,well_time,well_time_loc,well_time_loc_crop,width,height,file_name,file_path,annotation_id,category_id,polygons,area,bbox,original_split,label
0,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,2,1,"[[288.02, 305.63, 286.01, 298.87, 286.01, 295....",307.478607,"[286.01, 287.73, 19.17, 20.27]",train,2
1,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,3,1,"[[271.22, 323.34, 267.93, 322.61, 266.29, 320....",247.475555,"[263.0, 304.9, 20.45, 18.44]",train,2
2,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,4,1,"[[284.91, 279.88, 289.85, 281.52, 293.31, 281....",245.229446,"[275.42, 277.14, 23.92, 17.16]",train,2
3,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,5,1,"[[260.86, 327.64, 258.19, 325.63, 255.25, 324....",574.213074,"[246.96, 280.72, 20.99, 54.0]",train,2
4,1,BT474,A3,2,00d04h00m,3,A3_00d04h00m,A3_00d04h00m_2,A3_00d04h00m_2_3,704,520,BT474_Phase_A3_2_00d04h00m_3.tif,/workspace/images/livecell_train_val_images/BT...,6,1,"[[241.75, 324.69, 239.61, 326.97, 236.27, 331....",296.311401,"[229.45, 302.91, 22.59, 32.75]",train,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662442,1694938,SHSY5Y,A10,2,00d04h00m,2,A10_00d04h00m,A10_00d04h00m_2,A10_00d04h00m_2_2,704,520,SHSY5Y_Phase_A10_2_00d04h00m_2.tif,/workspace/images/livecell_test_images/SHSY5Y_...,1695077,1,"[[514.64, 409.25, 513.76, 410.13, 513.32, 411....",134.385406,"[512.44, 405.31, 13.17, 15.21]",test,6
1662443,1694938,SHSY5Y,A10,2,00d04h00m,2,A10_00d04h00m,A10_00d04h00m_2,A10_00d04h00m_2_2,704,520,SHSY5Y_Phase_A10_2_00d04h00m_2.tif,/workspace/images/livecell_test_images/SHSY5Y_...,1695078,1,"[[501.18, 403.55, 500.74, 405.16, 500.3, 406.0...",445.365601,"[468.27, 373.71, 34.96, 43.0]",test,6
1662444,1694938,SHSY5Y,A10,2,00d04h00m,2,A10_00d04h00m,A10_00d04h00m_2,A10_00d04h00m_2_2,704,520,SHSY5Y_Phase_A10_2_00d04h00m_2.tif,/workspace/images/livecell_test_images/SHSY5Y_...,1695079,1,"[[619.51, 485.02, 619.65, 485.9, 619.94, 487.0...",241.589600,"[608.54, 468.34, 17.69, 24.14]",test,6
1662445,1694938,SHSY5Y,A10,2,00d04h00m,2,A10_00d04h00m,A10_00d04h00m_2,A10_00d04h00m_2_2,704,520,SHSY5Y_Phase_A10_2_00d04h00m_2.tif,/workspace/images/livecell_test_images/SHSY5Y_...,1695080,1,"[[617.46, 472.59, 616.0, 472.59, 614.39, 472.5...",216.130646,"[595.96, 454.6, 25.01, 17.99]",test,6


In [147]:
# Summary statistics of the integer labels (sanity check of their range)
df['label'].describe()

count    1.662447e+06
mean     4.735630e+00
std      2.161760e+00
min      1.000000e+00
25%      3.000000e+00
50%      5.000000e+00
75%      6.000000e+00
max      8.000000e+00
Name: label, dtype: float64

In [None]:
# Persist the table to the working directory.
# With the default index=True the row index is written as a leading unnamed column,
# which shows up as 'Unnamed: 0' when the file is read back with pd.read_csv.
df.to_csv('livecell_base_preprocessing_rle.csv')