In [1]:
from __future__ import annotations

import json
from collections import namedtuple

import numpy as np
import pandas as pd
from tqdm import tqdm

### Reading TRAIN, VAL, TEST JSON

In [2]:
def read_json_annotations(train_path: str, val_path: str, test_path: str) -> list[dict]:
    """
    Read the train, val and test JSON annotation files of the LIVECell dataset.

    Args:
        train_path: path to the train-split annotation JSON.
        val_path: path to the val-split annotation JSON.
        test_path: path to the test-split annotation JSON.

    Returns:
        ``[train_annotations, val_annotations, test_annotations]`` - the parsed
        JSON documents, in that order.

    Raises:
        OSError: if one of the files cannot be opened.
        json.JSONDecodeError: if one of the files is not valid JSON.
    """
    annotations = []
    for subset, path in (('train', train_path), ('val', val_path), ('test', test_path)):
        print(f'Reading {subset}')
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding when decoding the file.
        with open(path, 'r', encoding='utf-8') as f:
            annotations.append(json.load(f))
    return annotations

In [3]:
# Locations of the LIVECell annotation files, one per split.
train_json_path = '/workspace/annotations/LIVECell/livecell_coco_train.json'
val_json_path = '/workspace/annotations/LIVECell/livecell_coco_val.json'
test_json_path = '/workspace/annotations/LIVECell/livecell_coco_test.json'

# Parsed annotations, ordered as [train, val, test].
annotations = read_json_annotations(
    train_path=train_json_path,
    val_path=val_json_path,
    test_path=test_json_path,
)

Reading train
Reading val
Reading test


In [7]:
def mask_decode(mask, shape=(520, 704)):
    """
    Decode run-length encoded annotations into a single binary mask.

    Each annotation is a whitespace-separated string of ``start length``
    pairs, where ``start`` is a 1-based index into the row-major flattened
    image. All annotations are merged (union) into one mask.

    Args:
        mask: iterable of RLE strings; a single RLE string is also accepted.
        shape: ``(height, width)`` of the output mask. Defaults to
            ``(520, 704)``, the size that used to be hard-coded.

    Returns:
        np.ndarray of dtype float64 and shape ``shape`` holding 1.0 on every
        pixel covered by at least one run and 0.0 elsewhere.
    """
    if isinstance(mask, str):
        # A bare string would otherwise be iterated character by character
        # and silently decode to an all-zero mask.
        mask = [mask]

    height, width = shape
    # One shared flat buffer: writing 1 repeatedly is the same as summing
    # per-label masks and clipping to [0, 1], without a temporary per label.
    flat = np.zeros(height * width, dtype=np.float64)
    for label in mask:
        tokens = label.split()
        starts = [int(token) - 1 for token in tokens[0::2]]  # RLE starts are 1-based
        lengths = [int(token) for token in tokens[1::2]]
        for start, length in zip(starts, lengths):
            flat[start:start + length] = 1
    return flat.reshape(height, width)

def rle_encode(img):
    """
    Run-length encode a binary mask.

    Args:
        img (np.array): mask in which 1 marks the object and 0 the background.

    Returns:
        str: space-separated ``start length`` pairs, where ``start`` is the
        1-based position of a run in the flattened mask. An all-zero mask
        yields the empty string.
    """
    # Zero-pad both ends so that runs touching the first or last pixel still
    # produce a value change.
    padded = np.concatenate([[0], img.flatten(), [0]])
    # 1-based positions at which the value changes; for a 0/1 mask these
    # alternate between run start and one-past-run-end.
    edges = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Turn every end position into a run length (end - start).
    edges[1::2] -= edges[::2]
    return ' '.join(map(str, edges))

# Lightweight record for one image id: the id, its cell type and the list of
# its annotation strings. Requires `namedtuple` from the standard-library
# `collections` module to be in scope.
PatientInfoTuple = namedtuple(
    'PatientInfoTuple',
    ['id', 'cell_type', 'annotations'],
)

def getPatientsInfo(csv_path='../input/sartorius-cell-instance-segmentation/train.csv'):
    """
    Collect, for every image id in the training CSV, its cell type and annotations.

    Args:
        csv_path: CSV file with at least the columns ``id``, ``cell_type`` and
            ``annotation``. Defaults to the path that used to be hard-coded.

    Returns:
        list[PatientInfoTuple]: one tuple per distinct ``id``, in order of
        first appearance. ``cell_type`` is the first value found for that id
        and ``annotations`` is the list of its ``annotation`` values.
    """
    df = pd.read_csv(csv_path)

    # A single groupby pass replaces re-filtering the whole frame once per id
    # (quadratic in the number of rows). sort=False keeps the groups in order
    # of first appearance, matching the order of Series.unique().
    return [
        PatientInfoTuple(
            sample_id,
            group['cell_type'].iloc[0],
            group['annotation'].tolist(),
        )
        for sample_id, group in df.groupby('id', sort=False)
    ]

In [None]:
# Rasterise every polygon of one image into a 520x704 binary mask and
# run-length encode it with rle_encode.
# NOTE(review): this cell needs `df` (with 'image_id' and 'polygons' columns)
# to exist already, and `Path` - presumably matplotlib.path.Path, given the
# use of contains_points - to be imported; neither happens in an earlier cell
# of this notebook. TODO confirm and add the import to the imports cell.
seg_list = list()
polygon_texts = df[df['image_id'] == 1564017]["polygons"].to_numpy()

# The pixel grid is identical for every polygon, so build it once instead of
# once per loop iteration. Points are (row, col) pairs.
grid_rows, grid_cols = np.mgrid[:520, :704]
points = np.vstack((grid_rows.ravel(), grid_cols.ravel())).T

for polygon_text in polygon_texts:
    # Each value is a string; drop its first and last two characters (the
    # enclosing double brackets of a stringified nested array) and parse
    # the whitespace-separated numbers.
    coords = np.array(polygon_text[2:-2].split(), dtype=float)
    xs = coords[0::2]
    ys = coords[1::2]

    # Vertices are given as (y, x) so they line up with the (row, col) points.
    vertices = np.asarray(list(zip(ys, xs)))
    path = Path(vertices)

    # Boolean flag per grid point: inside the polygon or not.
    inside = path.contains_points(points)

    # `np.int` was removed in NumPy 1.24; the builtin `int` is the replacement.
    img_mask = inside.reshape(grid_rows.shape).astype(int)
    seg_list.append(rle_encode(img_mask))


seg_list[0]

### Convert JSON to DataFrame

In [None]:
# TODO
# Add info about the date of creation of each sample and other important info

In [None]:
def create_livecell_df_rows(
    annotations: list[dict],
    subsets: list[str] | None = None,
    include_extra_fields: bool = False,
) -> list[dict]:
    """
    Flatten LIVECell annotation documents into one row dict per annotated cell.

    Args:
        annotations: parsed annotation documents, one per subset. Each must
            provide an 'images' list and an 'annotations' list.
        subsets: subset name for each entry of ``annotations``, in the same
            order. Defaults to ``['train', 'val', 'test']``. Pairing stops at
            the shorter of the two sequences.
        include_extra_fields: when True, additionally emit the fields parsed
            from the file name and the image path (see below). Off by default,
            so the default output keeps the same keys as before.

    Returns:
        List of dicts with the keys:

        'image_id': id of an image in the dataset
        'cell_type': name of the cell type (first '_'-separated part of the file name)
        'width': width of an image
        'height': height of an image
        'file_name': file name of an image
        'annotation_id': id of one (current) cell
        'category_id': cell class
        'polygons': segmentation polygons
        'area': area of a polygon
        'bbox': bounding box of a cell
        'original_split': name of the subset the annotation came from

        With ``include_extra_fields=True`` also:

        'well', 'location', 'timestamp', 'crop': parts 2, 3, 4 of the file
            name and the first character of part 5
        'well_time', 'well_time_loc', 'well_time_loc_crop': '_'-joined
            combinations of the above
        'file_path': absolute path to the image file

    Raises:
        KeyError: if an annotation refers to an image id that is missing from
            the 'images' list of its document.
    """
    if subsets is None:
        # Build the default per call instead of sharing one mutable default list.
        subsets = ['train', 'val', 'test']

    df_rows = []

    for data, subset in zip(tqdm(annotations), subsets):
        # Image id -> image record, built once per subset.
        images_by_id = {image['id']: image for image in data['images']}

        # Train and val images share one directory; everything else is test.
        if subset in ['train', 'val']:
            image_dir = '/workspace/images/livecell_train_val_images'
        else:
            image_dir = '/workspace/images/livecell_test_images'

        for annotation in tqdm(data['annotations']):
            # Direct indexing makes a dangling image_id fail with a KeyError
            # naming the id, rather than an opaque TypeError on None.
            image = images_by_id[annotation['image_id']]
            file_name = image['file_name']
            name_parts = file_name.split('_')

            row = {
                'image_id': np.int32(image['id']),
                'cell_type': name_parts[0],
                'width': np.int16(image['width']),
                'height': np.int16(image['height']),
                'file_name': file_name,
                'annotation_id': np.int32(annotation['id']),
                'category_id': np.int8(annotation['category_id']),
                'polygons': np.array(annotation['segmentation'], dtype=np.float32),
                'area': np.float32(annotation['area']),
                'bbox': np.array(annotation['bbox'], dtype=np.float32),
                'original_split': subset,
            }

            if include_extra_fields:
                # Only parse the remaining file-name parts when they are used,
                # so short file names do not fail in the default mode.
                well = name_parts[2]
                location = name_parts[3]
                timestamp = name_parts[4]
                crop = name_parts[5][0]
                row.update({
                    'well': well,
                    'location': location,
                    'timestamp': timestamp,
                    'crop': crop,
                    'well_time': well + '_' + timestamp,
                    'well_time_loc': well + '_' + timestamp + '_' + location,
                    'well_time_loc_crop': well + '_' + timestamp + '_' + location + '_' + crop,
                    'file_path': f'{image_dir}/{file_name}',
                })

            df_rows.append(row)

    return df_rows

In [None]:
# Flatten the loaded annotation documents into one row dict per annotated cell
# (subset names default to train / val / test, in that order).
df_rows = create_livecell_df_rows(annotations)

In [None]:
# Build the DataFrame in one go from the list of row dicts
df = pd.DataFrame(df_rows)

In [None]:
# Display the assembled annotation table
df

In [None]:
# Number of distinct images that carry at least one annotation
df['image_id'].unique().size

In [None]:
# The frame built above has no 'well_time_loc_crop' column, so indexing it
# raises KeyError. Rebuild the same key from the file name instead: its
# '_'-separated parts 2, 3 and 4 are well, location and timestamp, and the
# first character of part 5 is the crop.
file_name_parts = df['file_name'].str.split('_')
well_time_loc_crop = (
    file_name_parts.str[2] + '_'
    + file_name_parts.str[4] + '_'
    + file_name_parts.str[3] + '_'
    + file_name_parts.str[5].str[0]
)
len(well_time_loc_crop.unique())

In [None]:
# The frame built above has no 'well' column (KeyError); the well is the
# third '_'-separated part of the file name.
df[df['file_name'].str.split('_').str[2] == 'A3'].count()

In [None]:
# The frame built above has no 'location' column (KeyError); the location is
# the fourth '_'-separated part of the file name.
df['file_name'].str.split('_').str[3].unique()

In [None]:
# The frame built above has no 'timestamp' column (KeyError); the timestamp is
# the fifth '_'-separated part of the file name.
df['file_name'].str.split('_').str[4].unique()

In [None]:
# The frame built above has no 'crop' column (KeyError); the crop is the first
# character of the sixth '_'-separated part of the file name.
df['file_name'].str.split('_').str[5].str[0].unique()

In [None]:
# Map every cell type to an integer label; labels start at 1 since 0 is
# reserved for background.
# NOTE(review): CELL_TYPES is not defined in any cell shown above - it has to
# exist in the kernel already, and its order determines the label values.
CELL_TYPE2LABEL = {name: label for label, name in enumerate(CELL_TYPES, start=1)}
df['label'] = df['cell_type'].apply(CELL_TYPE2LABEL.get).astype(np.int8)

In [None]:
# Display the table again, now including the 'label' column
df

In [None]:
# Summary statistics of the integer labels
df['label'].describe()

In [None]:
# Persist the table as CSV in the current working directory.
# NOTE(review): to_csv writes the row index as an extra unnamed column by
# default, and array-valued cells ('polygons', 'bbox') are stored as their
# string form - confirm that downstream readers expect both.
df.to_csv('livecell_base_preprocessing.csv')

## Bbox Sanity Check

In [14]:
# Load a previously exported table, replacing the in-memory `df`.
# NOTE(review): this reads 'livecell_base_preprocessing_rle.csv', whereas the
# export cell in this notebook writes 'livecell_base_preprocessing.csv'; the
# '_rle' file is presumably produced elsewhere - confirm where it comes from.
df = pd.read_csv('livecell_base_preprocessing_rle.csv')

In [17]:
# A box counts as sane when its 3rd or 4th value exceeds this threshold.
MIN_BBOX_SIDE = 3


def _bbox_is_sane(bbox_text):
    """Return True if the 3rd or 4th whitespace-separated bbox value is larger than MIN_BBOX_SIDE."""
    values = bbox_text.split()
    return float(values[2]) / MIN_BBOX_SIDE > 1 or float(values[3]) / MIN_BBOX_SIDE > 1


# 'bbox' is stored as a bracketed, whitespace-separated string; the values are
# presumably [x, y, width, height] - TODO confirm.
# strip('[]') instead of slicing [1:-1] keeps this cell idempotent: re-running
# it no longer chops one more character off both ends of every value.
df["bbox"] = df["bbox"].str.strip('[]')
df['bbox_sanity'] = df['bbox'].apply(_bbox_is_sane)

In [18]:
# Summarise the sanity flag: row count, number of distinct values, most
# frequent value and its frequency
df['bbox_sanity'].describe()

count     1662447
unique          2
top          True
freq      1662442
Name: bbox_sanity, dtype: object

In [None]:
# Remove rows that are duplicated across all columns (first occurrence kept);
# note that this rebinds `df`, so the unfiltered frame is no longer available
df = df.drop_duplicates()

In [None]:
# Number of non-null values per column
df.count()