In [1]:
import os
import numpy as np
import pandas as pd
from scipy import io
from scipy import ndimage
from sklearn.model_selection import train_test_split

seed = 7655
np.random.seed(seed)

In [2]:
# split_labels returns a split of the label around a '_'
# eg. split_labels(abcd_efg_h) = abcd, efg_h
# first return is a guaranteed to be a valid string, second return is -1 if there is no '_' present, else it is a valid string
# eg. split_labels(abcdef) = abcdef, -1
def split_labels(old_label):
    i = old_label.find('_')
    if i == -1:
        return old_label, -1
    else:
        return old_label[:i], old_label[i+1:]

In [3]:
# discard anything not ice
# enter basic habit as string
# enter physical process as one hot encoding:
# 0 -> pristine
# 1 -> aggregate
# 2 -> rimed
# 3 -> aged
# return as a list, empty list if not ice

# Column
# Plate
# Droplet
# Lollipop
# Irregular
# Small
# Plate_column

def encode_label(label):
    basic_habit_dict = { 
        "Column": "Column",
        "column": "Column",
        "Plate": "Plate",
        "Plates": "Plate",
        "Droplet": "Droplet",
        "Droplets": "Droplet",
        "Droplet Lollipop": "Lollipop",
        "lollipop": "Lollipop",
        "Irregular": "Irregular",
        "Small": "Small",
        "Plate Column": "Plate Column",

        "Droplet Frozen": "NoClass",
        
        "Rosette": "NoClass",
        "Dendrite": "NoClass",
        "Graupel": "NoClass",
        "Branch": "NoClass",
        "Out": "NoClass", # out of focus
        "Recirculation": "NoClass",
        "Others": "NoClass",
    }
    physical_process_dict = {
        "pristine": 0,
        "aggregate": 1,
        "Aggregate": 1,
        "rimed": 2,
        "aged": 3,
    }
    
    original_label = label # for debugging, delete later
    #print(original_label)
    label_list = []
    particle_label, label = split_labels(label)
    if particle_label != "Ice" or label == -1: # not Ice or no basic habit defined
        return []
    
    habit_label, label = split_labels(label)
    if habit_label not in basic_habit_dict:
        print(habit_label, "not in habit dictionary", original_label)
        return []
    
    if basic_habit_dict[habit_label] == "NoClass":
        return []
    
    # extra clause because plate column are sometimes labeled as "plates_columns", and the _ causes a problem with the destinction between habit and process
    if (habit_label == "Plates" or habit_label == "plates") and label[:6] == "column":
        _, label = split_labels(label)
        habit_label = "Plate Column"
    # extra clause because Droplet lollipop column are sometimes labeled as "Droplet_lollipop", and the _ causes a problem with the destinction between habit and process
    if (habit_label == "Droplet" or habit_label == "Droplets") and label[:8] == "lollipop":
        _, label = split_labels(label)
        habit_label = "Droplet Lollipop"
    # extra clause because Droplet lollipop column are sometimes labeled as "Droplet_lollipop", and the _ causes a problem with the destinction between habit and process
    if (habit_label == "Droplet" or habit_label == "Droplets") and label[:6] == "frozen":
        return []
    
    physical_process_list = [0,0,0,0]
    label_list.append(basic_habit_dict[habit_label])
    if label == -1: # no physical process defined -> pristine
        physical_process_list[0] = 1
        label_list.append(physical_process_list)
        return label_list
    
    process_label, label = split_labels(label)
    if process_label not in physical_process_dict:
        print(process_label, habit_label, "not in physical process dictionary ", original_label)
        return []  #empty list 
    
    while label != -1:
        physical_process_list[physical_process_dict[process_label]] = 1
        process_label, label = split_labels(label)
    physical_process_list[physical_process_dict[process_label]] = 1
    
    # perform some checks:
    # at least one physical process is set ( if no process, then pristine should be set)
    assert physical_process_list[0] or physical_process_list[1] or physical_process_list[2] or physical_process_list[3] or physical_process_list[4] or physical_process_list[5]
    # either not pristine, or not any of the others
    assert physical_process_list[0] == 0 or (not physical_process_list[1] and not physical_process_list[2] and not physical_process_list[3] and not physical_process_list[4] and not physical_process_list[5])
    label_list.append(physical_process_list)
    return label_list

In [4]:
def load_data(number_parts):
    directory = os.getcwd()
    file_paths = []
    for i in range(number_parts):        
        #file_paths.append(directory + "/../data/huiying_labeled/uncropped_oldLabelling/huiying_uncropped_part"+str(i+1)+".mat")
        file_paths.append(directory + "/../data/huiying_labeled/cropped_oldLabelling/huiying_part"+str(i+1)+".mat")
    
    # load data like in holosuite/predict_pipeline/preprocessing.py
    data_index = []
    data_dict = {"img": [], "label":[]}
    
    for file_path in file_paths:
        print(file_path)

        var = io.whosmat(file_path)[0][0]
        mat = io.loadmat(file_path, squeeze_me=True, struct_as_record=False)
        ids = mat[var].prtclID
        imgs = mat[var].prtclIm
        labels = mat[var].cpType

        for j, (id_, img, label) in enumerate(zip(ids, imgs, labels)):
            # Check for missing IDs
            if isinstance(id_, np.ndarray):
                id_ = "no_id_{}".format(j + 1)
                print("prtclID missing in row {}, substituting with '{}'".format(j + 1, id_))
            # Check for duplicate IDs
            if id_ in data_index:
                print("ID {} not unique".format(id_))

            label = encode_label(label)
            if len(label) != 0:
                data_index.append(id_)
                data_dict["label"].append(label)
                data_dict["img"].append(img)

    dataset = pd.DataFrame(data_dict, index=data_index)
    print("Done loading")
    return dataset

In [5]:
# from holosuite/predict_pipeline/preprocessing.py
# added a border of 10% of max(img.shape)
def preprocess_img(img, img_size=-1):
    """Preprocess an image.
    
    The pixels of the image with NaN value are replaced with zeros. If `img_size` is not negative, the image is then
    zero-padded to square shape and then scaled to `img_size`x`img_size`.
    
    Args:
        img: An image array.
        img_size: Dimensions of the output image (`img_size`x`img_size`), use -1 to keep the original dimensions.
        
    Returns:
        The preprocessed image array.
    """
    # Replace NaN entries with 0
    img = np.nan_to_num(img)
    if img_size < 0:
        # Keep original size
        prep_img = img
    else:
        # Pad and scale images
        max_dim = max(img.shape)# + int(np.ceil(0.1*max(img.shape)))
        pad_shape = (max_dim - img.shape[0], max_dim - img.shape[1])
        pad_h_t = pad_shape[0]//2
        pad_h_b = pad_shape[0]//2 + pad_shape[0]%2
        pad_w_l = pad_shape[1]//2
        pad_w_r = pad_shape[1]//2 + pad_shape[1]%2
        square_img = np.pad(img, ((pad_h_t, pad_h_b), (pad_w_l, pad_w_r)), "constant", constant_values=0)
        prep_img = ndimage.zoom(square_img, img_size/max_dim)
    
    return prep_img

In [6]:
def preprocess_data(data):
    img_size = 128
    data["shape"] = data["img"].map(lambda img: img.shape)
    data["size"] = data["shape"].map(lambda shape: max(shape))
    data["img_abs"] = data["img"].map(lambda img: preprocess_img(np.absolute(img), img_size))
    data["img_ang"] = data["img"].map(lambda img: preprocess_img(np.angle(img), img_size))
    data["label_habit"] = data["label"].map(lambda label: label[0])
    data["label_proc_pristine"] = data["label"].map(lambda label: label[1][0])
    data["label_proc_aggregate"] = data["label"].map(lambda label: label[1][1])
    data["label_proc_rimed"] = data["label"].map(lambda label: label[1][2])
    data["label_proc_aged"] = data["label"].map(lambda label: label[1][3])
    data = data.drop(columns=['label', 'img'])
    return data

In [7]:
# change image size in function preprocess data (now 128x128)
data = preprocess_data(load_data(3))

/home/bkrumme/bachelors-thesis-bkrumme/source/../data/huiying_labeled/cropped_oldLabelling/huiying_part1.mat
/home/bkrumme/bachelors-thesis-bkrumme/source/../data/huiying_labeled/cropped_oldLabelling/huiying_part2.mat
/home/bkrumme/bachelors-thesis-bkrumme/source/../data/huiying_labeled/cropped_oldLabelling/huiying_part3.mat
Done loading


In [8]:
train, test = train_test_split(data, stratify=data["label_proc_rimed"], test_size=0.2, random_state=636)

In [9]:
# Print physical process distribution over test and train set
print("Train:")
print(train[["label_proc_pristine","label_proc_aggregate","label_proc_rimed","label_proc_aged"]].mean())
print("\nTest:")
print(test[["label_proc_pristine","label_proc_aggregate","label_proc_rimed","label_proc_aged"]].mean())

Train:
label_proc_pristine     0.564491
label_proc_aggregate    0.164206
label_proc_rimed        0.107550
label_proc_aged         0.220539
dtype: float64

Test:
label_proc_pristine     0.569283
label_proc_aggregate    0.160321
label_proc_rimed        0.107485
label_proc_aged         0.218078
dtype: float64


In [10]:
# Print basic habit distribution over test and train set
print("Train:")
habits_train = np.array(train['label_habit'].str.split(expand=True).stack().value_counts())
n_train = len(train['label_habit'])
print(np.around(habits_train/n_train, decimals=3))

print("Test:")
habits_test = np.array(test['label_habit'].str.split(expand=True).stack().value_counts())
n_test = len(test['label_habit'])
print(np.around(habits_test/n_test, decimals=3))

Train:
[0.705 0.161 0.117 0.066 0.033 0.011]
Test:
[0.706 0.163 0.119 0.064 0.037 0.01 ]


In [11]:
# Save as pickle
#test.to_pickle("../data/test/uncropped_oldLabelling/test_set_128px.pkl")
#train.to_pickle("../data/train/uncropped_oldLabelling/train_set_128px.pkl")
test.to_pickle("../data/test/cropped_oldLabelling/test_set_128px.pkl")
train.to_pickle("../data/train/cropped_oldLabelling/train_set_128px.pkl")