# Playground

This notebook is a scratchpad for testing and experimenting. The pipeline will later be moved into plain `.py` files.

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import io
from scipy import ndimage
from PIL import Image as Img
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt


np.random.seed(42)

In [6]:
# Load the labelled particles into `data`.
# NOTE: `load_data` (and the `encode_label` it calls) are defined in later cells
# of this notebook; those cells have to be executed before this one.
print("Loading and merging files")
data = load_data()
print("Data ready")

Loading and merging files
/home/beni/Documents/BachelorArbeit/bachelors-thesis-bkrumme/source/../data/huiying_labeled/huiying_part1.mat
Data ready


In [7]:
# Expand the raw images and encoded labels into separate, model-ready columns.
print("Preprocessing the dataset")
img_size = 64

# Original dimensions and longest side of every image.
data["shape"] = data["img"].map(lambda image: image.shape)
data["size"] = data["shape"].map(lambda dims: max(dims))

# Magnitude and phase of every image, each passed through preprocess_img.
data["img_abs"] = data["img"].map(lambda image: preprocess_img(np.absolute(image), img_size))
data["img_ang"] = data["img"].map(lambda image: preprocess_img(np.angle(image), img_size))

# label[0] is the habit name, label[1] the list of process flags in the order
# pristine, aggregate, rimed, aged, frozen.
data["label_habit"] = data["label"].map(lambda encoded: encoded[0])
for position, process in enumerate(("pristine", "aggregate", "rimed", "aged", "frozen")):
    data["label_proc_" + process] = data["label"].map(lambda encoded: encoded[1][position])

# The raw columns are not needed once the derived ones exist.
data = data.drop(columns=['label', 'img'])

data.head(2)

Preprocessing the dataset


Unnamed: 0,shape,size,img_abs,img_ang,label_habit,label_proc_pristine,label_proc_aggregate,label_proc_rimed,label_proc_aged,label_proc_frozen
2019-11-11-15-50-03-120775_+0.74_+2.41_+36.00_+343.7,"(146, 112)",146,"[[1.29207e-40, 3.230577e-36, 7.463587e-35, 4.9...","[[4.68157e-40, 2.6673643e-36, 6.1370224e-35, 4...",Column,0,0,0,1,0
2019-11-11-15-50-08-635363_-4.28_+1.28_+69.80_+827.2,"(282, 330)",330,"[[-0.0, -0.0, 0.0, -0.0, 0.0, -3.5e-44, 9.5734...","[[0.0, 0.0, -0.0, 0.0, -0.0, -3.8e-44, -8.2e-4...",Irregular,0,0,0,1,0


In [9]:
# reduces dataframe to only contain a single class (deletes the rest)
def return_habit(data, habit):
    """Return only the rows of `data` whose `label_habit` equals `habit`.

    Rows are selected with a boolean mask instead of being dropped by index
    label. Dropping by label removes every row that shares the label, so with
    duplicated particle IDs (load_data only warns about them) matching rows
    could be lost as well.

    Args:
        data: DataFrame with a `label_habit` column.
        habit: Habit value to keep, e.g. "Column".

    Returns:
        A new DataFrame containing the matching rows; `data` is not modified.
    """
    return data[data["label_habit"] == habit].copy()

# Subset of the dataset holding only the particles labelled with the "Column" habit.
Columns = return_habit(data, "Column")
def return_process(data, process):
    """Return only the rows of `data` flagged with the given physical process.

    Rows are selected with a boolean mask instead of being dropped by index
    label, so rows sharing a duplicated index label are kept or removed
    individually.

    Args:
        data: DataFrame with the `label_proc_*` flag columns.
        process: One of "pristine", "aggregate", "rimed", "aged", "frozen".

    Returns:
        A new DataFrame with the rows whose flag for `process` equals 1;
        `data` is not modified.

    Raises:
        KeyError: If `process` is not one of the known process names.
    """
    process_dict = {
        "pristine" : "label_proc_pristine",
        "aggregate" : "label_proc_aggregate",
        "rimed" : "label_proc_rimed",
        "aged" : "label_proc_aged",
        "frozen" : "label_proc_frozen"
    }
    return data[data[process_dict[process]] == 1].copy()

# Example (disabled): narrow the column particles further down to the rimed ones.
# Cols_rimed = return_process(Columns, "rimed")

# Persist the column-only subset as a pickle file.
Columns.to_pickle("../data/pickle/Huiying_part1_columns.pkl")
# Open question: save as pkl or as df


In [27]:
def print_matrix(data):
    """Print a habit x physical-process count table for `data`.

    For every habit the table holds the number of particles flagged with each
    physical process plus the total number of particles of that habit. The
    counts are computed with a single groupby instead of a per-row loop, which
    avoids integer (positional) lookups on the ID-indexed columns and chained
    assignment into the result table.

    Args:
        data: DataFrame with a `label_habit` column and the five
            `label_proc_*` flag columns (values 0 or 1).

    Returns:
        None. The table and an explanatory note are printed.
    """
    feature_list = ["Column","Plate","Droplet","Droplet Lollipop","Irregular","Rosette","Plate Column",
                    "Dendrite","Lollipop","Graupel","Branch","Small","Out of focus","Recirculation","Others"]
    process_columns = {
        "pristine": "label_proc_pristine",
        "aggregate": "label_proc_aggregate",
        "rimed": "label_proc_rimed",
        "aged": "label_proc_aged",
        "frozen": "label_proc_frozen",
    }

    by_habit = data.groupby("label_habit")
    counts = pd.DataFrame({name: by_habit[column].sum() for name, column in process_columns.items()})
    counts["total"] = by_habit.size()

    # Habits outside the expected list are appended instead of raising a KeyError.
    unexpected = [habit for habit in counts.index if habit not in feature_list]
    matrix = (
        counts.reindex(feature_list + unexpected, fill_value=0)
        .astype(int)
        .rename_axis(None)  # keep the printed header free of the groupby key name
    )

    print(matrix)
    print("\nThe sum of all physical processes of a habit can be bigger than the total, \nsince an ice crystal can have multiple physical processes.")
    
    
# Habit x physical-process count table for the preprocessed dataset.
print_matrix(data)

                  pristine  aggregate  rimed  aged  frozen  total
Column                1008        953    385  1342       0   3436
Plate                  124         36      0   275       0    400
Droplet                  0        332     27    71     546    955
Droplet Lollipop       142          0      0     0       0    142
Irregular              694         71    138   615       0   1448
Rosette                  7          0      0     0       0      7
Plate Column            70          0      0     0       0     70
Dendrite                 0          0      0     0       0      0
Lollipop                 0          0      0     8       0      8
Graupel                  1          0      0     0       0      1
Branch                   0          0      0     0       0      0
Small                  201          0      0     0       0    201
Out of focus            17          0      0     0       0     17
Recirculation          140          0      0     0       0    140
Others    

In [5]:

def show_imgs(data, *args):
    """Plot the `img_abs` and `img_ang` image of the selected rows side by side.

    Args:
        data: DataFrame with `img_abs` and `img_ang` columns holding 2-D arrays.
        *args: One or more row selectors, one subplot row each. Integers are
            row positions when the index is not integer-typed (as with the
            particle-ID index used in this notebook); any other selector is
            looked up as an index label.

    Raises:
        ValueError: If no row selector is given.
    """
    if not args:
        raise ValueError("show_imgs needs at least one row to display")

    def pick(series, key):
        # Older pandas silently resolved integer keys on a non-integer index by
        # position; that fallback is deprecated, so do it explicitly.
        if isinstance(key, (int, np.integer)) and not pd.api.types.is_integer_dtype(series.index):
            return series.iloc[key]
        return series[key]

    # squeeze=False keeps `ax` two-dimensional even when a single row is shown,
    # so one code path serves both the single- and the multi-row case.
    f, ax = plt.subplots(len(args), 2, figsize=(15, 25), squeeze=False)
    for row, idx in enumerate(args):
        for col, column in enumerate(("img_abs", "img_ang")):
            img = pick(data[column], idx)
            # NOTE(review): the image is only scaled by its maximum; `img_ang`
            # is derived from np.angle and may contain negative values, so
            # confirm that this is the intended normalisation.
            ax[row, col].imshow(Img.fromarray(img * 255 / np.max(img)).convert("RGB"))

# Show the rows at positions 1-6.
# NOTE: needs `data` from the loading and preprocessing cells; the recorded
# NameError below comes from executing this cell before `data` existed.
show_imgs(data,1,2,3,4,5,6)

NameError: name 'data' is not defined

In [58]:
def print_statistic(data):
    """Print the class distribution of a labelled dataset.

    Three sections are printed: particle types (share of all samples), habits
    of ice particles and physical processes of ice particles (both as a share
    of the number of ice particles). A share is printed as "n/a" when its
    denominator is zero instead of raising ZeroDivisionError.

    Args:
        data: DataFrame with integer-coded `label_particle` (0-5) and
            `label_habit` (0-13) columns and the five `label_proc_*` flag
            columns.

    Returns:
        None. Everything is printed.
    """
    # NOTE(review): habit codes 12 and 13 are captioned "Recirculation" and
    # "Others" here, while the habit lists elsewhere in this notebook place
    # "Out of focus" before "Recirculation" -- confirm the code-to-name mapping.
    particle_codes = (
        ("Ice:\t\t", 0),
        ("Water:\t\t", 1),
        ("noIce:\t\t", 2),
        ("Artifact:\t", 3),
        ("Unsure:\t\t", 4),
        ("Unknown:\t", 5),
    )
    habit_codes = (
        ("Column:\t\t", 0),
        ("Plate:\t\t", 1),
        ("Droplet:\t", 2),
        ("Dropl.Lollipop:\t", 3),
        ("Irregular:\t", 4),
        ("Rosette:\t", 5),
        ("PlateColumn:\t", 6),
        ("Dendrite:\t", 7),
        ("Lollipop:\t", 8),
        ("Graupel:\t", 9),
        ("Branch:\t\t", 10),
        ("Small:\t\t", 11),
        ("Recirculation:\t", 12),
        ("Others:\t\t", 13),
    )
    process_flags = (
        ("Pristine:\t", 'label_proc_pristine'),
        ("Aggregate:\t", 'label_proc_aggregate'),
        ("Rimed:\t\t", 'label_proc_rimed'),
        ("Aged:\t\t", 'label_proc_aged'),
        ("Frozen:\t\t", 'label_proc_frozen'),
    )

    def count_equal(column, value):
        # Number of rows whose `column` equals `value`.
        return int((data[column] == value).sum())

    def print_rows(rows, total):
        # One line per (caption, count): caption, count and share of `total`.
        for caption, count in rows:
            share = (str(round(count * 100 / total, 2)) + "%") if total else "n/a"
            print(caption, count, "\t", share)

    num_samples = len(data.index)
    particle_rows = [(caption, count_equal('label_particle', code)) for caption, code in particle_codes]
    num_particle_ice = particle_rows[0][1]

    print("### Particle Information ###")
    print("Total number of samples:", num_samples)
    print_rows(particle_rows, num_samples)
    print("")

    habit_rows = [(caption, count_equal('label_habit', code)) for caption, code in habit_codes]

    print("### Habit Information of Ice Particles ###")
    print("Total number of Ice particles: ", num_particle_ice)
    print_rows(habit_rows, num_particle_ice)
    print("")

    process_rows = [(caption, count_equal(column, 1)) for caption, column in process_flags]

    print("### Physical Process Information of Ice Particles ###")
    print_rows(process_rows, num_particle_ice)
    
# NOTE: print_statistic reads a `label_particle` column and compares
# `label_habit` with integer codes, so `data` has to provide both.
print_statistic(data)



### Particle Information ###
Total number of samples: 7200
Ice:		 6825 	 94.79%
Water:		 40 	 0.56%
noIce:		 171 	 2.38%
Artifact:	 140 	 1.94%
Unsure:		 4 	 0.06%
Unknown:	 20 	 0.28%

### Habit Information of Ice Particles ###
Total number of Ice particles:  6825
Column:		 3436 	 50.34%
Plate:		 400 	 5.86%
Droplet:	 955 	 13.99%
Dropl.Lollipop:	 142 	 2.08%
Irregular:	 1448 	 21.22%
Rosette:	 7 	 0.1%
PlateColumn:	 70 	 1.03%
Dendrite:	 0 	 0.0%
Lollipop:	 8 	 0.12%
Graupel:	 1 	 0.01%
Branch:		 0 	 0.0%
Small:		 201 	 2.95%
Recirculation:	 17 	 0.25%
Others:		 140 	 2.05%

### Physical Process Information of Ice Particles ###
Pristine:	 2387 	 34.97%
Aggregate:	 1392 	 20.4%
Rimed:		 550 	 8.06%
Aged:		 2311 	 33.86%
Frozen:		 546 	 8.0%


In [None]:
# Print the full column summary, then display the first two rows.
data.info(verbose=True)
data.head(2)

In [4]:
# from holosuite/predict_pipeline/preprocessing.py
def preprocess_img(img, img_size=-1):
    """Clean an image and optionally bring it to a fixed square size.

    NaN pixels are replaced with zeros. With a non-negative `img_size` the
    image is zero-padded to a square (an odd surplus pixel goes to the
    bottom/right) and then rescaled to `img_size`x`img_size`.

    Args:
        img: An image array.
        img_size: Side length of the output image; a negative value keeps the
            original dimensions.

    Returns:
        The preprocessed image array.
    """
    cleaned = np.nan_to_num(img)
    if img_size < 0:
        # Nothing to resize: hand back the NaN-free image as is.
        return cleaned

    side = max(cleaned.shape)
    missing_rows = side - cleaned.shape[0]
    missing_cols = side - cleaned.shape[1]

    # Split the missing rows/columns evenly between both sides.
    top, odd_row = divmod(missing_rows, 2)
    left, odd_col = divmod(missing_cols, 2)
    padding = ((top, top + odd_row), (left, left + odd_col))

    squared = np.pad(cleaned, padding, "constant", constant_values=0)
    return ndimage.zoom(squared, img_size/side)

In [3]:
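# Encoding rules for encode_label:
# - anything that is not an ice particle is discarded (an empty list is returned)
# - the basic habit is stored as a string
# - the physical processes are stored as a list of 0/1 flags (several can be set):
#   index 0 -> pristine
#   index 1 -> aggregate
#   index 2 -> rimed
#   index 3 -> aged
#   index 4 -> frozen
# - the result is the list [habit, process_flags]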

def encode_label(label):
    """Encode a raw particle label as ``[habit, process_flags]``.

    Raw labels are '_'-separated: ``<particle>_<habit>[_<process>...]``, for
    example ``"Ice_Column_rimed_aged"``. Habits whose own name contains a '_'
    ("Out_Of_Focus", "Plates_columns", "Droplet_lollipop") are recognised
    before the process names are read.

    Args:
        label: The raw label string.

    Returns:
        ``[habit, flags]`` where `habit` is the normalised habit name and
        `flags` is a list of five 0/1 values in the order pristine, aggregate,
        rimed, aged, frozen. A label without any process is encoded as
        pristine. An empty list is returned when the particle is not "Ice",
        no habit is given, or the habit or a process name is unknown (the
        latter two cases also print a diagnostic message).

    Raises:
        AssertionError: If pristine is combined with another process.
    """
    basic_habit_dict = {
        "Column": "Column",
        "column": "Column",
        "Plate": "Plate",
        "Plates": "Plate",
        "Droplet": "Droplet",
        "Droplets": "Droplet",
        "Droplet Lollipop": "Droplet Lollipop",
        "Irregular": "Irregular",
        "Rosette": "Rosette",
        "Plate Column": "Plate Column",
        "Dendrite": "Dendrite",
        "lollipop": "Lollipop",
        "Graupel": "Graupel",
        "Branch": "Branch",
        "Small": "Small",
        "Out": "Out of focus", # out of focus
        "Recirculation": "Recirculation",
        "Others": "Others",
        "Rimed": "Basic Habit: Rimed", # Julies data
        "Aged": "Basic Habit: Aged" # Julies data
    }
    physical_process_dict = {
        "pristine": 0,
        "aggregate": 1,
        "Aggregate": 1,
        "rimed": 2,
        "aged": 3,
        "frozen": 4
    }

    original_label = label  # only used in the diagnostic messages below

    # Presumably a missing label arrives as a non-string value (load_data sees
    # missing IDs as arrays); such entries are discarded like non-ice ones.
    if not isinstance(label, str):
        return []

    tokens = label.split('_')
    if tokens[0] != "Ice" or len(tokens) < 2:  # not Ice or no basic habit defined
        return []

    habit_label = tokens[1]
    remaining = tokens[2:]
    if habit_label not in basic_habit_dict:
        print(habit_label, "not in habit dictionary", original_label)
        return []

    # "Ice_Out_Of_Focus": the two tokens after "Out" belong to the habit name.
    if habit_label == "Out":
        remaining = remaining[2:]
    # Plate columns are sometimes labelled "Plates_columns".
    if habit_label in ("Plates", "plates") and remaining and remaining[0].startswith("column"):
        remaining = remaining[1:]
        habit_label = "Plate Column"
    # Droplet lollipops are sometimes labelled "Droplet_lollipop".
    if habit_label in ("Droplet", "Droplets") and remaining and remaining[0].startswith("lollipop"):
        remaining = remaining[1:]
        habit_label = "Droplet Lollipop"

    physical_process_list = [0, 0, 0, 0, 0]
    label_list = [basic_habit_dict[habit_label]]
    if not remaining:  # no physical process defined -> pristine
        physical_process_list[physical_process_dict["pristine"]] = 1
        label_list.append(physical_process_list)
        return label_list

    for process_label in remaining:
        if process_label not in physical_process_dict:
            print(process_label, "not in physical process dictionary ", original_label)
            return []
        physical_process_list[physical_process_dict[process_label]] = 1

    # Sanity checks: at least one process is set, and pristine excludes all others.
    assert any(physical_process_list)
    assert physical_process_list[0] == 0 or not any(physical_process_list[1:])
    label_list.append(physical_process_list)
    return label_list
    

    
def split_labels(old_label):
    """Split a label at its first '_'.

    Examples:
        split_labels("abcd_efg_h") -> ("abcd", "efg_h")
        split_labels("abcdef")     -> ("abcdef", -1)

    Returns:
        A pair ``(head, rest)``. `head` is always a string; `rest` is the text
        after the first '_' or -1 when the label contains no '_'.
    """
    cut = old_label.find('_')
    return (old_label, -1) if cut < 0 else (old_label[:cut], old_label[cut + 1:])
    

# Spot checks of encode_label on representative raw labels.
for example in (
    "Ice_Small_frozen",
    "Ice_Plates_columns",
    "Ice_Droplet_lollipop",
    "Ice_Out_Of_Focus",
    "Ice_Droplet_aggregate",
    "Smt_Droplet_frozen",
):
    print(encode_label(example))

['Small', [0, 0, 0, 0, 1]]
['Plate Column', [1, 0, 0, 0, 0]]
['Droplet Lollipop', [1, 0, 0, 0, 0]]
['Out of focus', [1, 0, 0, 0, 0]]
['Droplet', [0, 1, 0, 0, 0]]
[]


In [27]:
def encode_label_old(label):
    """Earlier label encoder that also keeps non-ice particles.

    Raw labels are '_'-separated: ``<particle>[_<habit>[_<process>...]]``.

    Args:
        label: The raw label string.

    Returns:
        A tuple ``(particle, habit, processes)``:
        - `particle`: value from the particle dictionary, or -1 if unknown.
        - `habit`: normalised habit name, or -1 if missing or unknown.
        - `processes`: integer numpy array with one 0/1 entry per physical
          process (pristine, aggregate, rimed, aged, frozen). It is filled
          with -1 when no process information applies (unknown particle or
          habit, no habit, "Out of focus", unknown process name). A habit
          without any process is encoded as pristine.
    """
    # use a dicts to quickly add/change/merge habits and add new spellings of habits
    particle_dict = {
        "Ice": 0,
        "Water": 1,
        "No": "No Ice", # No_Ice
        "Artifact": "Artifact",
        "Unsure": "Unsure",
        "Unknown": "Unknown"
    }
    basic_habit_dict = {
        "Column": "Column",
        "column": "Column",
        "Plate": "Plate",
        "Plates": "Plate",
        "Droplet": "Droplet",
        "Droplets": "Droplet",
        "Droplet lollipop": "Droplet Lollipop",
        "Irregular": "Irregular",
        "Rosette": "Rosette",
        "Plate Column": "Plate Column",
        "Dendrite": "Dendrite",
        "lollipop": "Lollipop",
        "Graupel": "Graupel",
        "Branch": "Branch",
        "Small": "Small",
        "Out": "Out of focus", # out of focus
        "Recirculation": "Recirculation",
        "Others": "Others",
        "Rimed": "Basic Habit: Rimed", # Julies data
        "Aged": "Basic Habit: Aged" # Julies data
    }
    physical_process_dict = {
        "pristine": 0,
        "aggregate": 1,
        "Aggregate": 1,
        "rimed": 2,
        "aged": 3,
        "frozen": 4
    }

    original_label = label  # only used in the diagnostic messages below

    # get num of distinguishable values in dictionary
    num_physical_processes = len(set(physical_process_dict.values()))

    def filled(value):
        # Process encoding with every entry set to `value`. The builtin `int`
        # replaces the `np.int` alias, which newer NumPy releases no longer have.
        return np.full(shape=num_physical_processes, fill_value=value, dtype=int)

    def pristine_only():
        # Encoding used when no physical process is given.
        encoding = filled(0)
        encoding[physical_process_dict["pristine"]] = 1
        return encoding

    # determine particle type
    particle_label, label = split_labels(label)
    if particle_label not in particle_dict:
        print(particle_label, "not in particle dictionary", original_label)
        return -1, -1, filled(-1)
    if label == -1 or particle_label == "No": # no basic habit defined or No_Ice (also no habit defined)
        return particle_dict[particle_label], -1, filled(-1)

    # determine habit
    habit_label, label = split_labels(label)
    if habit_label not in basic_habit_dict:
        print(habit_label, "not in habit dictionary", original_label)
        return particle_dict[particle_label], -1, filled(-1)
    if label == -1: # no physical process defined -> pristine
        return particle_dict[particle_label], basic_habit_dict[habit_label], pristine_only()

    # "Ice_Out_Of_Focus": the '_' inside the habit name would be mistaken for
    # the separator between habit and process
    if (habit_label == "Out"):
        return particle_dict[particle_label], basic_habit_dict[habit_label], filled(-1)

    # plate columns are sometimes labelled "plates_columns"; the '_' would be
    # mistaken for the separator between habit and process
    if (habit_label == "Plates" or habit_label == "plates") and label[:6] == "column":
        _, label = split_labels(label)
        habit_label = "Plate Column"
        if label == -1: # no physical process defined -> pristine
            return particle_dict[particle_label], basic_habit_dict[habit_label], pristine_only()

    # droplet lollipops are sometimes labelled "Droplet_lollipop"; the '_' would
    # be mistaken for the separator between habit and process
    if (habit_label == "Droplet" or habit_label == "Droplets") and label[:8] == "lollipop":
        _, label = split_labels(label)
        habit_label = "Droplet lollipop"
        if label == -1: # no physical process defined -> pristine
            return particle_dict[particle_label], basic_habit_dict[habit_label], pristine_only()

    # determine physical processes -> can be multiple
    physical_process_encoding = filled(0)
    process_label, label = split_labels(label)
    while True:
        # Every process name is validated, not only the first one.
        if process_label not in physical_process_dict:
            print(process_label, "not in physical process dictionary ", original_label)
            return particle_dict[particle_label], basic_habit_dict[habit_label], filled(-1)
        physical_process_encoding[physical_process_dict[process_label]] = 1
        if label == -1:
            break
        process_label, label = split_labels(label)

    return particle_dict[particle_label], basic_habit_dict[habit_label], physical_process_encoding


def split_labels(old_label):
    """Split a label at its first '_'.

    Examples:
        split_labels("abcd_efg_h") -> ("abcd", "efg_h")
        split_labels("abcdef")     -> ("abcdef", -1)

    Returns:
        A pair ``(head, rest)``. `head` is always a string; `rest` is the text
        after the first '_' or -1 when the label contains no '_'.
    """
    cut = old_label.find('_')
    return (old_label, -1) if cut < 0 else (old_label[:cut], old_label[cut + 1:])

In [2]:
def load_data(file_paths=None):
    """Load labelled particle images from MATLAB files into one DataFrame.

    Every file is read like in holosuite/predict_pipeline/preprocessing.py:
    the first variable of the file is expected to provide the fields
    `prtclID`, `prtclIm` and `cpType`. Labels are passed through encode_label
    and particles with an empty encoding are skipped.

    Args:
        file_paths: Iterable of .mat file paths. Defaults to the labelled file
            ``../data/huiying_labeled/huiying_part1.mat`` relative to the
            current working directory.

    Returns:
        DataFrame indexed by particle ID with the columns `img` (raw image)
        and `label` (encoded label). Duplicate IDs are reported but kept.
    """
    if file_paths is None:
        data_dir = os.path.join(os.getcwd(), "..", "data")
        file_paths = [
            os.path.join(data_dir, "huiying_labeled", "huiying_part1.mat"),
            # Further labelled files, currently disabled:
            # os.path.join(data_dir, "huiying_labeled", "huiying_part2.mat"),
            # os.path.join(data_dir, "huiying_labeled", "huiying_part3.mat"),
            # os.path.join(data_dir, "julie_labeled", "julie_part1.mat"),
            # os.path.join(data_dir, "julie_labeled", "julie_part2.mat"),
            # os.path.join(data_dir, "julie_labeled", "julie_part3.mat"),
        ]

    data_index = []
    seen_ids = set()  # mirrors data_index for constant-time duplicate checks
    data_dict = {"img": [], "label": []}

    for file_path in file_paths:
        print(file_path)

        var = io.whosmat(file_path)[0][0]
        mat = io.loadmat(file_path, squeeze_me=True, struct_as_record=False)
        ids = mat[var].prtclID
        imgs = mat[var].prtclIm
        labels = mat[var].cpType

        for j, (id_, img, label) in enumerate(zip(ids, imgs, labels)):
            # Check for missing IDs
            if isinstance(id_, np.ndarray):
                id_ = "no_id_{}".format(j + 1)
                print("prtclID missing in row {}, substituting with '{}'".format(j + 1, id_))
            # Check for duplicate IDs (reported only, the row is still kept)
            if id_ in seen_ids:
                print("ID {} not unique".format(id_))

            # Keep only particles with a non-empty encoding
            label = encode_label(label)
            if len(label) != 0:
                seen_ids.add(id_)
                data_index.append(id_)
                data_dict["label"].append(label)
                data_dict["img"].append(img)

    dataset = pd.DataFrame(data_dict, index=data_index)
    return dataset