In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from pathlib import Path
from skimage.io import imread
from skimage.color import label2rgb

from PIL import Image 
from collections import Counter
import os

#Download malaria dataset here:
#https://bbbc.broadinstitute.org/BBBC041

# Root folder of the extracted BBBC041 download; it must contain
# training.json, test.json and the image files the JSON entries point to.
malaria_dir = Path('malaria/')


def _load_split(json_name):
    """Load one annotation file and keep only rows whose image is on disk.

    Parameters
    ----------
    json_name : str
        File name of the annotation JSON inside ``malaria_dir``.

    Returns
    -------
    pandas.DataFrame
        The annotation table with two extra columns: ``path`` (a ``Path``
        under ``malaria_dir``) and ``image_available`` (always True after
        filtering).
    """
    split_df = pd.read_json(malaria_dir / json_name)
    # The first character of 'pathname' is dropped (presumably a leading
    # "/") so the remainder can be joined as a path relative to malaria_dir.
    split_df['path'] = split_df['image'].map(lambda x: malaria_dir / x['pathname'][1:])
    split_df['image_available'] = split_df['path'].map(lambda x: x.exists())
    return split_df.query('image_available')


train_df = _load_split('training.json')
test_df = _load_split('test.json')

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same frame with a fresh 0..n-1 index.
merge_df = pd.concat([train_df, test_df], ignore_index=True)

# Annotation category name -> integer class id.
cat_dict = {'red blood cell': 0,'trophozoite': 1,'difficult': 2,'ring': 3,'schizont': 4,'gametocyte': 5,'leukocyte': 6}


def retrieve_info(in_row, ax=None):
    """Flatten one image's annotations into four parallel lists.

    Parameters
    ----------
    in_row : mapping
        Must provide ``'path'`` and ``'objects'``. Each entry of
        ``'objects'`` must provide ``'category'`` and a ``'bounding_box'``
        holding ``'minimum'`` and ``'maximum'``.
    ax : optional
        Accepted for call compatibility; not used.

    Returns
    -------
    tuple of four lists
        ``(paths, minimums, maximums, label_ids)``, one element per
        annotated object. ``paths`` repeats ``str(in_row['path'])``.
        Categories missing from ``cat_dict`` get the label id 20.
    """
    image_path = str(in_row['path'])

    # Single pass over the annotations, collecting (label, min, max) triples.
    triples = [
        (
            cat_dict.get(annotation['category'], 20),
            annotation['bounding_box']['minimum'],
            annotation['bounding_box']['maximum'],
        )
        for annotation in in_row['objects']
    ]

    labels = [triple[0] for triple in triples]
    minimums = [triple[1] for triple in triples]
    maximums = [triple[2] for triple in triples]
    paths = [image_path] * len(triples)
    return paths, minimums, maximums, labels



def check_img_size(image_path):
    """Return the ``(width, height)`` of the image at ``image_path``.

    The image is opened in a ``with`` block so the underlying file handle
    is released as soon as the size has been read, instead of staying open
    until garbage collection.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file, passed straight to ``Image.open``.

    Returns
    -------
    tuple of int
        ``(width, height)`` taken from the opened image's ``size``.
    """
    with Image.open(image_path) as im:
        w = int(im.size[0])
        h = int(im.size[1])
    return w, h


def convert_to_yolo(size, box):
    """Convert a corner-style box into centre/size values scaled by image size.

    Parameters
    ----------
    size : sequence
        ``size[0]`` scales the x values and ``size[1]`` the y values
        (the caller passes image width and height).
    box : sequence
        ``(x_min, x_max, y_min, y_max)``.

    Returns
    -------
    tuple
        ``(x_center, y_center, width, height)``, each multiplied by the
        reciprocal of the matching ``size`` entry.
    """
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]

    center_x = (box[0] + box[1]) / 2.0
    center_y = (box[2] + box[3]) / 2.0
    span_x = box[1] - box[0]
    span_y = box[3] - box[2]

    return center_x * scale_x, center_y * scale_y, span_x * scale_x, span_y * scale_y

# Count how many annotated objects of each class id occur across all images.
# The label ids are fed straight into one Counter; no per-image DataFrame is
# needed just to count them.
class_counts = Counter()
for row_index, image_row in merge_df.iterrows():
    doc_li, min_val_li, max_val_li, lab_id_li = retrieve_info(image_row)
    class_counts.update(lab_id_li)

# A Counter returns 0 for ids that never occurred, so every name below is
# always defined. Ids outside 0-6 (unknown categories) are not reported.
red_blood_cell = class_counts[0]
trophozoite = class_counts[1]
difficult = class_counts[2]
ring = class_counts[3]
schizont = class_counts[4]
gametocyte = class_counts[5]
leukocyte = class_counts[6]

print("Cell stats:")
print("[0]red_blood_cell: ", red_blood_cell)
print("[1]trophozoite: ", trophozoite)
print("[2]difficult: ", difficult)
print("[3]ring: ", ring)
print("[4]schizont: ", schizont)
print("[5]gametocyte: ", gametocyte)
print("[6]leukocyte: ", leukocyte)

        
        

Cell stats:
[0]red_blood_cell:  83034
[1]trophozoite:  1584
[2]difficult:  446
[3]ring:  522
[4]schizont:  190
[5]gametocyte:  156
[6]leukocyte:  103


In [10]:
# Annotation category name -> integer class id (position in the list below).
# This repeats the mapping already defined in the first cell.
cat_dict = {
    category: class_id
    for class_id, category in enumerate([
        'red blood cell',
        'trophozoite',
        'difficult',
        'ring',
        'schizont',
        'gametocyte',
        'leukocyte',
    ])
}



In [11]:
# Create one YOLOv5 label file per image, one line per kept bounding box:
#   class x_center y_center width height
# with the four box numbers scaled by the image width/height.

# Class ids dropped before writing: 0 = red blood cell and 6 = leukocyte are
# not malaria-related, and 2 = "difficult" is excluded to avoid false
# positives / false negatives.
excluded_label_ids = {0, 2, 6}

# Every remaining class (trophozoite, ring, schizont, gametocyte) is written
# as the single class {malaria: 0}.
malaria_class_id = 0

for row_index, image_row in merge_df.iterrows():
    doc_li, min_val_li, max_val_li, lab_id_li = retrieve_info(image_row)

    kept_boxes = [
        (min_val, max_val)
        for min_val, max_val, lab_id in zip(min_val_li, max_val_li, lab_id_li)
        if lab_id not in excluded_label_ids
    ]
    if not kept_boxes:
        # No label file is written for images without malaria boxes.
        continue

    image_path = doc_li[0]
    # The image size is the same for every box of this image: read it once.
    img_w, img_h = check_img_size(image_path)

    # The .txt file is written next to the image, sharing its base name.
    label_path = os.path.splitext(image_path)[0] + ".txt"
    with open(label_path, "w") as out_file:
        for min_val, max_val in kept_boxes:
            # convert_to_yolo expects (x_min, x_max, y_min, y_max);
            # 'c' is used as the x coordinate and 'r' as the y coordinate.
            box = (min_val['c'], max_val['c'], min_val['r'], max_val['r'])
            x, y, box_w, box_h = convert_to_yolo((img_w, img_h), box)
            fields = (malaria_class_id, x, y, box_w, box_h)
            out_file.write(" ".join(str(field) for field in fields))
            out_file.write("\n")