### Init

**Load modules**

In [1]:
# Libraries
import os
import csv
import pandas as pd

**Globals**

In [2]:
# Dataset root (relative path) and the folder whose files are listed as images
DATA_DIR   = '../../data'
IMAGES_DIR = DATA_DIR + '/images'

# Per-split annotation metadata, stored as JSON
TRAIN_JSON_FILEPATH = DATA_DIR + '/training.json'
TEST_JSON_FILEPATH  = DATA_DIR + '/test.json'

# CSV that receives one row per annotated object of both splits
ALL_DATA_CSV_FILEPATH = DATA_DIR + '/all-data.csv'

### Dataset load

**Load dataset**

In [3]:
# Annotation metadata of the training split
train_df : pd.DataFrame = pd.read_json(path_or_buf=TRAIN_JSON_FILEPATH)

# Annotation metadata of the test split
test_df : pd.DataFrame = pd.read_json(path_or_buf=TEST_JSON_FILEPATH)

# Names of all entries found in the images folder
images : list[str] = os.listdir(IMAGES_DIR)

# Preview of the training annotations
train_df

Unnamed: 0,image,objects
0,{'checksum': '676bb8e86fc2dbf05dd97d51a64ac0af...,"[{'bounding_box': {'minimum': {'r': 1057, 'c':..."
1,{'checksum': '1225a18efce159eddf7b0e80e0ea642c...,"[{'bounding_box': {'minimum': {'r': 734, 'c': ..."
2,{'checksum': '3eaf840523c30fdf38897ffa01e194eb...,"[{'bounding_box': {'minimum': {'r': 724, 'c': ..."
3,{'checksum': '8a111dffacfa433029492780b9535091...,"[{'bounding_box': {'minimum': {'r': 563, 'c': ..."
4,{'checksum': 'ccef403e971460b86444cca669e68ca1...,"[{'bounding_box': {'minimum': {'r': 618, 'c': ..."
...,...,...
1203,{'checksum': 'd6ccf3550df717f6852d3c8266a1a590...,"[{'bounding_box': {'minimum': {'r': 1051, 'c':..."
1204,{'checksum': 'd06843db5b2a68cdaffedf7da7d12454...,"[{'bounding_box': {'minimum': {'r': 372, 'c': ..."
1205,{'checksum': 'e24d3a0f96233892e3a55f57fd6c0f5f...,"[{'bounding_box': {'minimum': {'r': 580, 'c': ..."
1206,{'checksum': '48c95e8a3623637d71590b376b34b4cc...,"[{'bounding_box': {'minimum': {'r': 160, 'c': ..."


**Split dataset**

In [4]:
# File name (last '/'-separated component of `pathname`) of every image in each split
train_imgs : list[str] = [entry['pathname'].rsplit('/', 1)[-1] for entry in train_df['image']]
test_imgs  : list[str] = [entry['pathname'].rsplit('/', 1)[-1] for entry in test_df['image']]

# Size of each split as a percentage of the entries listed in the images folder
total_images = len(images)
train_share  = round((len(train_imgs) / total_images * 100), 2)
test_share   = round((len(test_imgs) / total_images * 100), 2)

print(f'Train images count: {len(train_imgs)} | {train_share:>4}%')
print(f'Test images count:  {len(test_imgs)}  | {test_share:>4}%')

Train images count: 1208 | 90.96%
Test images count:  120  | 9.04%


**Class definitions**

In [5]:
# Cell-type category -> numeric label; the label is the category's position below
class_dict = {
    category: label
    for label, category in enumerate((
        'red blood cell',  # 0
        'trophozoite',     # 1
        'schizont',        # 2
        'difficult',       # 3
        'ring',            # 4
        'leukocyte',       # 5
        'gametocyte',      # 6
    ))
}

# List the category names in label order
print('Classes:')
for name in class_dict:
    print(f' - {name}')

Classes:
 - red blood cell
 - trophozoite
 - schizont
 - difficult
 - ring
 - leukocyte
 - gametocyte


### Input data organization

**Images info extraction**

In [6]:
# Accumulator filled by `set_info_dump`: one tuple per annotated object
img_info = []


def set_info_dump(dataset : pd.DataFrame) -> None:
    """Append one record per annotated object in `dataset` to the global `img_info`.

    Every record is the tuple
    `(max_r, max_c, min_r, min_c, category, label, path)`, where `label` is the
    `class_dict` value of the object's category and `path` is `DATA_DIR`
    directly followed by the image's `pathname`.

    Args:
        dataset: Frame with an `image` column (dicts holding a `pathname` key)
            and an `objects` column (iterables of dicts holding `category`
            and `bounding_box` keys).

    Raises:
        KeyError: If an object's category is not a key of `class_dict`.
    """
    for _, entry in dataset.iterrows():

        # `pathname` is appended as-is; no separator is inserted here
        img_path = f'{DATA_DIR}' + entry['image']['pathname']

        for annotation in entry['objects']:

            cell_type = annotation['category']
            cell_label = class_dict[cell_type]

            # Corners of the bounding box, as row (`r`) / column (`c`) values
            upper = annotation['bounding_box']['maximum']
            lower = annotation['bounding_box']['minimum']

            img_info.append((
                upper['r'], upper['c'],
                lower['r'], lower['c'],
                cell_type, cell_label, img_path,
            ))

# Collect the annotations of both splits into `img_info` (training rows first)
set_info_dump(train_df)
set_info_dump(test_df)

# CSV column names, in the same order as the fields of each `img_info` tuple
headers = ['max_r', 'max_c', 'min_r', 'min_c', 'category', 'label', 'img_path']

# Write the collected records to the CSV file.
# `newline=''` is required for files handed to `csv.writer`: the writer emits
# its own '\r\n' row terminators, and without it text-mode newline translation
# turns them into '\r\r\n' on platforms whose line separator is '\r\n'.
with open(ALL_DATA_CSV_FILEPATH, 'w', newline='') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(headers)
    writer.writerows(img_info)

# Load the stored records back from the CSV file into a pandas DataFrame
data_df : pd.DataFrame = pd.read_csv(ALL_DATA_CSV_FILEPATH)

data_df

Unnamed: 0,max_r,max_c,min_r,min_c,category,label,img_path
0,1158,1540,1057,1440,red blood cell,0,../../data/images/8d02117d-6c71-4e47-b50a-6cc8...
1,971,1403,868,1303,red blood cell,0,../../data/images/8d02117d-6c71-4e47-b50a-6cc8...
2,689,1008,578,900,red blood cell,0,../../data/images/8d02117d-6c71-4e47-b50a-6cc8...
3,408,713,304,611,red blood cell,0,../../data/images/8d02117d-6c71-4e47-b50a-6cc8...
4,312,1003,198,881,red blood cell,0,../../data/images/8d02117d-6c71-4e47-b50a-6cc8...
...,...,...,...,...,...,...,...
86030,1145,787,1047,694,red blood cell,0,../../data/images/887cc81a-bae3-4360-a115-23ae...
86031,617,1464,543,1381,red blood cell,0,../../data/images/887cc81a-bae3-4360-a115-23ae...
86032,1196,1880,1041,1725,ring,4,../../data/images/887cc81a-bae3-4360-a115-23ae...
86033,653,508,526,374,ring,4,../../data/images/887cc81a-bae3-4360-a115-23ae...
