## Kaggle Retinal Lesions Segmentation

### Train labels

 Loading the dataset

In [173]:
from sklearn.model_selection import train_test_split
from shutil import copy, unpack_archive, rmtree

import pandas as pd
import numpy as np
import cv2
import os

In [174]:
def reset_path(directory:str):
    """
    Deletes an existing directory and re-creates it

    - directory: path to directory
    """
    if os.path.exists(directory):
        rmtree(directory)
    os.mkdir(directory)

In [175]:
DS_BASE_PATH = 'Data'
DB_PATH = 'DB'
reset_path(DB_PATH)

In [176]:
filename = 'retinal-lesions-v20191227'
ARCHIVE_FILE_PATH = os.path.join(DS_BASE_PATH, f'{filename}.zip')

rmtree(os.path.join(DS_BASE_PATH, filename))
unpack_archive(ARCHIVE_FILE_PATH, DS_BASE_PATH)

grading_csv_path = os.path.sep.join([DS_BASE_PATH,f'{filename}' ,'dr_grades.csv'])
grading_orig = pd.read_csv(grading_csv_path)
grading_orig.dropna()

segmentation_csv_path = os.path.sep.join([DS_BASE_PATH,f'{filename}' ,'segmentation_metadata.csv'])
segmentation_orig = pd.read_csv(segmentation_csv_path)
segmentation_orig.dropna()

display(grading_orig)
display(segmentation_orig)

Unnamed: 0,image id,kaggle label,our label
0,7384_right,2,2
1,27099_left,3,2
2,25531_right,2,2
3,34701_right,2,2
4,3166_right,2,1
...,...,...,...
1588,26615_right,2,2
1589,30283_left,2,2
1590,16602_right,2,1
1591,42897_left,2,2


Unnamed: 0,name,microaneurysm,retinal_hemorrhage,hard_exudate,cotton_wool_spots,neovascularization,vitreous_hemorrhage,preretinal_hemorrhage,fibrous_proliferation
0,10037_left,1,1,0,0,0,0,0,0
1,10047_right,1,1,1,0,0,0,0,0
2,1008_right,1,1,1,1,0,0,0,0
3,10091_right,1,1,1,0,0,0,0,0
4,10105_right,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1570,9893_right,1,1,1,0,0,0,0,0
1571,9944_left,1,0,0,0,0,0,0,0
1572,9990_right,1,1,0,0,0,0,0,0
1573,99_right,1,1,1,0,0,0,0,0


**NOTA MILTON**: Se debe de dividir las imágenes en training/val/test por la cantidad de lesiones, no por la cantidad de imágenes

In [177]:
import xmltodict

path = 'Data/retinal-lesions-v20191227/lesion_segs_896x896/localizacion'
columns = ['microaneurysm']
# lesion_file_dict = {}
dataset = {}

for col in segmentation_orig.columns[1:]:
    files = [f for f in os.listdir(path) if col in f]

    # display(files)

    elements = 0

    for f in files:
        img = f.split('__')[0]
        p = os.path.join(path, f)
        
        if img not in dataset.keys():
            dataset[img] = {}

        # if img not in lesion_file_dict.keys():
        #     lesion_file_dict[img] = {
        #         'name': img
        #     }

        with open(p) as fd:
            xml = xmltodict.parse(fd.read())
            lesions = len(xml['annotation']['object'])
            
            elements += lesions
            # lesion_file_dict[img][col] = lesions
            dataset[img][col] = xml['annotation']['object']
        
    
    display(f'{col}: {elements}')

'microaneurysm: 449'

'retinal_hemorrhage: 343'

'hard_exudate: 93'

'cotton_wool_spots: 62'

'neovascularization: 15'

'vitreous_hemorrhage: 0'

'preretinal_hemorrhage: 0'

'fibrous_proliferation: 0'

In [178]:
display(dataset)

{'1413_left': {'microaneurysm': [{'name': 'microaneurysm',
    'pose': 'Unspecified',
    'truncated': '0',
    'difficult': '0',
    'bndbox': {'xmin': '269', 'ymin': '297', 'xmax': '318', 'ymax': '357'}},
   {'name': 'microaneurysm',
    'pose': 'Unspecified',
    'truncated': '0',
    'difficult': '0',
    'bndbox': {'xmin': '369', 'ymin': '465', 'xmax': '406', 'ymax': '487'}}],
  'retinal_hemorrhage': [{'name': 'retinal hemorrhage',
    'pose': 'Unspecified',
    'truncated': '0',
    'difficult': '0',
    'bndbox': {'xmin': '354', 'ymin': '487', 'xmax': '402', 'ymax': '521'}},
   {'name': 'retinal hemorrhage',
    'pose': 'Unspecified',
    'truncated': '0',
    'difficult': '0',
    'bndbox': {'xmin': '610', 'ymin': '273', 'xmax': '655', 'ymax': '306'}}],
  'hard_exudate': [{'name': 'hard exudate',
    'pose': 'Unspecified',
    'truncated': '0',
    'difficult': '0',
    'bndbox': {'xmin': '365', 'ymin': '435', 'xmax': '406', 'ymax': '491'}},
   {'name': 'hard exudate',
    'pos

In [179]:
lesion_map = {l:idx for idx, l in enumerate(columns)}

print(lesion_map)

yolo_dataset = {}

for idx, img in enumerate(dataset.keys()):
    for idx, lesion in enumerate(dataset[img].keys()):
        if lesion in columns:
            objs = dataset[img][lesion]
        
            if not (type(objs) == list):
                objs = [objs]
                
            for obj in objs:
                bndbox= obj['bndbox']

                bndbox = [int(v) for v in list(bndbox.values())]
                # print(bndbox)
                xcenter = (bndbox[2]+bndbox[0])/2
                ycenter = (bndbox[3]+bndbox[1])/2
                width = bndbox[2]-bndbox[0]
                height = bndbox[3]-bndbox[1]
                
                bndbox = np.array([xcenter, ycenter, width, height])
                # print(bndbox)
                bndbox = bndbox / 896
                bndbox = [str(b) for b in bndbox.tolist()]

                lession_class = lesion_map[lesion]
                dp = [str(lession_class)]
                dp.extend(bndbox)
                dp = ' '.join(dp)

                if img not in yolo_dataset.keys():
                    yolo_dataset[img] = []
                
                yolo_dataset[img].append(dp)
yolo_dataset

{'microaneurysm': 0}


{'1413_left': ['0 0.3275669642857143 0.36495535714285715 0.0546875 0.06696428571428571',
  '0 0.43247767857142855 0.53125 0.041294642857142856 0.024553571428571428'],
 '229_right': ['0 0.41796875 0.44029017857142855 0.04352678571428571 0.07254464285714286'],
 '214_left': ['0 0.13950892857142858 0.26674107142857145 0.049107142857142856 0.05803571428571429',
  '0 0.04799107142857143 0.36495535714285715 0.05580357142857143 0.05133928571428571',
  '0 0.44754464285714285 0.2728794642857143 0.05357142857142857 0.036830357142857144',
  '0 0.5011160714285714 0.33816964285714285 0.046875 0.05133928571428571',
  '0 0.7109375 0.7784598214285714 0.060267857142857144 0.06361607142857142'],
 '837_right': ['0 0.45703125 0.5496651785714286 0.05915178571428571 0.04352678571428571'],
 '1632_right': ['0 0.3130580357142857 0.14899553571428573 0.05915178571428571 0.052455357142857144',
  '0 0.4760044642857143 0.16015625 0.036830357142857144 0.0390625',
  '0 0.6640625 0.18973214285714285 0.02232142857142857

In [180]:
yolo_df = pd.DataFrame(yolo_dataset.keys(), columns=['image'])
yolo_df

Unnamed: 0,image
0,1413_left
1,229_right
2,214_left
3,837_right
4,1632_right
5,1420_right
6,306_left
7,383_left
8,239_left
9,99_right


In [181]:
train_idx, remain_idx, train_y, remain_y = train_test_split(
    yolo_df.index,
    yolo_df['image'],
    train_size=0.8,
    test_size=0.2,
    random_state=42
)

val_idx, test_idx, val_y, test_y = train_test_split(
    remain_idx,
    remain_y,
    train_size=0.5,
    test_size=0.5,
    random_state=42
)

In [182]:
display(yolo_df.iloc[train_idx])
display(yolo_df.iloc[val_idx])
display(yolo_df.iloc[test_idx])

Unnamed: 0,image
12,508_right
4,1632_right
34,945_right
8,239_left
3,837_right
6,306_left
40,1043_left
41,172_left
46,1277_left
15,1369_right


Unnamed: 0,image
47,534_left
19,1471_right
17,658_right
44,1008_right
26,1396_right


Unnamed: 0,image
31,317_left
45,1532_left
27,1145_right
13,250_left
25,564_right


In [183]:
def clahe_image(path:str):
    # Load the image
    img = cv2.imread(path)
    mask = cv2.imread('mask.png', 0)

    # Extract the green plane
    green_plane = img[:, :, 1]

    # Define the FA-CLAHE algorithm
    faclahe = cv2.createCLAHE(clipLimit=5.0, tileGridSize=(8, 8))

    # Apply the FA-CLAHE algorithm to the green plane
    faclahe_img = faclahe.apply(green_plane)

    masked_img = cv2.bitwise_and(faclahe_img, faclahe_img, mask=mask)

    cv2.imwrite(path, masked_img)

In [184]:
IMG_PATH = 'Data/retinal-lesions-v20191227/images_896x896'

set_paths = [
    (yolo_df.iloc[train_idx]['image'].to_list(), 'train'),
    (yolo_df.iloc[val_idx]['image'].to_list(), 'val'),
    (yolo_df.iloc[test_idx]['image'].to_list(), 'test')
]

def create_db_path(base:str=DB_PATH, path:str=None):
    
    if path != None:
        path =  os.path.join(base, path)
        if not os.path.exists(path):
            print(f'Creating {path}')
            os.mkdir(path)
        return path
    else:
        print("Empty path, therefore not created")


im_path = create_db_path(path='images')
label_path = create_db_path(path='labels')

for sp in set_paths:
    for j, img in enumerate(yolo_dataset.keys()):
        current_set, set_path = sp
        if j==0:
            print(f'Subset:{set_path}')
        
        if img in current_set:
            print(f'[{j}/{len(yolo_dataset.keys())}] {im_dest_path}, {lb_path}')
            
            im_orig_path = os.path.join(IMG_PATH, img + '.jpg')
            im_dest_path = create_db_path(
                                    base=im_path,
                                    path=set_path)
            im_dest_path = os.path.join(im_dest_path, img + '.jpg')
            lb_path = create_db_path(
                                    base=label_path,
                                    path=set_path)
            lb_path = os.path.join(lb_path, img+'.txt')
            
            clahe_image(im_orig_path)
            copy(im_orig_path, im_dest_path)
            
            for obj in yolo_dataset[img]:
                print(obj)
                with open(file=lb_path, mode='a') as f:
                    f.write(obj+'\n')



# for sp in set_paths:
#     for j, idx in enumerate(yolo_dataset.keys()):
#         current_set, set_path = sp
#         if j==0:
#             print(f'Subset:{set_path}')
#         if idx in current_set:

#             im_orig_path = os.path.join(IMG_PATH, yolo_dataset[idx]['name']+'.jpg')
            

#             if not os.path.exists(im_dest_path):
            
            
            

Creating DB/images
Creating DB/labels
Subset:train
[0/49] DB/images/test/1532_left.jpg, DB/labels/test/1532_left.txt
Creating DB/images/train
Creating DB/labels/train
0 0.3275669642857143 0.36495535714285715 0.0546875 0.06696428571428571
0 0.43247767857142855 0.53125 0.041294642857142856 0.024553571428571428
[1/49] DB/images/train/1413_left.jpg, DB/labels/train/1413_left.txt
0 0.41796875 0.44029017857142855 0.04352678571428571 0.07254464285714286
[2/49] DB/images/train/229_right.jpg, DB/labels/train/229_right.txt
0 0.13950892857142858 0.26674107142857145 0.049107142857142856 0.05803571428571429
0 0.04799107142857143 0.36495535714285715 0.05580357142857143 0.05133928571428571
0 0.44754464285714285 0.2728794642857143 0.05357142857142857 0.036830357142857144
0 0.5011160714285714 0.33816964285714285 0.046875 0.05133928571428571
0 0.7109375 0.7784598214285714 0.060267857142857144 0.06361607142857142
[3/49] DB/images/train/214_left.jpg, DB/labels/train/214_left.txt
0 0.45703125 0.54966517857

In [185]:
import yaml

# Define the dataset directory and image subdirectories
dataset_root = DB_PATH
train_images_dir = "images/train"
val_images_dir = "images/val"
test_images_dir = "images/test"

# Define the dataset configuration
dataset_config = {
    "path": dataset_root,
    "train": train_images_dir,
    "val": val_images_dir,
    "test": test_images_dir,
    "names": {v:k for k,v in lesion_map.items()}
}

# Write the configuration to a YAML file
yaml_filepath = os.path.join(DB_PATH,"retinal_lesion.yaml")
with open(yaml_filepath, "w") as f:
    yaml.dump(dataset_config, f, sort_keys=False)

In [186]:
from shutil import make_archive

make_archive(
    base_name='DB-Microaneurysms',
    format='zip',
    base_dir=DB_PATH
)

'DB-Microaneurysms.zip'