## Dataset for Multi-Label-Classification
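Keep all instances that contain only the four retained classes (opacity, glaucoma, macular degeneration, normal), whether alone or in combination with each other. Instances labelled with diabetic retinopathy, macular edema, or retinal vascular occlusion are removed.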

In [1]:
import os
import random
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Plot styling: turn the axes grid off and use the dark background theme
plt.rcParams["axes.grid"] = False
plt.style.use('dark_background')
import warnings
# NOTE(review): this silences every warning for the whole session — consider
# narrowing the filter so deprecation notices from libraries stay visible.
warnings.filterwarnings('ignore')

In [2]:
# Load the resampled label table and preview its first rows
labels_csv = 'train_resampled.csv'
df = pd.read_csv(labels_csv)
df.head()

Unnamed: 0,filename,opacity,diabetic retinopathy,glaucoma,macular edema,macular degeneration,retinal vascular occlusion,normal,new
0,c24a1b14d253.jpg,0,0,0,0,0,1,0,retinal vascular occlusion
1,9ee905a41651.jpg,0,0,0,0,0,1,0,retinal vascular occlusion
2,3f58d128caf6.jpg,0,0,1,0,0,0,0,glaucoma
3,4ce6599e7b20.jpg,1,0,0,0,1,0,0,"opacity, macular degeneration"
4,0def470360e4.jpg,1,0,0,0,1,0,0,"opacity, macular degeneration"


In [3]:
# Check the dimensions of the loaded frame as (rows, columns)
df.shape

(4704, 9)

In [5]:
# Count how many instances carry each label combination stored in 'new'
df['new'].value_counts()

retinal vascular occlusion                                                  500
opacity                                                                     500
glaucoma                                                                    500
macular edema                                                               500
diabetic retinopathy                                                        500
normal                                                                      500
macular degeneration                                                        500
opacity, macular degeneration                                               212
opacity, diabetic retinopathy                                               186
diabetic retinopathy, macular edema                                         162
opacity, diabetic retinopathy, macular edema                                154
opacity, glaucoma                                                           140
macular edema, retinal vascular occlusio

In [6]:
# Sanity check: list the files in the resampled image folder so they can be
# compared with the filenames recorded in the CSV
resampled_dir = r'D:\data\Projects\notebooks\RetinaAI\01_Preprocessing\train_resampled\\'
image_list = os.listdir(resampled_dir)

In [7]:
# Filenames recorded in the label table, as a plain Python list
filenames_list = df['filename'].tolist()

In [8]:
# True only when the folder listing and the CSV filenames match exactly (order ignored)
sorted(image_list) == sorted(filenames_list)

True

In [21]:
# indir: folder holding the resampled images.
# outdir: folder from which the excluded images are deleted further down.
base_dir = r'D:\data\Projects\notebooks\RetinaAI\01_Preprocessing'
indir = base_dir + r'\train_resampled'
outdir = base_dir + r'\train_multilabel4'

In [22]:
# Show the column names; the label columns are used for filtering below
df.columns

Index(['filename', 'opacity', 'diabetic retinopathy', 'glaucoma',
       'macular edema', 'macular degeneration', 'retinal vascular occlusion',
       'normal', 'new'],
      dtype='object')

In [18]:
# Collect, per label, the filenames of all instances flagged with diabetic
# retinopathy, macular edema or retinal vascular occlusion (value 1 in the
# corresponding column)
dr, me, rvo = (
    df.loc[df[label] == 1, 'filename'].tolist()
    for label in ('diabetic retinopathy', 'macular edema', 'retinal vascular occlusion')
)

In [19]:
# Concatenate the three per-label filename lists into one list
complete = [*dr, *me, *rvo]

In [20]:
# Total number of collected filenames, duplicates included
len(complete)

2802

In [24]:
# Deduplicate: an image flagged with more than one of the three labels
# appears once per label in `complete`
comp = set(complete)

In [25]:
# Number of distinct filenames after deduplication
len(comp)

2305

In [27]:
# Build the output image folder and drop the excluded images from it.
# The loop only deletes from `outdir`, so `outdir` must first hold a copy of
# the images in `indir`: create that copy when the folder does not exist yet
# (without it a fresh Restart & Run All fails with FileNotFoundError).
if not os.path.isdir(outdir):
    shutil.copytree(indir, outdir)

for image in comp:
    image_path = os.path.join(outdir, image)
    # Skip files that are already gone so the cell can be re-run safely
    if os.path.exists(image_path):
        os.remove(image_path)

In [28]:
# Create a new dataframe holding only the rows whose image is not in `comp`
kept_rows = ~df['filename'].isin(comp)
df_new = df[kept_rows]

In [29]:
# Check the dimensions of the filtered frame as (rows, columns)
df_new.shape

(2399, 9)

In [31]:
# Count the label combinations that remain after filtering
df_new['new'].value_counts()

glaucoma                                   500
macular degeneration                       500
opacity                                    500
normal                                     500
opacity, macular degeneration              212
opacity, glaucoma                          140
glaucoma, macular degeneration              31
opacity, glaucoma, macular degeneration     16
Name: new, dtype: int64

In [32]:
# Sanity check: list the files currently present in the output folder
image_list = os.listdir(outdir)

In [33]:
# Filenames recorded in the filtered label table, as a plain Python list
filenames_list = df_new['filename'].tolist()

In [34]:
# True only when the output folder and the filtered frame list exactly the same files (order ignored)
sorted(image_list) == sorted(filenames_list)

True

In [35]:
# Write the filtered label table to disk; index=False keeps the row index
# out of the file
df_new.to_csv(
    'multilabel4.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)