# EDA RFMiD

In [4]:
import os
import shutil
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import progressbar
import seaborn as sn

## Getting Data

In [5]:
# Folder that is scanned for the RFMiD '.zip' archives; archive file names are
# resolved relative to this path before extraction.
RFMID_PATH = "./Data/Raw/MultiDiseaseClassification/RfmidMultidisease"

In [6]:
# Scratch directory that receives the extracted archives. It is wiped and
# recreated on every run so leftovers from a previous extraction cannot mix
# with the fresh files.
DB_PATH = './RAW_DATA'
if os.path.isdir(DB_PATH) and not os.path.islink(DB_PATH):
    shutil.rmtree(DB_PATH)
elif os.path.lexists(DB_PATH):
    # A plain file or symlink is squatting on the name: remove just that entry.
    os.remove(DB_PATH)
os.mkdir(DB_PATH)

In [7]:
def unzip_set(zipFile: str, outputPath: str = DB_PATH):
    """Extract one archive from ``RFMID_PATH`` into ``outputPath`` with 7z.

    Parameters
    ----------
    zipFile : str
        File name of the archive, relative to ``RFMID_PATH``. It is printed
        before extraction starts.
    outputPath : str
        Directory passed to 7z's ``-o`` switch. The default is the value
        ``DB_PATH`` had when this function was defined.

    Raises
    ------
    FileNotFoundError
        If the ``7z`` executable cannot be found.
    subprocess.CalledProcessError
        If 7z exits with a non-zero status.
    """
    print(zipFile)
    archive_path = os.path.join(RFMID_PATH, zipFile)
    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters reach 7z unchanged, and fail loudly if extraction fails.
    subprocess.run(['7z', 'x', archive_path, f'-o{outputPath}'], check=True)

In [8]:
# Extract every zip archive found in RFMID_PATH. unzip_set() is called only for
# its side effect, so a plain loop is used instead of building a list of None.
# Sorting makes the extraction order independent of the filesystem.
for archive_name in sorted(os.listdir(RFMID_PATH)):
    if archive_name.endswith('.zip'):
        unzip_set(archive_name)

Evaluation_Set.zip

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,8 CPUs 11th Gen Intel(R) Core(TM) i5-11300H @ 3.10GHz (806C1),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan ./Data/Raw/MultiDiseaseClassification/RfmidMultidiseas                                                                1 file, 1564446026 bytes (1492 MiB)

Extracting archive: ./Data/Raw/MultiDiseaseClassification/RfmidMultidisease/Evaluation_Set.zip
--
Path = ./Data/Raw/MultiDiseaseClassification/RfmidMultidisease/Evaluation_Set.zip
Type = zip
Physical Size = 1564446026

     10% 102 - Evaluation_Set/Validation/189.p                                           15% 144 - Evaluation_Set/Validation/226.p                                           20% 188 - Evaluation_Set/Validation/266.p                                           24% 222 - Evaluation_Set/Validation/297.p                                           32% 296 -

[None, None, None]

In [9]:
# Flatten DB_PATH: move the contents of every first-level folder one level up,
# then remove the emptied folder (the archives appear to unpack into a wrapper
# folder such as 'Evaluation_Set/').
for wrapper_name in os.listdir(DB_PATH):
    wrapper_path = os.path.join(DB_PATH, wrapper_name)
    if not os.path.isdir(wrapper_path):
        # Loose files have nothing to hoist and must not be deleted.
        continue
    for child_name in os.listdir(wrapper_path):
        # shutil.move raises if DB_PATH already holds an entry with this name,
        # so a collision is reported instead of silently overwriting data.
        shutil.move(os.path.join(wrapper_path, child_name), DB_PATH)
    os.rmdir(wrapper_path)

## Exploring the dataset

In [10]:
# Read the training labels and discard rows in which every field is missing.
csv_path = os.path.join(DB_PATH, 'RFMiD_Training_Labels.csv')
training_df = pd.read_csv(csv_path).dropna(how='all', axis=0)

# Keep only the image identifier and the DR label columns.
is_wanted_column = training_df.columns.isin(['ID', 'DR'])
dr_training_df = training_df.loc[:, is_wanted_column]

In [11]:
# Number of non-null IDs for each value of the DR column (class balance check).
dr_training_df.groupby('DR')['ID'].count()

DR
0    1544
1     376
Name: ID, dtype: int64

In [12]:
# Undersample every DR class down to the size of the smallest one. The size is
# derived from the data instead of being hard-coded, and the seed makes the
# selection repeatable across kernel restarts.
RANDOM_SEED = 42
minority_size = int(dr_training_df['DR'].value_counts().min())
balanced = dr_training_df.groupby('DR').sample(n=minority_size, random_state=RANDOM_SEED)

balanced.groupby('DR')['ID'].count()

DR
0    376
1    376
Name: ID, dtype: int64

In [13]:
# Root of the restructured, per-class dataset. It is wiped and recreated on
# every run so images from a previous run cannot linger in the class folders.
DATASET_PATH = './DB'
if os.path.isdir(DATASET_PATH) and not os.path.islink(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)
elif os.path.lexists(DATASET_PATH):
    # A plain file or symlink is squatting on the name: remove just that entry.
    os.remove(DATASET_PATH)
os.mkdir(DATASET_PATH)

## Restructuring the dataset

In [14]:
# Map each DR label ('0' / '1') to the image IDs of the balanced sample that
# carry that label.
dir_structure = {
    f'{label}': balanced.loc[balanced['DR'] == label, 'ID'].tolist()
    for label in range(2)
}

set_path = os.path.join(DATASET_PATH, 'Training')

for k, image_ids in dir_structure.items():
    # One sub-folder per class: <DATASET_PATH>/Training/<label>
    class_path = os.path.join(set_path, k)
    os.makedirs(class_path, exist_ok=True)

    if not image_ids:
        # Nothing to move for this class; skip the progress bar entirely.
        continue

    widgets = [
        f"Building Dataset Training/{k}: ",
        progressbar.Percentage(),
        " ",
        progressbar.Bar(),
        " ",
        progressbar.ETA()
    ]

    # Create the bar once per class; it used to be rebuilt and restarted for
    # every single image.
    pbar = progressbar.ProgressBar(
        maxval=len(image_ids),
        widgets=widgets
    ).start()

    for moved, v in enumerate(image_ids, start=1):
        origin = os.path.join(DB_PATH, 'Training', f'{v}.png')
        destination = os.path.join(class_path, f'{v}.png')
        # Raises if the source image is missing, instead of silently leaving
        # a hole in the dataset.
        shutil.move(origin, destination)
        pbar.update(moved)

    pbar.finish()

Building Dataset Training/0: 100% |#############################| Time: 0:00:00
Building Dataset Training/1: 100% |#############################| Time: 0:00:00


In [15]:
def restructure_set(csv_file: str, folder: str, column: str):
    """Sort the images of one split into per-class folders.

    Reads ``csv_file`` from ``DB_PATH``, groups its ``ID`` values by the value
    (0 or 1) found in ``column``, and moves ``<DB_PATH>/<folder>/<ID>.png`` to
    ``<DATASET_PATH>/<folder>/<label>/<ID>.png``.

    Parameters
    ----------
    csv_file : str
        Name of the labels CSV inside ``DB_PATH``; it must contain an ``ID``
        column and the column named by ``column``.
    folder : str
        Name of the image folder inside ``DB_PATH``; also used as the name of
        the split folder created under ``DATASET_PATH``.
    column : str
        Label column used to assign each image to class ``0`` or ``1``.

    Raises
    ------
    KeyError
        If ``ID`` or ``column`` is missing from the CSV.
    FileNotFoundError
        If an image listed in the CSV does not exist in the source folder.
    """
    # read csv file from the DB path and drop rows that are entirely empty
    csv_path = os.path.join(DB_PATH, csv_file)
    df = pd.read_csv(csv_path).dropna(how='all', axis=0)

    # get ID + the requested label column (this used to be hard-coded to 'DR',
    # so the `column` argument had no effect on the selection)
    label_df = df[["ID", column]]

    # dictionary label -> [ID]
    dir_structure = {
        f'{label}': label_df.loc[label_df[column] == label, "ID"].tolist()
        for label in range(2)
    }

    set_path = os.path.join(DATASET_PATH, folder)

    # one folder per label, containing all files listed in [ID]
    for k, image_ids in dir_structure.items():
        class_path = os.path.join(set_path, k)
        os.makedirs(class_path, exist_ok=True)

        if not image_ids:
            # Nothing to move for this class; skip the progress bar entirely.
            continue

        widgets = [
            f"Building Dataset {folder}/{k}: ",
            progressbar.Percentage(),
            " ",
            progressbar.Bar(),
            " ",
            progressbar.ETA()
        ]

        # Create the bar once per class; it used to be rebuilt and restarted
        # for every single image.
        pbar = progressbar.ProgressBar(
            maxval=len(image_ids),
            widgets=widgets
        ).start()

        for moved, v in enumerate(image_ids, start=1):
            origin = os.path.join(DB_PATH, folder, f'{v}.png')
            destination = os.path.join(class_path, f'{v}.png')
            shutil.move(origin, destination)
            pbar.update(moved)

        pbar.finish()

In [16]:
# Everything in DB_PATH whose name does not mention 'Training', split into
# label files ('.csv' in the name) and the remaining entries (image folders).
not_training = [entry for entry in os.listdir(DB_PATH) if 'Training' not in entry]
csv_files = [entry for entry in not_training if '.csv' in entry]
folders = [entry for entry in not_training if entry not in csv_files]

# A folder and a csv file belong together when the folder name is contained
# in the csv file name.
matching_pairs = [
    (label_csv, image_folder)
    for label_csv in csv_files
    for image_folder in folders
    if image_folder in label_csv
]

for label_csv, image_folder in matching_pairs:
    restructure_set(label_csv, image_folder, 'DR')

Building Dataset Validation/0: 100% |###########################| Time: 0:00:00
Building Dataset Validation/1: 100% |###########################| Time: 0:00:00
Building Dataset Test/0: 100% |#################################| Time: 0:00:00
Building Dataset Test/1: 100% |#################################| Time: 0:00:00
