# EDA RFMiD

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

import os
import progressbar

## Getting Data

In [57]:
# Folder (relative to the working directory) that is scanned for the dataset's .zip archives.
RFMID_PATH = "./Data/Raw/MultiDiseaseClassification/RfmidMultidisease"

In [58]:
DB_PATH = './DB'
if os.path.exists(DB_PATH):
    !rm -r {DB_PATH}
    os.mkdir(DB_PATH)
else:
    os.mkdir(DB_PATH)

In [59]:
def unzip_set(zipFile: str, outputPath: str = DB_PATH):
    print(zipFile)
    zipFile = os.path.sep.join([RFMID_PATH, zipFile])
    !7z x {zipFile} -o{outputPath}

In [60]:
# Extract every archive of the raw dataset. A plain loop is used because the
# calls are made only for their side effect; a list comprehension would build
# and echo a useless list of None values. `endswith` avoids matching names that
# merely contain ".zip" (e.g. partially downloaded "x.zip.part" files).
for z in os.listdir(RFMID_PATH):
    if z.endswith('.zip'):
        unzip_set(z)

Evaluation_Set.zip

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,8 CPUs 11th Gen Intel(R) Core(TM) i5-11300H @ 3.10GHz (806C1),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan ./Data/Raw/MultiDiseaseClassification/RfmidMultidiseas                                                                1 file, 1564446026 bytes (1492 MiB)

Extracting archive: ./Data/Raw/MultiDiseaseClassification/RfmidMultidisease/Evaluation_Set.zip
--
Path = ./Data/Raw/MultiDiseaseClassification/RfmidMultidisease/Evaluation_Set.zip
Type = zip
Physical Size = 1564446026

     12% 119 - Evaluation_Set/Validation/203.p                                           25% 238 - Evaluation_Set/Validation/310.p                                           40% 374 - Evaluation_Set/Validation/433.p                                           54% 460 - Evaluation_Set/Validation/510.p                                           70% 500 -

[None, None, None]

In [61]:
for f in os.listdir(DB_PATH):
    !mv {os.path.sep.join([DB_PATH, f])}/* {DB_PATH}
    !rm -r {os.path.sep.join([DB_PATH, f])}

## Restructuring the dataset

In [1]:
def restructure_set(csv_file: str, folder: str, column: str):
    """Sort the images of one dataset split into per-class sub-folders.

    Reads the label sheet ``csv_file`` from ``DB_PATH`` and, for each of the
    two label values 0 and 1 found in ``column``, moves
    ``DB_PATH/<folder>/<ID>.png`` to ``DB_PATH/<folder>/<label>/<ID>.png``.

    Args:
        csv_file: Name of the label CSV inside ``DB_PATH``; it must provide an
            ``ID`` column and the ``column`` column.
        folder: Name of the image folder inside ``DB_PATH`` to reorganise.
        column: Name of the label column that decides the class of each image.

    Raises:
        KeyError: If ``column`` or ``ID`` is not present in the CSV.
    """
    # read csv file from the DB path, ignoring rows that are completely empty
    csv_path = os.path.sep.join([DB_PATH, csv_file])
    labels_df = pd.read_csv(csv_path).dropna(how='all', axis=0)

    # dictionary label -> [ID] for the two binary label values
    dir_structure = {
        f'{label}': labels_df.loc[labels_df[column] == label, "ID"].tolist()
        for label in range(2)
    }

    # creating a folder named after each label containing all files in [ID]
    for k, ids in dir_structure.items():
        class_path = os.path.sep.join([DB_PATH, folder, k])

        if not os.path.exists(class_path):
            os.mkdir(class_path)

        # Nothing to move (and nothing to report) for a label without images.
        if not ids:
            continue

        widgets = [
            f"Building Dataset {folder}/{k}: ",
            progressbar.Percentage(),
            " ",
            progressbar.Bar(),
            " ",
            progressbar.ETA()
        ]

        # One progress bar per class, created before the loop so that it
        # tracks the whole class instead of being restarted for every file.
        pbar = progressbar.ProgressBar(
            maxval=len(ids),
            widgets=widgets
        ).start()

        missing = []
        for i, v in enumerate(ids, start=1):
            destination = os.path.sep.join([class_path, f'{v}.png'])
            origin = os.path.sep.join([DB_PATH, folder, f'{v}.png'])

            # Rename in place instead of spawning one `mv` shell per image.
            # A missing image is skipped (as `mv` would do) and reported below.
            if os.path.exists(origin):
                os.replace(origin, destination)
            else:
                missing.append(origin)

            pbar.update(i)

        pbar.finish()

        if missing:
            print(f"[WARNING] {len(missing)} image(s) listed in {csv_file} were not found in {folder}")

In [63]:
# Split the entries of DB_PATH into label sheets (anything with ".csv" in its
# name) and everything else, which is treated as an image folder.
db_entries = os.listdir(DB_PATH)
csv_files = [entry for entry in db_entries if '.csv' in entry]
folders = [entry for entry in db_entries if entry not in csv_files]

# A label sheet belongs to the folder whose name appears in the sheet's file
# name; each matching pair is restructured on the 'DR' label column.
for csv in csv_files:
    matching_folders = [candidate for candidate in folders if candidate in csv]
    for f in matching_folders:
        restructure_set(csv, f, 'DR')

Building Dataset Training/0: 100% |#############################| Time: 0:00:00
Building Dataset Training/1: 100% |#############################| Time: 0:00:00
Building Dataset Validation/0: 100% |###########################| Time: 0:00:00
Building Dataset Validation/1: 100% |###########################| Time: 0:00:00
Building Dataset Test/0: 100% |#################################| Time: 0:00:00
Building Dataset Test/1: 100% |#################################| Time: 0:00:00


## Serializing the dataset

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tesis_lib.preprocessing.aspectawareprocessor import AspectAwareProcessor
from tesis_lib.io.hdf5datasetwriter import HDF5DatasetWriter

from imutils import paths
import numpy as np
import progressbar
import json
import cv2
import os

In [None]:
# Root folder of the restructured dataset; the "Training" and "Test" class
# folders are read from here and the hdf5 output folder is created inside it.
DATASET_PATH = './DB'

# Width and height passed to the image preprocessor and used for the HDF5 image dimensions.
IM_SIZE = 256
# NOTE(review): not referenced by any visible cell — presumably the number of label classes (0 and 1); confirm.
NUM_CLASSES = 2

In [None]:
DATASET_HDF5_PATH = os.path.sep.join([DATASET_PATH, 'hdf5'])
if os.path.exists(DATASET_HDF5_PATH):
    !rm -r {DATASET_HDF5_PATH}

os.mkdir(DATASET_HDF5_PATH)

In [None]:
# Preprocessor applied to every image before it is written out.
# NOTE(review): presumably resizes to IM_SIZE x IM_SIZE while keeping the
# aspect ratio — confirm against tesis_lib.
aap = AspectAwareProcessor(IM_SIZE, IM_SIZE)
# iap = ImageToArrayPreprocessor()
# Per-image channel means, filled while the training set is serialized.
R, G, B = [], [], []

# One sub-folder per class inside the Training folder.
path = os.path.sep.join([DATASET_PATH, "Training"])
class_paths = [os.path.sep.join([path, im_class]) for im_class in os.listdir(path)]

# Collect the image files of every class folder.
imagePaths = []
for cp in class_paths:
    imagePaths.extend(paths.list_images(cp))

# The label of an image is the name of the folder that contains it.
labels = [pt.split(os.path.sep)[-2] for pt in imagePaths]

le = LabelEncoder()
labels = le.fit_transform(labels)

# Stratified, seeded draw of 450 training and 100 validation images.
split = train_test_split(
    imagePaths,
    labels,
    train_size=450,
    test_size=100,
    stratify=labels,
    random_state=42,
)
(trainPaths, valPaths, trainLabels, valLabels) = split

# Every path must have exactly one label.
assert len(trainPaths) == trainLabels.shape[0]
assert len(valPaths) == valLabels.shape[0]

In [None]:
# Report the size of both splits; the second line previously carried the
# "Training" label although it prints the validation count.
print(f"Training data points = {trainLabels.shape[0]}")
print(f"Validation data points = {valLabels.shape[0]}")

Training data points = 450
Training data points = 100


In [None]:
# One sub-folder per class inside the Test folder.
path = os.path.sep.join([DATASET_PATH, "Test"])
class_paths = [os.path.sep.join([path, im_class]) for im_class in os.listdir(path)]

# Collect the image files of every class folder.
imagePaths = []
for cp in class_paths:
    imagePaths.extend(paths.list_images(cp))

# The label of an image is the name of the folder that contains it.
labels = [pt.split(os.path.sep)[-2] for pt in imagePaths]

le = LabelEncoder()
labels = le.fit_transform(labels)

# Stratified, seeded draw; only the 124-image part of the split is kept as the
# test set, the 450-image part is discarded.
split = train_test_split(
    imagePaths,
    labels,
    train_size=450,
    test_size=124,
    stratify=labels,
    random_state=42,
)
(_, testPaths, _, testLabels) = split

# Every path must have exactly one label.
assert len(testPaths) == testLabels.shape[0]

In [None]:
# Report the size of the test split (the label previously said "Training").
print(f"Test data points = {testLabels.shape[0]}")

Training data points = 124


In [None]:
# Start from empty accumulators so that re-running this cell does not append
# the training-set channel means a second time.
R, G, B = [], [], []

# (set name, image paths, encoded labels, output file) for each split.
DATA_PATHS = [
    ('train', trainPaths, trainLabels, './DB/hdf5/Training.hdf5'),
    ('val', valPaths, valLabels, './DB/hdf5/Validation.hdf5'),
    ('test', testPaths, testLabels, './DB/hdf5/Testing.hdf5'),
]

for (dType, imagePaths, labels, output) in DATA_PATHS:
    # Remove the output of a previous run before creating the writer.
    if os.path.exists(output):
        os.remove(output)
    # NOTE(review): the writer is sized for len(imagePaths) images, so images
    # skipped below presumably leave unused rows at the end of the dataset —
    # confirm against HDF5DatasetWriter.
    writer = HDF5DatasetWriter((len(imagePaths), IM_SIZE, IM_SIZE, 3), output)

    # Close the writer even when the loop is interrupted by an error, so the
    # output file is not left open.
    try:
        widgets = [
            f"Building {dType} Set: ",
            progressbar.Percentage(),
            " ",
            progressbar.Bar(),
            " ",
            progressbar.ETA()
        ]

        pbar = progressbar.ProgressBar(
            maxval=len(imagePaths),
            widgets=widgets
        ).start()

        for (i, (path, label)) in enumerate(zip(imagePaths, labels)):
            # cv2.imread returns None for unreadable files; preprocessing then
            # fails and the image is skipped with a warning (best effort).
            image = cv2.imread(path)
            try:
                image = aap.preprocess(image)
            except Exception:
                display(f"[WARNING] Skipped {os.path.basename(path)}")
                continue

            # Channel means are collected for the training set only.
            if dType == "train":
                (b, g, r) = cv2.mean(image)[:3]
                R.append(r)
                G.append(g)
                B.append(b)

            writer.add([image], [label])
            pbar.update(i + 1)

        pbar.finish()
    finally:
        writer.close()

Building train Set: 100% |######################################| Time: 0:00:29
Building val Set: 100% |########################################| Time: 0:00:08
Building test Set: 100% |#######################################| Time: 0:00:11


In [None]:
print("[INFO] serializing means...")
# Average the per-image channel means collected for the training set and
# store them as a small JSON file next to the serialized sets.
D = {
    channel: np.mean(values)
    for channel, values in (("R", R), ("G", G), ("B", B))
}
with open('./DB/hdf5/diat_ret.json', "w") as f:
    json.dump(D, f)

[INFO] serializing means...
