# Dependencies

```pip install shutil```

```pip install mritopng```


# Get Ratings from CSV

In [2]:
import csv

# returns a list of pairs of DICOM file names and their respective label
# returns 3 for any unlablled image
def get_ratings_list(log_path, name_index = 1, label_index=2):
    
    ratings_list = []
    
    with open(log_path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        row_count = 0

        for row in csv_reader:

            # ignore column name row
            if (row_count != 0):
                
                if (int(row[label_index]) > 2):
                    ratings_list.append([row[name_index], 3])
                else:
                    
                    ratings_list.append([row[name_index], int(row[label_index])])

            row_count = row_count + 1
        
    return ratings_list


# Generate Folder Structure from CSV (3-class)

In [5]:
import shutil
import mritopng
import os

# generates a data folder where the subfolder are labels and the subfolder of those labels are converted PNG images

#####------ ADJUST SETTINGS BELOW ------#####

log_path = "./data/logs/"
data_pool_path = "./data/data_pool_dicom/"
output_dir = "./data/generated_sets/3class/"

rater_log_paths = ["ratings_ali.csv", 
                   "ratings_peter.csv", 
                   "ratings_ali.csv", 
                   "ratings_peter.csv"]

output_paths = ["ali_labels_original/", 
                "peter_labels_original/", 
                "ali_labels_undersampled/", 
                "peter_labels_undersampled/" ]

my_name_index = 1 # column index of the DICOM filename in the CSV log
my_label_index = 2 # column index of the label to choose (2 for rating 1, 3 for rating 2)

#####------ ADJUST SETTINGS ABOVE ------#####


for rater_log, output_path in zip(rater_log_paths, output_paths):
    
    print("using rater log: " + rater_log)
    print("outputting to: " + output_dir + output_path)
    
    # start with new folder
    if os.path.exists(output_dir + output_path):
        shutil.rmtree(output_dir + output_path)
    os.mkdir(output_dir + output_path)

    # create subfolders
    os.mkdir(output_dir + output_path + "0/")
    os.mkdir(output_dir + output_path + "1/")
    os.mkdir(output_dir + output_path + "2/")

    # get ratings list
    ratings = get_ratings_list(log_path + rater_log, name_index=my_name_index, label_index=my_label_index)
    
    # progress display
    total_len = len(ratings)
    count = 0
    
    for elem in ratings:

        if (count % 100 == 0):
            print("progress: " + str(count) + "/" + str(total_len))

        # ignore DICOMs with no label (labeled as 100+)
        if (elem[1] <= 2):
            
            # try to convert file
            try:
                mritopng.convert_file(data_pool_path + elem[0],  
                                      output_dir + output_path + str(elem[1]) + "/" + elem[0] + ".png")

            except Exception as err:
                
                print(err)
                print(data_pool_path + elem[0])
            
            
        count = count + 1

# above seems to take care of contrast as well
        
    

using rater log: ratings_ali.csv
outputting to: ./data/generated_sets/3class/ali_labels_original/
progress: 0/2111
[Errno 2] No such file or directory: './data/generated_sets/3class/ali_labels_original/3/IM-0022-0025-0001.dcm.png'
./data/data_pool_dicom/IM-0022-0025-0001.dcm
progress: 100/2111
progress: 200/2111
progress: 300/2111
progress: 400/2111
progress: 500/2111
progress: 600/2111
[Errno 2] No such file or directory: './data/generated_sets/3class/ali_labels_original/3/IM-0070-0005-0001.dcm.png'
./data/data_pool_dicom/IM-0070-0005-0001.dcm
progress: 700/2111
progress: 800/2111
progress: 900/2111
progress: 1000/2111
progress: 1100/2111
progress: 1200/2111
progress: 1300/2111
progress: 1400/2111
progress: 1500/2111
progress: 1600/2111
[Errno 2] No such file or directory: './data/generated_sets/3class/ali_labels_original/3/IM-0038-0009-0001.dcm.png'
./data/data_pool_dicom/IM-0038-0009-0001.dcm
progress: 1700/2111
progress: 1800/2111
progress: 1900/2111
progress: 2000/2111
[Errno 2] N

# Generate Folder Structure from CSV (2-class)

In [8]:
import shutil
import mritopng
import os

# same idea as above block, just with only two classes

#####------ ADJUST SETTINGS BELOW ------#####

log_path = "./data/logs/"
data_pool_path = "./data/data_pool_dicom/"
output_dir = "./data/generated_sets/2class/"

rater_log_paths = ["ratings_ali.csv", "ratings_peter.csv"]
output_paths = ["ali_labels_undersampled/", "peter_labels_undersampled/"]

#####------ ADJUST SETTINGS ABOVE ------#####

for rater_log, output_path in zip(rater_log_paths, output_paths):
    
    print("using rater log: " + rater_log)
    print("outputting to: " + output_dir + output_path)
    
    if os.path.exists(output_dir + output_path):
        shutil.rmtree(output_dir + output_path)
        
    os.mkdir(output_dir + output_path)

    os.mkdir(output_dir + output_path + "0/")
    os.mkdir(output_dir + output_path + "1/")

    ratings = get_ratings_list(log_path + rater_log)
    
    total_len = len(ratings)
    count = 0
    
    for elem in ratings:

        
        if (count % 100 == 0):
            print("progress: " + str(count) + "/" + str(total_len))

        if (elem[1] <= 2):
            
            try:
                
                if elem[1] == 1 or elem[1] == 2:
                
                    mritopng.convert_file(data_pool_path + elem[0],  
                                          output_dir + output_path + str(1) + "/" + elem[0] + ".png")
                else:
                    
                    mritopng.convert_file(data_pool_path + elem[0],  
                                          output_dir + output_path + str(0) + "/" + elem[0] + ".png")

            except Exception as err:
                
                print(err)
                print(data_pool_path + elem[0])
            
            
        count = count + 1
  

using rater log: ratings_ali.csv
outputting to: ./data/generated_sets/2class/ali_labels_undersampled/
progress: 0/2111
progress: 100/2111
progress: 200/2111
progress: 300/2111
progress: 400/2111
progress: 500/2111
progress: 600/2111
progress: 700/2111
progress: 800/2111
progress: 900/2111
progress: 1000/2111
progress: 1100/2111
progress: 1200/2111
progress: 1300/2111
progress: 1400/2111
progress: 1500/2111
progress: 1600/2111
progress: 1700/2111
progress: 1800/2111
progress: 1900/2111
progress: 2000/2111
progress: 2100/2111
using rater log: ratings_peter.csv
outputting to: ./data/generated_sets/2class/peter_labels_undersampled/
progress: 0/2111
progress: 100/2111
Removing existing output file ./data/generated_sets/2class/peter_labels_undersampled/0/IM-0012-0004-0001.dcm.png
Removing existing output file ./data/generated_sets/2class/peter_labels_undersampled/0/IM-0012-0004-0001.dcm.png
progress: 200/2111
Removing existing output file ./data/generated_sets/2class/peter_labels_undersample

# Split into Train, Eval, Test

In [7]:
import util.load_data_utility as load_data_utility
import shutil
import os
import pathlib

#####------ ADJUST SETTINGS BELOW ------#####

# input data roots must be output directories from the folders above (must have the 0/, 1/, etc structure already)

data_roots = ["./data/generated_sets/3class_rating2/ali_labels/",
              "./data/generated_sets/3class_rating2/ali_labels_undersampled//",
              "./data/generated_sets/3class_rating2/peter_labels/",
              "./data/generated_sets/3class_rating2/peter_labels_undersampled//"]

output_folders = ["./data/generated_splits/3class_rating2/ali_relabeled_original/",
                  "./data/generated_splits/3class_rating2/ali_relabeled_undersampled/",
                  "./data/generated_splits/3class_rating2/peter_relabeled_original/",
                  "./data/generated_splits/3class_rating2/peter_relabeled_undersampled/"]

split_ratios = [0.7, 0.1, 0.2]

#####------ ADJUST SETTINGS ABOVE ------#####

for data_root, output_folder in zip(data_roots, output_folders):

    image_paths = load_data_utility.load_image_paths(data_root)

    print(image_paths[:5])

    train_paths, eval_paths, test_paths = load_data_utility.split(image_paths, split=split_ratios, seed=777)

    # get parent directory for label
    def get_label(path):

        path_location = pathlib.Path(path)

        str1 = str(path_location.parents[0])
        str2 = str(path_location.parents[1])

        retval = str1.replace(str2, '')
        retval = retval.replace("/", '')

        return retval


    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)

    os.mkdir(output_folder)
    os.mkdir(output_folder + "train/")
    os.mkdir(output_folder + "eval/")
    os.mkdir(output_folder + "test/")
    os.mkdir(output_folder + "train/0/")
    os.mkdir(output_folder + "train/1/")
    os.mkdir(output_folder + "train/2/")
    os.mkdir(output_folder + "eval/0/")
    os.mkdir(output_folder + "eval/1/")
    os.mkdir(output_folder + "eval/2/")
    os.mkdir(output_folder + "test/0/")
    os.mkdir(output_folder + "test/1/")
    os.mkdir(output_folder + "test/2/")


    for pngfile in train_paths:
        shutil.copy(pngfile, output_folder + "train/" + get_label(pngfile) + "/" )

    for pngfile in eval_paths:
        shutil.copy(pngfile, output_folder + "eval/" + get_label(pngfile) + "/" )

    for pngfile in test_paths:
        shutil.copy(pngfile, output_folder + "test/" + get_label(pngfile) + "/" )


['data/generated_sets/3class_rating2/ali_labels/1/IM-0071-0019-0001.dcm.png', 'data/generated_sets/3class_rating2/ali_labels/1/IM-0017-0009-0001.dcm.png', 'data/generated_sets/3class_rating2/ali_labels/1/IM-0049-0016-0001.dcm.png', 'data/generated_sets/3class_rating2/ali_labels/1/IM-0030-0010-0001.dcm.png', 'data/generated_sets/3class_rating2/ali_labels/1/IM-0005-0019-0001.dcm.png']
data/generated_sets/3class_rating2/ali_labels/2/IM-0016-0020-0001.dcm.png
2
['data/generated_sets/3class_rating2/ali_labels_undersampled/1/IM-0071-0019-0001.dcm.png', 'data/generated_sets/3class_rating2/ali_labels_undersampled/1/IM-0017-0009-0001.dcm.png', 'data/generated_sets/3class_rating2/ali_labels_undersampled/1/IM-0049-0016-0001.dcm.png', 'data/generated_sets/3class_rating2/ali_labels_undersampled/1/IM-0030-0010-0001.dcm.png', 'data/generated_sets/3class_rating2/ali_labels_undersampled/1/IM-0005-0019-0001.dcm.png']
data/generated_sets/3class_rating2/ali_labels_undersampled/1/IM-0035-0011-0001.dcm.png


# Dataset Balancing

## Balancing Utility Functions

In [7]:
import pathlib
import random

# load an array of image paths, indexed by their label
def load_sorted_paths(path):
    
    data_root = pathlib.Path(path)
    
    paths = []
    
    for child in sorted(data_root.iterdir(), key=(lambda x: x.name):
        if (child.is_dir() and child.name != ".DS_Store"):
            print(child)
            
            subdir_paths = list(child.glob('**/*'))
            subdir_paths = [str(path) for path in subdir_paths if path.name != ".DS_Store"]
        
            paths.append(subdir_paths)
            
    return paths

# load just an array of image paths
def load_dir_paths(path):
    
    data_root = pathlib.Path(path)
    
    paths = []
    
    for child in data_root.iterdir():
        if (child.name != ".DS_Store"):
         
            paths.append(str(child))
    
    return paths

## Inserting Noisy Images

In [None]:
import synthetic_motion_utility as synth
import imageio

input_data_dir = "./data/generated_splits/train_set_balanced/train/1/"
output_data_dir = "./data/generated_splits/train_set_balanced/train/0/"

all_image_paths = load_dir_paths(input_data_dir)

num_synthetic = 400
counter = 0

random.seed(a=777)
random.shuffle(all_image_paths)

while counter < num_synthetic:
    
    im = imageio.imread(all_image_paths[counter])
    
    im = synth.add_motion_artifact(im, seed=counter)
    
    imageio.imwrite(output_data_dir + "synthetic_" + str(counter) + ".png", im)
    
    counter = counter + 1

## Oversampling 2-Label Images

In [None]:
import os, glob, shutil

# directory of 2 label images to be oversampled
data_root_2 = "./data/generated_splits/train_set_balanced/train/2/"

# load image paths in in this directory
image_paths_2 = load_dir_paths(data_root_2)


# remove any pervious duplicates
for filename in glob.glob(data_root_2 + "duplicate*"):
    os.remove(filename) 

# oversample    
counter = 0
for pngfile in image_paths_2:
    shutil.copy(pngfile, data_root_2 + "duplicate_" +  str(counter) + ".png" )
    counter = counter + 1


## Undersampling Images

In [9]:
import os
import random

random.seed(a=777)

# directory of set to be undersampled
data_root = "./data/generated_sets/3class/ali_labels_undersampled/"
data_root = data_root + "1/"

# get paths in directory
paths_in_dir = load_dir_paths(data_root)

#shuffle randomly
random.shuffle(paths_in_dir)

print(paths_in_dir[:10])

random.seed(a=777)

# randmoly remove half using random.choice
for path in paths_in_dir:
    
    if (random.choice([True, False])):
        print("removing: " + path)
        os.remove(path)

['data/generated_sets/3class/ali_labels_undersampled/1/IM-0049-0004-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0019-0030-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0064-0016-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0025-0019-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0060-0024-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0018-0027-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0038-0008-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0045-0003-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0013-0022-0001.dcm.png', 'data/generated_sets/3class/ali_labels_undersampled/1/IM-0066-0005-0001.dcm.png']
removing: data/generated_sets/3class/ali_labels_undersampled/1/IM-0049-0004-0001.dcm.png
removing: data/generated_sets/3class/ali_labels_undersampled/1/IM-0038-0008-0001.dcm.png
r