In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os
import shutil
import math
import pydicom
import cv2
import random
# Fixed seed so the random.sample() based dataset splits are repeatable.
random.seed(10)

# Location of the downloaded CMMD manifest folder (the metadata CSV's
# 'file_location' entries are resolved relative to this directory).
cmmd_manifest_directory = "/media/craig/Larry/python/manifest-1616439774456/"

# Folder that contains the manifest directory. It is derived from the path
# itself instead of slicing off a fixed number of characters, so it keeps
# working if the manifest folder is renamed. The trailing separator is kept
# on purpose: later cells build paths by plain string concatenation.
# e.g. "/media/craig/Larry/python/"
parent_dir = os.path.join(os.path.dirname(cmmd_manifest_directory.rstrip("/")), "")

In [2]:
# Load the per-image metadata table (one row per DICOM file) and display it.
_metadata_csv = "./CMMD_metadata_subset.csv"
df = pd.read_csv(_metadata_csv)
df

Unnamed: 0,subject_id,leftright,age,abnormality,classification,subtype,file_location
0,D1-0001,R,44,calcification,Benign,,./CMMD/D1-0001/07-18-2010-NA-NA-79377/1.000000...
1,D1-0001,R,44,calcification,Benign,,./CMMD/D1-0001/07-18-2010-NA-NA-79377/1.000000...
2,D1-0002,L,40,calcification,Benign,,./CMMD/D1-0002/07-18-2010-NA-NA-49231/1.000000...
3,D1-0002,L,40,calcification,Benign,,./CMMD/D1-0002/07-18-2010-NA-NA-49231/1.000000...
4,D1-0003,L,39,calcification,Benign,,./CMMD/D1-0003/07-18-2011-NA-NA-25491/1.000000...
...,...,...,...,...,...,...,...
3363,D2-0564,R,30,mass,Malignant,Luminal B,./CMMD/D2-0564/07-18-2011-NA-NA-19553/1.000000...
3364,D2-0565,L,75,mass,Malignant,,./CMMD/D2-0565/07-18-2011-NA-NA-72392/1.000000...
3365,D2-0565,L,75,mass,Malignant,,./CMMD/D2-0565/07-18-2011-NA-NA-72392/1.000000...
3366,D2-0565,R,75,mass,Malignant,Luminal A,./CMMD/D2-0565/07-18-2011-NA-NA-72392/1.000000...


In [3]:
# Split the metadata into one frame per classification label.
_is_benign = df['classification'] == 'Benign'
_is_malignant = df['classification'] == 'Malignant'
benign_df = df[_is_benign]        # rows labelled Benign only
malignant_df = df[_is_malignant]  # rows labelled Malignant only

# All generated folders live next to the manifest directory.
cmmd_dir = parent_dir
benign_loc = cmmd_dir+"cmmd_data/benign/"
malignant_loc = cmmd_dir+"cmmd_data/malignant/"

# Make sure both class folders exist (no error if they already do).
for _class_dir in (benign_loc, malignant_loc):
    os.makedirs(_class_dir, exist_ok=True)

# File names that mark the 3rd/4th image of a subject; these are filed
# under a separate "<subject_id>_b" folder by create_benign_malignant.
matches = ["1-3.dcm", "1-4.dcm"]

def create_benign_malignant(df, dest_folder):
    """Copy every DICOM file listed in ``df`` into per-subject folders.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata rows to copy. Each row must provide ``file_location``
        (path of the DICOM file relative to ``cmmd_manifest_directory``)
        and ``subject_id``.
    dest_folder : str
        Root folder for the copies. It must end with a path separator,
        because the per-subject path is built by string concatenation.

    Files whose name contains one of the module-level ``matches`` entries
    (the 3rd/4th image of a subject) are placed in ``<subject_id>_b/`` so
    that they are treated as a second case for the same patient; all other
    files go to ``<subject_id>/``. Source files are copied, not moved.
    """
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        # Resolve against the configured manifest directory instead of
        # re-typing its folder name here, so there is a single place to edit.
        src = cmmd_manifest_directory + row['file_location']
        basename = os.path.basename(src)  # file name + extension

        # "_b" suffix marks a second case for the same patient.
        is_second_case = any(x in basename for x in matches)
        suffix = "_b/" if is_second_case else "/"
        subject_dir = dest_folder + row['subject_id'] + suffix

        Path(subject_dir).mkdir(parents=True, exist_ok=True)
        shutil.copyfile(src, subject_dir + basename)

# Populate the benign folder first, then the malignant one.
for _banner, _frame, _target in (
    ("Building benign", benign_df, benign_loc),
    ("Building malignant", malignant_df, malignant_loc),
):
    print(_banner)
    create_benign_malignant(_frame, _target)

Building benign


100%|██████████| 1106/1106 [00:21<00:00, 52.66it/s]


Building malignant


100%|██████████| 2262/2262 [00:43<00:00, 52.20it/s]


In [4]:
#method to move 20% of a directory into another location
#splits a dataset into train/test and/or train/validate
def create_test_dataset(data_location, destination):
    """Move a random 20% (rounded up) of the entries in a folder elsewhere.

    Parameters
    ----------
    data_location : str
        Folder whose direct children (files or sub-folders) are sampled.
    destination : str
        Existing folder that the sampled entries are moved into.

    The sample is drawn with the module-level ``random`` generator, so the
    result depends on the seed set earlier in the notebook.
    """
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting first makes the seeded sample pick the same entries on every
    # machine and every run.
    entries = sorted(os.listdir(data_location))
    count = math.ceil(len(entries) / 5)
    test_set = random.sample(entries, count)

    for name in tqdm(test_set):
        # os.path.join works whether or not data_location ends with a separator.
        shutil.move(os.path.join(data_location, name), destination)


In [5]:
# Destination folders for the held-out test split, one per class.
benign_testset_location = cmmd_dir+"TEST/benign/"
malignant_testset_location = cmmd_dir+"TEST/malignant/"

# Create both destinations up front (no error if they already exist).
for _test_dir in (benign_testset_location, malignant_testset_location):
    os.makedirs(_test_dir, exist_ok=True)

# Move a fifth of each class folder into its TEST counterpart.
for _message, _class_src, _class_dst in (
    ("Creating 20% test split for benign set...", benign_loc, benign_testset_location),
    ("Creating 20% test split for malignant set...", malignant_loc, malignant_testset_location),
):
    print(_message)
    create_test_dataset(_class_src, _class_dst)

Creating 20% test split for benign set...


100%|██████████| 111/111 [00:00<00:00, 4665.57it/s]


Creating 20% test split for malignant set...


100%|██████████| 227/227 [00:00<00:00, 4323.53it/s]


In [6]:
# Destination folders for the validation split, one per class.
benign_valset_location = cmmd_dir+"VAL/benign/"
malignant_valset_location = cmmd_dir+"VAL/malignant/"

# Create both destinations up front (no error if they already exist).
for _val_dir in (benign_valset_location, malignant_valset_location):
    os.makedirs(_val_dir, exist_ok=True)

# The test entries have already been moved out of the class folders, so this
# takes a fifth of what is left, not a fifth of the original data.
for _class_src, _class_dst in (
    (benign_loc, benign_valset_location),
    (malignant_loc, malignant_valset_location),
):
    create_test_dataset(_class_src, _class_dst)

100%|██████████| 89/89 [00:00<00:00, 4007.61it/s]
100%|██████████| 181/181 [00:00<00:00, 3552.30it/s]


In [7]:
# Whatever is still in cmmd_data after the TEST and VAL moves is the training
# set: rename the folder to TRAIN (this also frees the cmmd_data name for the
# flattened output created in the next cell).
_remaining_dir = cmmd_dir+"cmmd_data"
_train_dir = cmmd_dir+"TRAIN"
shutil.move(_remaining_dir, _train_dir)

'/media/craig/Larry/python/TRAIN'

In [8]:
def move_dcm_from_subdir(source, destination):
    """Flatten ``source/<subdir>/<file>`` into ``destination/<n>.dcm``.

    Parameters
    ----------
    source : str
        Folder whose direct children are sub-folders containing the files
        to move.
    destination : str
        Folder that receives the files; created if it does not exist.

    Every file found one level below ``source`` is moved to ``destination``
    and renamed to a running number starting at 1 (``1.dcm``, ``2.dcm``, ...).
    The emptied sub-folders are left in place.
    """
    Path(destination).mkdir(parents=True, exist_ok=True)

    counter = 1
    # os.listdir() order is arbitrary; sorting both levels makes the
    # number assigned to each file reproducible between runs and machines.
    for subdir in sorted(os.listdir(source)):
        subdir_path = os.path.join(source, subdir)
        for filename in sorted(os.listdir(subdir_path)):
            # os.path.join keeps this correct even when the caller passes
            # source/destination without a trailing separator.
            shutil.move(os.path.join(subdir_path, filename),
                        os.path.join(destination, str(counter) + ".dcm"))
            counter += 1
        
# Flatten every split/class folder into cmmd_data/<SPLIT>/<class>/.
# Each pair is (source relative to cmmd_dir, destination relative to cmmd_dir).
_flatten_jobs = (
    ("TRAIN/benign/", "cmmd_data/TRAIN/benign/"),
    ("TRAIN/malignant/", "cmmd_data/TRAIN/malignant/"),
    ("VAL/benign/", "cmmd_data/VAL/benign/"),
    ("VAL/malignant/", "cmmd_data/VAL/malignant/"),
    ("TEST/benign/", "cmmd_data/TEST/benign/"),
    ("TEST/malignant/", "cmmd_data/TEST/malignant/"),
)
for _src_rel, _dst_rel in _flatten_jobs:
    move_dcm_from_subdir(source=cmmd_dir+_src_rel,
                         destination=cmmd_dir+_dst_rel)



In [9]:
def rm_dir(directiory):
    """Delete a directory tree, reporting (not raising) any OS error.

    Parameters
    ----------
    directiory : str
        Path of the directory to remove together with everything below it.

    If removal fails with an ``OSError`` (for example because the path does
    not exist), the offending file name and the error text are printed and
    the function returns normally.
    """
    try:
        shutil.rmtree(directiory)
    except OSError as err:
        details = (err.filename, err.strerror)
        print ("Error: %s - %s." % details)

# The per-subject folders are empty after flattening; remove them.
for _leftover in ("TRAIN", "TEST", "VAL"):
    rm_dir(cmmd_dir+_leftover)

In [10]:
def convert_dicom_to_png(input_dir, output_dir):
    """Convert every DICOM file under ``input_dir/<class>/`` to a PNG.

    Parameters
    ----------
    input_dir : str
        Folder containing one sub-folder per classification, each holding
        DICOM files. Must end with a path separator (paths are built by
        string concatenation).
    output_dir : str
        Folder that receives the PNGs, mirrored as
        ``output_dir/<class>/img_<name>.png``. Must end with a path
        separator. Created if missing.

    Raises
    ------
    OSError
        If OpenCV reports that a PNG could not be written. Failing loudly
        matters here because the notebook deletes the source DICOM files
        right after the conversion.
    """
    # mkdir(exist_ok=True) is already a no-op for existing folders, so no
    # separate existence check is needed.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    for classification in os.listdir(input_dir):
        print("Working on "+classification+" for \n"+output_dir)

        # Create the class output folder once per class, not once per image.
        class_out_dir = output_dir + classification
        Path(class_out_dir).mkdir(parents=True, exist_ok=True)

        for dcm_image in tqdm(os.listdir(input_dir+classification)):
            # dcmread is the supported reader; the read_file alias is
            # deprecated and no longer present in pydicom 3.x.
            ds = pydicom.dcmread(input_dir+classification+"/"+dcm_image)
            img = ds.pixel_array  # decoded image array

            png_path = class_out_dir + "/img_" + dcm_image.replace('.dcm','.png')
            # cv2.imwrite signals failure through its return value only.
            if not cv2.imwrite(png_path, img):
                raise OSError("Failed to write PNG: " + png_path)


# Convert each split to PNG.
# Each pair is (DICOM folder, PNG folder), both relative to cmmd_dir.
_png_jobs = (
    ('cmmd_data/TRAIN/', 'cmmd_data/PNG/TRAIN/'),
    ('cmmd_data/TEST/', 'cmmd_data/PNG/TEST/'),
    ('cmmd_data/VAL/', 'cmmd_data/PNG/VAL/'),
)
for _dicom_rel, _png_rel in _png_jobs:
    convert_dicom_to_png(input_dir=cmmd_dir + _dicom_rel,
                         output_dir=cmmd_dir + _png_rel)

#Tidy up directory
# Remove the intermediate DICOM folders, then the original manifest download
# itself; after this only the PNG tree remains.
for _stale_dir in (
    cmmd_dir+"cmmd_data/TRAIN/",
    cmmd_dir+"cmmd_data/TEST/",
    cmmd_dir+"cmmd_data/VAL/",
    cmmd_manifest_directory,
):
    rm_dir(_stale_dir)
print("Data converted to PNG and filesystem is tidied.")

Working on benign for 
/media/craig/Larry/python/cmmd_data/PNG/TRAIN/


100%|██████████| 706/706 [00:20<00:00, 35.20it/s]


Working on malignant for 
/media/craig/Larry/python/cmmd_data/PNG/TRAIN/


100%|██████████| 1446/1446 [00:42<00:00, 34.08it/s]


Working on benign for 
/media/craig/Larry/python/cmmd_data/PNG/TEST/


100%|██████████| 222/222 [00:06<00:00, 34.39it/s]


Working on malignant for 
/media/craig/Larry/python/cmmd_data/PNG/TEST/


100%|██████████| 454/454 [00:13<00:00, 33.03it/s]


Working on benign for 
/media/craig/Larry/python/cmmd_data/PNG/VAL/


100%|██████████| 178/178 [00:05<00:00, 32.98it/s]


Working on malignant for 
/media/craig/Larry/python/cmmd_data/PNG/VAL/


100%|██████████| 362/362 [00:11<00:00, 32.18it/s]


Data converted to PNG and filesystem is tidied.
