In [1]:
import ipdb
import imageio
import cv2
import argparse
import torch
import numpy as np
import json
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
from numpy.random import shuffle

In [25]:
# mean height/width ratio of the raw images, as previously estimated with
# estimate_aspect_ratio; used whenever the ratio is not recomputed
_DEFAULT_ASPECT_RATIO = 0.7105451408210631


def process_chunks(chunks, images_dir, labels_dir, raw_images_dir, descriptions_dir,
                   aspect_ratio=False):
    """Convert the raw images and their descriptions into tensor chunks on disk.

    Every description file whose JSON exposes
    ['meta']['clinical']['benign_malignant'] is paired with the image whose
    filename starts with the same 12-character ID. The pairs are shuffled with
    a fixed seed, split into `chunks` consecutive ranges and each range is
    handed to `load_chunk`.

    Parameters
    ----------
    chunks : int
        Number of chunks to produce (must be >= 1).
    images_dir, labels_dir : str
        Directories that receive the image / label tensors.
    raw_images_dir : str
        Directory holding the raw image files.
    descriptions_dir : str
        Directory holding one JSON description file per image ID.
    aspect_ratio : bool, optional
        When true, re-estimate the mean aspect ratio from the images instead
        of using the stored default.

    Raises
    ------
    ValueError
        If `chunks` is smaller than 1 or no usable image/description pair
        is found.
    """
    if chunks < 1:
        raise ValueError('chunks must be a positive integer, got {0}'.format(chunks))

    # we only want the images that have a description; filter out the ones
    # with missing or opaquely-formatted clinical data
    usable_IDs = []
    for ID in sorted(listdir(descriptions_dir)):
        description_path = join(descriptions_dir, ID)
        if not isfile(description_path):
            continue
        try:
            with open(description_path, "r") as file:
                data = file.read().replace('\n', '')
            json.loads(data)['meta']['clinical']['benign_malignant']
        except (ValueError, KeyError, TypeError):
            # malformed JSON / undecodable text, or the clinical field is absent
            continue
        usable_IDs.append(ID)

    # pair every usable description with its image explicitly, so that a
    # missing or duplicated image cannot shift the description/image alignment.
    # an image belongs to the description named like its first 12 characters.
    usable_ID_set = set(usable_IDs)
    image_by_ID = {}
    for f in sorted(listdir(raw_images_dir)):
        if f[:12] in usable_ID_set:
            image_by_ID.setdefault(f[:12], join(raw_images_dir, f))

    paired_IDs = [ID for ID in usable_IDs if ID in image_by_ID]
    if not paired_IDs:
        raise ValueError('No image in "{0}" has a usable description in "{1}".'
                         .format(raw_images_dir, descriptions_dir))
    description_filenames = [join(descriptions_dir, ID) for ID in paired_IDs]
    image_filenames = [image_by_ID[ID] for ID in paired_IDs]

    # permuting the dataset removes some of the class imbalance among the chunks
    np.random.seed(20)
    X = np.asarray([description_filenames, image_filenames]).T
    shuffle(X)  # shuffles rows, so each description stays with its image
    description_filenames = X[:, 0]
    image_filenames = X[:, 1]

    if aspect_ratio:
        ratio = estimate_aspect_ratio(image_filenames)
    else:
        ratio = _DEFAULT_ASPECT_RATIO

    n = len(paired_IDs)
    chunk_size = n // chunks

    for chunk in range(chunks):
        load_chunk(chunk, image_filenames, description_filenames, chunk_size,
                   aspect_ratio=ratio, num_chunks=chunks,
                   images_dir=images_dir, labels_dir=labels_dir)


def estimate_aspect_ratio(image_filenames, sample_size=1000):
    """Estimate the mean height/width ratio of the images.

    Draws `sample_size` filenames (with replacement) using a fixed seed, reads
    each one with imageio and averages shape[0] / shape[1].

    Raises
    ------
    ValueError
        If `image_filenames` is empty.
    """
    if len(image_filenames) == 0:
        raise ValueError('Cannot estimate an aspect ratio without any image.')

    # a private generator seeded like the former np.random.seed(20) call draws
    # the same sample without resetting the global random state
    rng = np.random.RandomState(20)
    sample = rng.choice(image_filenames, sample_size)

    ratios = []
    for filename in sample:
        x = imageio.imread(filename)
        ratios.append(x.shape[0] / x.shape[1])
    return np.mean(np.asarray(ratios))


def load_chunk(chunk, image_filenames, description_filenames, chunk_size,
               aspect_ratio=_DEFAULT_ASPECT_RATIO, num_chunks=10,
               images_dir='data', labels_dir='data'):
    """Build and save the image and label tensors of one chunk.

    Images with index in [chunk*chunk_size, (chunk+1)*chunk_size) are read with
    cv2, resized to 216 x int(216/aspect_ratio) and stacked; the last chunk
    (chunk == num_chunks - 1) additionally takes every remaining image. Labels
    are 1 where the description says 'malignant' and 0 for anything else.

    Parameters
    ----------
    chunk : int
        Zero-based index of the chunk to build.
    image_filenames, description_filenames : sequence of str
        Aligned image / description paths for the whole dataset.
    chunk_size : int
        Number of images in every chunk but the last.
    aspect_ratio : float, optional
        Height/width ratio the images are resized to.
    num_chunks : int, optional
        Total number of chunks; identifies which chunk is the last one.
    images_dir, labels_dir : str, optional
        Directories receiving 'images-<chunk>.pt' and 'labels-<chunk>.pt';
        created if missing.

    Raises
    ------
    IOError
        If cv2 cannot read one of the images.
    """
    from os import makedirs

    n = len(image_filenames)
    start = chunk * chunk_size
    # the last chunk absorbs the remainder left over by the integer division
    stop = n if chunk == num_chunks - 1 else (chunk + 1) * chunk_size
    image_numbers = range(start, stop)

    height = 216
    width = int(216 / aspect_ratio)

    X = torch.empty(size=(len(image_numbers), height, width, 3))
    Y = []
    for sample_idx, idx in enumerate(tqdm(image_numbers)):
        # resize the images to the computed mean aspect ratio using cv2
        img = cv2.imread(str(image_filenames[idx]))
        if img is None:
            # cv2.imread signals failure by returning None instead of raising
            raise IOError('Could not read image "{0}"'.format(image_filenames[idx]))
        # cv2 expects dsize as (width, height)
        res = cv2.resize(img, dsize=(width, height),
                         interpolation=cv2.INTER_CUBIC)
        X[sample_idx] = torch.tensor(res)
        with open(str(description_filenames[idx]), "r") as file:
            data = file.read().replace('\n', '')
        Y.append(json.loads(data)['meta']['clinical']['benign_malignant'])

    Y = torch.tensor([1 if diagnosis == 'malignant' else 0 for diagnosis in Y])
    # (sample, height, width, channel) -> (sample, channel, height, width)
    X = X.permute(0, 3, 1, 2)

    makedirs(images_dir, exist_ok=True)
    makedirs(labels_dir, exist_ok=True)
    torch.save(X, join(images_dir, 'images-' + str(chunk) + '.pt'))
    torch.save(Y, join(labels_dir, 'labels-' + str(chunk) + '.pt'))
    print("Finished chunk " + str(chunk))

In [14]:
def confirm_arguments(args):
    """Summarize the parsed options and ask the user to confirm them.

    Parameters
    ----------
    args : object
        Parsed arguments exposing `chunks`, `images_dir` and `labels_dir`;
        a value of None is reported as the corresponding default.

    Returns
    -------
    bool
        True if the user answers 'y' or just presses enter, False on 'n'.
        Answers are case-insensitive and surrounding whitespace is ignored;
        anything else re-prompts.
    """
    print('You have decided to do the following:')
    if args.chunks is None:
        print('Process data in 10 chunks')
    else:
        print('Process data in {0} chunks'.format(args.chunks))

    if args.images_dir is None:
        print('Images tensor chunks will be downloaded to "data" directory.')
    else:
        print('Images tensor chunks will be downloaded to "{0}" directory.'.format(args.images_dir))

    if args.labels_dir is None:
        print('Images labels will be downloaded to "data" directory.')
    else:
        print('Images labels will be downloaded to "{0}" directory.'.format(args.labels_dir))

    # the prompt advertises "Y" as the default, so upper-case answers must be accepted
    res = input('Do you confirm your choices? [Y/n] ').strip().lower()
    while res not in ['y', '', 'n']:
        res = input('Invalid input. Do you confirm your choices? [Y/n] ').strip().lower()

    return res != 'n'

In [22]:
def parse_args(args):
    """Parse the command-line style options of the preprocessing script.

    Parameters
    ----------
    args : list of str
        Argument strings, e.g. ['--chunks', '5', '--aspect_ratio'].

    Returns
    -------
    argparse.Namespace
        With attributes `chunks` (int), `images_dir`, `labels_dir`,
        `raw_images_dir`, `descriptions_dir` (str) and `aspect_ratio` (bool).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--chunks', type=int,
                        help='The number of chunks into which the raw image dataset will be broken up. '
                        'The last chunk will be used for testing. If this argument is passed, '
                        'the entire dataset will be re-partitioned.', default=10)
    # the directory options are paths, so they must be parsed as strings
    parser.add_argument('--images_dir', type=str,
                        help='The directory into which the image tensors chunks will be downloaded.',
                        default='data')
    parser.add_argument('--labels_dir', type=str,
                        help='The directory into which the labels for the corresponding image tensor chunk'
                        ' will be downloaded.', default='data')
    parser.add_argument('--raw_images_dir', type=str,
                        help='Where the raw images are located.',
                        default=join('raw_data', 'Images'))
    parser.add_argument('--descriptions_dir', type=str,
                        help='Where the verbose image descriptions are located.',
                        default=join('raw_data', 'Descriptions'))
    parser.add_argument('--aspect_ratio', help='Whether to recompute the mean aspect ratio.', action="store_true")
    parsed_args = parser.parse_args(args)
    return parsed_args

In [26]:
def main(args):
    """Parse `args`, ask for confirmation and, if given, process the dataset.

    `args` is the list of argument strings handed to `parse_args`. When the
    user declines, a notice is printed and nothing is processed.
    """
    options = parse_args(args)

    # bail out early when the user does not confirm the summary
    if not confirm_arguments(options):
        print('Exiting without downloading anything')
        return

    process_chunks(
        chunks=options.chunks,
        images_dir=options.images_dir,
        labels_dir=options.labels_dir,
        raw_images_dir=options.raw_images_dir,
        descriptions_dir=options.descriptions_dir,
        aspect_ratio=options.aspect_ratio,
    )