In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from six.moves import urllib
from sklearn.model_selection import train_test_split
from io import BytesIO

import tempfile
import os
import urllib
import urllib.request
import requests

In [2]:
DATA_DIR = os.path.join(tempfile.gettempdir(), 'haemorrhage_data')

DATA_URL = "https://storage.googleapis.com/lovelace-data/subset"
CSV_FILE = "train_labels.csv"
IMAGE_LOCATION = "images/"
NPZ_FILE = "full_data.npz"

NPZ_URL = 'https://storage.googleapis.com/lovelace-data/subset/full_data.npz'

CSV_URL = '%s/%s' % (DATA_URL, CSV_FILE)
IMAGES_URL = '%s/%s' % (DATA_URL, IMAGE_LOCATION)

In [3]:
def _download_file(filename, url):
    temp_file, _ = urllib.request.urlretrieve(url)
    temp = open(temp_file)
    f = open(filename, "w")
    f.write(temp.read())
    f.close()
    temp.close()

In [4]:
def download_csv(data_dir):
    tf.io.gfile.makedirs(data_dir)

    csv_file_path = os.path.join(data_dir, CSV_FILE)
    if not tf.io.gfile.exists(csv_file_path):
        _download_file(csv_file_path, CSV_URL)

    return csv_file_path

In [5]:
def download_img(data_dir, image_name):
    tf.io.gfile.makedirs(data_dir)
    
    image_file_path = os.path.join(data_dir, IMAGE_LOCATION)
    
    FULL_URL = "%s%s.png" %(IMAGES_URL, image_name)
    save_loc = '%s/%s.png' % (image_file_path, image_name)
    if not tf.io.gfile.exists(save_loc):
        urllib.request.urlretrieve(FULL_URL, save_loc)
    return image_file_path
    

In [6]:
def append_png(image):
    return image + '.png'

def create_encoder_mapping(data):
    labels = set()
    for i in range(len(data)):
        labels.update(data['Tags'][i].split(' '))

    labels = list(labels)
    labels.sort()

    labels_dict = {labels[i]: i for i in range(len(labels))}
    inv_map = {v: k for k, v in labels_dict.items()}
    return labels_dict, inv_map

def encode(tags, mapping):
    encoding = np.zeros(len(mapping), dtype='uint8')
    tags_list = tags.split(' ')
    for tag in tags_list:
        encoding[mapping[tag]] = 1
    return encoding.tolist()

def encode_data(data):
    data.fillna('none', inplace=True)
    labels_dict, inv_map = create_encoder_mapping(data)
    data['EncodedTag'] = data.apply(lambda row: encode(row['Tags'], labels_dict), axis=1)
    data['ImageNo'] = data['ImageNo'].apply(append_png)
    return labels_dict, inv_map

In [7]:
def get_image_arr(base_path):
    images = list()
    for image in os.listdir(base_path):
        pic = tf.keras.preprocessing.image.load_img(base_path + image, color_mode='grayscale', target_size=(224, 244))
        pic = tf.keras.preprocessing.image.img_to_array(pic, dtype='uint8')
        images.append(pic)
    return np.asarray(images, dtype='uint8')

In [7]:
def _load_data():
    csv_file_path = download_csv(DATA_DIR)
    df = pd.read_csv(csv_file_path)
    path = df['ImageNo'].apply(lambda x: download_img(DATA_DIR, x))
    
    mapping, inv_mapping = encode_data(df)
    image_arr = get_image_arr(path[0])
    labels = df['EncodedTag'].values
    labels = np.stack(labels, axis = 0)

    np.savez_compressed('full_data.npz', image_arr, labels)
    return image_arr, labels

def download_npz(data_dir):
    path = './full_data.npz'
    
    urllib.request.urlretrieve('https://storage.googleapis.com/lovelace-data/subset/full_data.npz', path)

    data = np.load(path)
    image, label = data['arr_0'], data['arr_1']
    return image, label

# def load_data():
#     request = requests.get(NPZ_URL)
#     if request.status_code == 201:
#         image, label = download_npz(DATA_DIR)
#     else:
#         image, label = _load_data()
    
#     print("Data shapes: {0}, {1}".format(image.shape, label.shape))

#     x_train, y_train, x_test, y_test = train_test_split(image, label, random_state=42, test_size=0.3)
#     return x_train, y_train, x_test, y_test

def load_data():
    csv_file_path = download_csv(DATA_DIR)
    df = pd.read_csv(csv_file_path)
    path = df['ImageNo'].apply(lambda x: download_img(DATA_DIR, x))
    
    mapping, inv_mapping = encode_data(df)
    print(mapping)
    image_arr = get_image_arr(path[0])
    labels = df['EncodedTag'].values
    labels = np.stack(labels, axis = 0)
    print(image_arr.shape, labels.shape)

    np.savez_compressed('brains.npz', image_arr, labels)
    data = np.load('brains.npz')
    image, label = data['arr_0'], data['arr_1']
    print("Loaded: {0}, {1}".format(image.shape, label.shape))
    
    x_train, y_train, x_test, y_test = train_test_split(image, label, random_state=42, test_size=0.3)
    return x_train, y_train, x_test, y_test
    

In [8]:
load_data()

FileNotFoundError: [Errno 2] No such file or directory: '/var/folders/pb/tgdt80v172g761wzt9j1p_bh0000gn/T/haemorrhage_data/images//ID_000039fa0.png'

In [14]:
print(labels_dict)

NameError: name 'labels_dict' is not defined

In [18]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import tensorflow as tf
import multiprocessing
from sklearn.model_selection import train_test_split
from functools import partial

import tempfile
import os
import urllib
import urllib.request
import requests
import time
import logging
import gc

# from . import upload_file

DATA_DIR = os.path.join(tempfile.gettempdir(), 'haemorrhage_data/{0}/')

DATA_URL = "https://storage.googleapis.com/lovelace-data/{0}"
CSV_FILE = "train_labels.csv"
IMAGE_LOCATION = "images/"
NPZ_FILE = "full_data.npz"

NPZ_URL = 'https://storage.googleapis.com/lovelace-data/{0}/full_data.npz'

CSV_URL = '%s/%s' % (DATA_URL, CSV_FILE)
IMAGES_URL = '%s/%s' % (DATA_URL, IMAGE_LOCATION)

TAG_MAPPING = {
	'none': 0,
	'epidural': 1,
	'intraparenchymal': 2,
	'intraventricular': 3,
	'subarachnoid': 4,
	'subdural': 5
}

log = logging.getLogger(__name__)


def _download_file(filename, url):
	temp_file, _ = urllib.request.urlretrieve(url)
	temp = open(temp_file)
	f = open(filename, "w")
	f.write(temp.read())
	f.close()
	temp.close()


def download_csv(data_dir, arg_data):
	tf.io.gfile.makedirs(data_dir)

	csv_file_path = os.path.join(data_dir, CSV_FILE)
	if not tf.io.gfile.exists(csv_file_path):
		log.info('Downloading file...')
		_download_file(csv_file_path, CSV_URL.format(arg_data))

	log.info('Returned path')
	return csv_file_path


def download_img(image_name, image_file_path, arg_data):
	retries = 15
	log.info('Downloading Image: {0}'.format(image_name))
	full_url = "%s%s.png" % (IMAGES_URL.format(arg_data), image_name)
	save_loc = '%s%s.png' % (image_file_path, image_name)
	if not tf.io.gfile.exists(save_loc):
		while retries > 0:
			try:
				urllib.request.urlretrieve(full_url, save_loc)
				break
			except:
				log.warning('Retrying {0}'.format(full_url))
				retries = retries - 1
				time.sleep(1)


def append_png(image):
	return image + '.png'


# def create_encoder_mapping(data):
# 	labels = set()
# 	for i in range(len(data)):
# 		labels.update(data['Tags'][i].split(' '))
#
# 	labels = list(labels)
# 	labels.sort()
#
# 	labels_dict = {labels[i]: i for i in range(len(labels))}
# 	inv_map = {v: k for k, v in labels_dict.items()}
# 	print(labels_dict)
# 	return labels_dict, inv_map


def encode(tags):
	encoding = np.zeros(len(TAG_MAPPING), dtype='uint8')
	tags_list = tags.split(' ')
	for tag in tags_list:
		encoding[TAG_MAPPING[tag]] = 1
	return encoding.tolist()


def encode_data(data):
	data.fillna('none', inplace=True)
	data['EncodedTag'] = data.apply(lambda row: encode(row['Tags']), axis=1)
	data['ImageNo'] = data['ImageNo'].apply(append_png)


def get_image_arr(image, base_path):
	pic = tf.keras.preprocessing.image.load_img(base_path + image, color_mode='grayscale', target_size=(224, 224))
	pic = tf.keras.preprocessing.image.img_to_array(pic, dtype='uint8')
	return pic


def __load_data(data_arg):
	log.info('Downloading CSV')
	csv_file_path = download_csv(DATA_DIR.format(data_arg), data_arg)
	df = pd.read_csv(csv_file_path)

	image_file_path = os.path.join(DATA_DIR.format(data_arg), IMAGE_LOCATION)
	print(image_file_path)

	cpu_count = multiprocessing.cpu_count()
	log.info("CPU Count: {0}".format(cpu_count))
	pool = multiprocessing.Pool(cpu_count * 2)

	print('downloading images')
	if data_arg == "subset":
		tf.io.gfile.makedirs(image_file_path)
		log.info('Downloading Images')
		download_func = partial(download_img, image_file_path=image_file_path, arg_data=data_arg)
		pool.map(download_func, df['ImageNo'].values)
	else:
		log.info('Downloading and extracting .tgz')
		start = time.time()
		_ = tf.keras.utils.get_file("images",
		                            origin="https://storage.googleapis.com/lovelace-data/all/images.tgz",
		                            extract=True)
		image_file_path = "/root/.keras/datasets/train_images/"
		print("time take: {0}".format(time.time() - start))
	print('done downloading')

	pool.close()
	print(image_file_path)
	log.info('Encoding labels')
	encode_data(df)

	pool = multiprocessing.Pool(cpu_count * 2)
	log.info('Getting Image Data')
	image_arr_func = partial(get_image_arr, base_path=image_file_path)
	images = df['ImageNo'].values
	image_arr = pool.map(image_arr_func, images)
	image_arr = np.asarray(image_arr, dtype='uint8')

	labels = df['EncodedTag'].values
	labels = np.stack(labels, axis=0)
	print(labels.shape)
	print(labels[0])

	log.info('Saving NPZ...')
	np.savez_compressed('full_data.npz', image_arr, labels)
	log.info('Uploading to cloud')
	os.system("gsutil cp full_data.npz gs://lovelace-data/all/")
	# upload_file.upload_file('lovelace-data', 'full_data.npz', data_arg)
	return image_arr, labels


def download_npz(data_arg):
	path = './full_data.npz'

	urllib.request.urlretrieve('https://storage.googleapis.com/lovelace-data/{0}/full_data.npz'.format(data_arg), path)

	data = np.load(path)
	image, label = data['arr_0'], data['arr_1']
	return image, label


def load_data(data_arg):
	request = requests.get(NPZ_URL.format(data_arg))
	if request.status_code == 201:
		log.info('Loading NPZ')
		image, label = download_npz(data_arg)
	else:
		log.info('Loading CSV and images')
		image, label = __load_data(data_arg)

	log.info("Data shapes: {0}, {1}".format(image.shape, label.shape))
	image = np.repeat(image[..., np.newaxis], 3, -1)
	image = image[:, :, :, 0, :]
	log.info(image.shape)

	x_train, x_test, y_train, y_test = train_test_split(image, label, random_state=42, test_size=0.3)
	del image, label
	gc.collect()
	x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, random_state=42, test_size=0.2)
	return x_train, y_train, x_test, y_test, x_val, y_val

load_data('subset')

/var/folders/pb/tgdt80v172g761wzt9j1p_bh0000gn/T/haemorrhage_data/subset/images/
downloading images
done downloading
/var/folders/pb/tgdt80v172g761wzt9j1p_bh0000gn/T/haemorrhage_data/subset/images/
(6, 3000)
[1 1 1 ... 1 1 1]


ValueError: Found input variables with inconsistent numbers of samples: [3000, 6]