In [1]:
import cv2
import datetime as dt
import h5py
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
import numpy as np
import os
import pandas as pd
from glob import glob


In [15]:
def proc_images_optional_resize(pixels,resize = False):
    """
    Saves compressed, resized images as HDF5 datsets
    Returns
        data.h5, where each dataset is an image or class label
        e.g. X23,y23 = image and corresponding class label
    """
    start = dt.datetime.now()
    # ./source_dataset/
    PATH = ""
    if (resize):
        PATH = os.path.abspath(os.path.join('.', 'sources', 'source_dataset'))
    else:
        PATH = os.path.abspath(os.path.join('.', 'sources', 'dataset_' + str(pixels) ))
    
    # ../source_dataset/*/PNEUMONIA/
    SOURCE_IMAGES_PNEUMONIA = os.path.join(PATH, "*", "PNEUMONIA")
    # ../source_dataset/*/PNEUMONIA/*.png
    images_pneumonia = glob(os.path.join(SOURCE_IMAGES_PNEUMONIA, "*.jpeg"))
    
    # ../source_dataset/*/NORMAL/
    SOURCE_IMAGES_NORMAL = os.path.join(PATH, "*", "NORMAL")
    # ../source_dataset/*/NORMAL/*.png
    images_normal = glob(os.path.join(SOURCE_IMAGES_NORMAL, "*.jpeg"))
    
    # Load labels
    #labels = pd.read_csv('../input/sample_labels.csv')
       
    # Set the disease type you want to look for
    disease="PNEUMONIA"
    
    # Size of data
    NUM_IMAGES = len(images_normal) + len(images_pneumonia)
    HEIGHT = pixels
    WIDTH = pixels
    CHANNELS = 3
    SHAPE = (HEIGHT, WIDTH, CHANNELS)
    
    images_normal_resized = []
    images_pneumonia_resized = []
    
    with h5py.File('./datasets/data_complete_' + str(pixels) + '.h5', 'w') as hf:
        for i,img in enumerate(images_normal):            
            # Images
            image = cv2.imread(img)
            if (resize):
                image = cv2.resize(image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC)
                cv2.imwrite(os.path.join("./sources/dataset_" + str(pixels) , img , ".jpeg"),image)
            images_normal_resized.append(image)
            end=dt.datetime.now()
            print("\r", i, ": ", (end-start).seconds, "seconds", end="")    
        for i,img in enumerate(images_pneumonia):            
            image = cv2.imread(img)
            if (resize):
                image = cv2.resize(image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC)
                cv2.imwrite(os.path.join("./sources/dataset_" + str(pixels) , img , ".jpeg"),image)
            images_pneumonia_resized.append(image)
            end=dt.datetime.now()
            print("\r", i+len(images_normal), ": ", (end-start).seconds, "seconds", end="")
        
        # Dataset de X_norm
        num_norm = len(images_normal_resized)        
        Xset = hf.create_dataset(
            name='X_norm',
            data=[images_normal_resized],
            shape=(num_norm, HEIGHT, WIDTH, CHANNELS),
            )
        #base = os.path.basename(img)
        
        # Dataset de Y_norm
        yset = hf.create_dataset(
            name='Y_norm',
            data=np.zeros(num_norm),
            shape=(num_norm,1)
            )
        #diagnosis = 0
        #yset = diagnosis
        
        
        # Dataset de X_pneum
        num_pneum = len(images_pneumonia_resized)
        Xset = hf.create_dataset(
            name='X_pneum',
            data=[images_pneumonia_resized],
            shape=(num_pneum, HEIGHT, WIDTH, CHANNELS),
            )
        #base = os.path.basename(img)
        
        # Dataset de Y_pneum
        yset = hf.create_dataset(
            name='Y_pneum',
            data=np.ones(num_pneum),
            shape=(num_pneum,1)
            )
        #diagnosis = 1
        #yset = diagnosis
        hf.close()

In [17]:
proc_images_optional_resize(64,True)

 5855 :  45 seconds

In [16]:
with h5py.File("./datasets/data_complete.h5","r") as hdf:
    tmp = np.array(hdf["X_pneum"],np.ubyte)
    hdf.close()

In [17]:
tmp

array([[[[ 21,  21,  21],
         [ 37,  37,  37],
         [ 42,  42,  42],
         ...,
         [102, 102, 102],
         [ 95,  95,  95],
         [ 89,  89,  89]],

        [[ 18,  18,  18],
         [ 36,  36,  36],
         [ 43,  43,  43],
         ...,
         [105, 105, 105],
         [ 95,  95,  95],
         [ 86,  86,  86]],

        [[ 14,  14,  14],
         [ 34,  34,  34],
         [ 44,  44,  44],
         ...,
         [107, 107, 107],
         [ 94,  94,  94],
         [ 79,  79,  79]],

        ...,

        [[  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0],
         ...,
         [  5,   5,   5],
         [  0,   0,   0],
         [  0,   0,   0]],

        [[  0,   0,   0],
         [  3,   3,   3],
         [  0,   0,   0],
         ...,
         [  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0]],

        [[  3,   3,   3],
         [ 10,  10,  10],
         [  0,   0,   0],
         ...,
         [  0,   0,   0],
        

In [18]:
data = h5py.File('datasets/data_complete.h5', "r")
data_x_norm = np.array(data["X_norm"],np.ubyte)
print("data_x_norm: ",data_x_norm.shape)
data_x_pneum = np.array(data["X_pneum"],np.ubyte)
print("data_x_pneum: ",data_x_pneum.shape)
data_x = np.concatenate((data_x_norm,data_x_pneum), axis=0)
print("data_x: ",data_x.shape)
data_y_norm = np.array(data["Y_norm"],np.ubyte)
print("data_y_norm: ",data_y_norm.shape)
data_y_pneum = np.array(data["Y_pneum"],np.ubyte)
print("data_y_pneum: ",data_y_pneum.shape)
data_y = np.concatenate((data_y_norm,data_y_pneum), axis=0)
print("data_y: ",data_y.shape)
data.close()

data_x_norm:  (1583, 64, 64, 3)
data_x_pneum:  (4273, 64, 64, 3)
data_x:  (5856, 64, 64, 3)
data_y_norm:  (1583, 1)
data_y_pneum:  (4273, 1)
data_y:  (5856, 1)


In [20]:
ax1 = data_x.reshape(data_x.shape[0],64*64*3)
ax2 = data_y
print("ax1: ", ax1.shape)
print("ax2: ", ax2.shape)
try1 = np.hstack((ax1,ax2))
try1.shape

ax1:  (5856, 12288)
ax2:  (5856, 1)


(5856, 12289)

In [21]:
#print("data_x: ", data_x)
print("data_y: ", data_y)

s = np.arange(data_x.shape[0])
np.random.shuffle(s)

shuffled_x = data_x[s]
shuffled_y = data_y[s]
print("data_x: ", shuffled_x)
print("data_y: ", shuffled_y)

data_y:  [[0]
 [0]
 [0]
 ...
 [1]
 [1]
 [1]]
data_x:  [[[[ 59  59  59]
   [ 86  86  86]
   [ 56  56  56]
   ...
   [ 43  43  43]
   [ 62  62  62]
   [ 78  78  78]]

  [[132 132 132]
   [147 147 147]
   [ 44  44  44]
   ...
   [ 36  36  36]
   [ 46  46  46]
   [ 56  56  56]]

  [[136 136 136]
   [107 107 107]
   [ 19  19  19]
   ...
   [ 33  33  33]
   [ 34  34  34]
   [ 36  36  36]]

  ...

  [[ 14  14  14]
   [ 35  35  35]
   [ 46  46  46]
   ...
   [ 26  26  26]
   [ 28  28  28]
   [ 25  25  25]]

  [[ 16  16  16]
   [ 35  35  35]
   [ 43  43  43]
   ...
   [ 26  26  26]
   [ 28  28  28]
   [ 25  25  25]]

  [[ 17  17  17]
   [ 34  34  34]
   [ 40  40  40]
   ...
   [ 26  26  26]
   [ 28  28  28]
   [ 25  25  25]]]


 [[[  0   0   0]
   [  0   0   0]
   [  7   7   7]
   ...
   [ 45  45  45]
   [ 45  45  45]
   [ 54  54  54]]

  [[  7   7   7]
   [ 14  14  14]
   [ 14  14  14]
   ...
   [ 58  58  58]
   [ 63  63  63]
   [ 74  74  74]]

  [[  0   0   0]
   [ 12  12  12]
   [ 10  10  10

In [22]:
train_set_x_orig, test_set_x_orig = np.split(shuffled_x.sample(frac=1, random_state=1729),[int(0.9 * len(shuffled_x))])

AttributeError: 'numpy.ndarray' object has no attribute 'sample'