In [4]:
import cv2
import datetime as dt
import h5py
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
import numpy as np
import os
import pandas as pd
from glob import glob


In [6]:
def proc_images_normal():
    """
    Saves compressed, resized images as HDF5 datsets
    Returns
        data.h5, where each dataset is an image or class label
        e.g. X23,y23 = image and corresponding class label
    """
    start = dt.datetime.now()
    # ./source_dataset/
    PATH = os.path.abspath(os.path.join('.', 'source_dataset'))
    # ../source_dataset/*/NORMAL/
    SOURCE_IMAGES = os.path.join(PATH, "*", "NORMAL")
    # ../source_dataset/*/NORMAL/*.png
    images = glob(os.path.join(SOURCE_IMAGES, "*.jpeg"))
    # Load labels
    #labels = pd.read_csv('../input/sample_labels.csv')
       
    # Set the disease type you want to look for
    disease="NORMAL"
    
    # Size of data
    NUM_IMAGES = len(images)
    HEIGHT = 256
    WIDTH = 256
    CHANNELS = 3
    SHAPE = (HEIGHT, WIDTH, CHANNELS)
    
    with h5py.File('data_normal.h5', 'w') as hf:
        groupX = hf.create_group("Xset")
        groupY = hf.create_group("Yset")
        for i,img in enumerate(images):            
            # Images
            image = cv2.imread(img)
            image = cv2.resize(image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC)
            Xset = groupX.create_dataset(
                name='X'+str(i),
                data=image,
                shape=(HEIGHT, WIDTH, CHANNELS),
                maxshape=(HEIGHT, WIDTH, CHANNELS),
                compression="gzip",
                compression_opts=9)
            # Labels
            base = os.path.basename(img)
            #diagnosis = labels["Finding Labels"][labels["Image Index"] == base].values[0]
            yset = groupY.create_dataset(
                name='y'+str(i),
                shape=(1,),
                maxshape=(None,),
                compression="gzip",
                compression_opts=9)
            #if disease in finding:
            #diagnosis = 1
            #yset = diagnosis
            #else:
            diagnosis = 0
            yset = diagnosis
            end=dt.datetime.now()
            print("\r", i, ": ", (end-start).seconds, "seconds", end="")

In [7]:
def proc_images_pneumonia():
    """
    Saves compressed, resized images as HDF5 datsets
    Returns
        data.h5, where each dataset is an image or class label
        e.g. X23,y23 = image and corresponding class label
    """
    start = dt.datetime.now()
    # ./source_dataset/
    PATH = os.path.abspath(os.path.join('.', 'source_dataset'))
    # ../source_dataset/*/PNEUMONIA/
    SOURCE_IMAGES = os.path.join(PATH, "*", "PNEUMONIA")
    # ../source_dataset/*/PNEUMONIA/*.png
    images = glob(os.path.join(SOURCE_IMAGES, "*.jpeg"))
    # Load labels
    #labels = pd.read_csv('../input/sample_labels.csv')
       
    # Set the disease type you want to look for
    disease="PNEUMONIA"
    
    # Size of data
    NUM_IMAGES = len(images)
    HEIGHT = 256
    WIDTH = 256
    CHANNELS = 3
    SHAPE = (HEIGHT, WIDTH, CHANNELS)
    
    with h5py.File('data_pneumonia.h5', 'w') as hf:
        groupX = hf.create_group("Xset")
        groupY = hf.create_group("Yset")
        for i,img in enumerate(images):            
            # Images
            image = cv2.imread(img)
            image = cv2.resize(image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC)
            Xset = groupX.create_dataset(
                name='X'+str(i),
                data=image,
                shape=(HEIGHT, WIDTH, CHANNELS),
                maxshape=(HEIGHT, WIDTH, CHANNELS),
                compression="gzip",
                compression_opts=9)
            # Labels
            base = os.path.basename(img)
            #diagnosis = labels["Finding Labels"][labels["Image Index"] == base].values[0]
            yset = groupY.create_dataset(
                name='y'+str(i),
                shape=(1,),
                maxshape=(None,),
                compression="gzip",
                compression_opts=9)
            #if disease in finding:
            diagnosis = 1
            #yset = diagnosis
            #else:
            #diagnosis = 0
            yset = diagnosis
            end=dt.datetime.now()
            print("\r", i, ": ", (end-start).seconds, "seconds", end="")

In [8]:
proc_images_normal()

 211 :  6 seconds

KeyboardInterrupt: 

In [None]:
proc_images_pneumonia()

In [9]:
!ls -lha

total 1.8G
drwxr-xr-x 8 elizondo elizondo 4.0K may 15 19:46  .
drwxr-xr-x 3 elizondo elizondo 4.0K may 14 20:40  ..
-rw-r--r-- 1 elizondo elizondo 1.2G mar 24  2018  chest_xray.zip
-rw-r--r-- 1 elizondo elizondo  32M may 15 19:47  data_normal.h5
-rw-r--r-- 1 elizondo elizondo 573M may 14 23:20  data_pneumonia.h5
drwxr-xr-x 2 elizondo elizondo 4.0K may 14 22:28  datasets
-rwxrwxrwx 1 elizondo elizondo  60K may 14 22:09  DL-App.ipynb
-rwxrwxrwx 1 elizondo elizondo  15K may 14 22:55  dnn_app_utils_v3.py
drwxr-xr-x 8 elizondo elizondo 4.0K may 14 23:45  .git
-rw-r--r-- 1 elizondo elizondo   72 may 14 23:36  .gitignore
drwxrwxrwx 2 elizondo elizondo 4.0K may 15 00:00  images
drwxr-xr-x 2 elizondo elizondo 4.0K may 14 22:07  .ipynb_checkpoints
drwxr-xr-x 2 elizondo elizondo 4.0K may 14 22:07  __pycache__
-rw-r--r-- 1 elizondo elizondo  44K may 15 19:46 'Resize and convert images into h5.ipynb'
drwxr-xr-x 5 elizondo elizondo 4.0K may 14 21:54  source_dataset


In [66]:
data_normal = h5py.File('data_normal.h5', "r")
data_pneumonia = h5py.File('datasets/data_pneumonia.h5', "r")
data_merged = h5py.File('datasets/data_merged.h5', "r")

In [69]:
data_merged_x = data_normal["Xset"]
data_merged_y = data_normal["Yset"]
print(data_merged_x)
print(data_merged_y)

<HDF5 group "/Xset" (1583 members)>
<HDF5 group "/Yset" (1583 members)>


In [79]:
print(data_merged_x)

AttributeError: 'int' object has no attribute 'encode'

In [82]:
def get_all(name):
   print(name)

data_merged_x.visit(get_all)

X0
X1
X10
X100
X1000
X1001
X1002
X1003
X1004
X1005
X1006
X1007
X1008
X1009
X101
X1010
X1011
X1012
X1013
X1014
X1015
X1016
X1017
X1018
X1019
X102
X1020
X1021
X1022
X1023
X1024
X1025
X1026
X1027
X1028
X1029
X103
X1030
X1031
X1032
X1033
X1034
X1035
X1036
X1037
X1038
X1039
X104
X1040
X1041
X1042
X1043
X1044
X1045
X1046
X1047
X1048
X1049
X105
X1050
X1051
X1052
X1053
X1054
X1055
X1056
X1057
X1058
X1059
X106
X1060
X1061
X1062
X1063
X1064
X1065
X1066
X1067
X1068
X1069
X107
X1070
X1071
X1072
X1073
X1074
X1075
X1076
X1077
X1078
X1079
X108
X1080
X1081
X1082
X1083
X1084
X1085
X1086
X1087
X1088
X1089
X109
X1090
X1091
X1092
X1093
X1094
X1095
X1096
X1097
X1098
X1099
X11
X110
X1100
X1101
X1102
X1103
X1104
X1105
X1106
X1107
X1108
X1109
X111
X1110
X1111
X1112
X1113
X1114
X1115
X1116
X1117
X1118
X1119
X112
X1120
X1121
X1122
X1123
X1124
X1125
X1126
X1127
X1128
X1129
X113
X1130
X1131
X1132
X1133
X1134
X1135
X1136
X1137
X1138
X1139
X114
X1140
X1141
X1142
X1143
X1144
X1145
X1146
X1147
X1148
X1149
X115
X1150


In [2]:
def proc_images_all():
    """
    Saves compressed, resized images as HDF5 datsets
    Returns
        data.h5, where each dataset is an image or class label
        e.g. X23,y23 = image and corresponding class label
    """
    start = dt.datetime.now()
    # ./source_dataset/
    PATH = os.path.abspath(os.path.join('.', 'source_dataset'))
    
    # ../source_dataset/*/PNEUMONIA/
    SOURCE_IMAGES_PNEUMONIA = os.path.join(PATH, "*", "PNEUMONIA")
    # ../source_dataset/*/PNEUMONIA/*.png
    images_pneumonia = glob(os.path.join(SOURCE_IMAGES_PNEUMONIA, "*.jpeg"))
    
    # ../source_dataset/*/NORMAL/
    SOURCE_IMAGES_NORMAL = os.path.join(PATH, "*", "NORMAL")
    # ../source_dataset/*/NORMAL/*.png
    images_normal = glob(os.path.join(SOURCE_IMAGES_NORMAL, "*.jpeg"))
    
    # Load labels
    #labels = pd.read_csv('../input/sample_labels.csv')
       
    # Set the disease type you want to look for
    disease="PNEUMONIA"
    
    # Size of data
    NUM_IMAGES = len(images_normal) + len(images_pneumonia)
    HEIGHT = 256
    WIDTH = 256
    CHANNELS = 3
    SHAPE = (HEIGHT, WIDTH, CHANNELS)
    
    with h5py.File('data_complete.h5', 'w') as hf:
        groupX = hf.create_group("X")
        groupY = hf.create_group("Y")
        for i,img in enumerate(images_normal):            
            # Images
            image = cv2.imread(img)
            image = cv2.resize(image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC)
            Xset = groupX.create_dataset(
                name='X'+str(i),
                data=image,
                shape=(HEIGHT, WIDTH, CHANNELS),
                maxshape=(HEIGHT, WIDTH, CHANNELS),
                compression="gzip",
                compression_opts=9)
            # Labels
            base = os.path.basename(img)
            #diagnosis = labels["Finding Labels"][labels["Image Index"] == base].values[0]
            yset = groupY.create_dataset(
                name='Y'+str(i),
                shape=(1,),
                maxshape=(None,),
                compression="gzip",
                compression_opts=9)
            #if disease in finding:
            #diagnosis = 1
            #yset = diagnosis
            #else:
            diagnosis = 0
            yset = diagnosis
            end=dt.datetime.now()
            print("\r", i, ": ", (end-start).seconds, "seconds", end="")
        for i,img in enumerate(images_pneumonia):            
            # Images
            image = cv2.imread(img)
            image = cv2.resize(image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC)
            Xset = groupX.create_dataset(
                name='X'+str(i+len(images_normal)),
                data=image,
                shape=(HEIGHT, WIDTH, CHANNELS),
                maxshape=(HEIGHT, WIDTH, CHANNELS),
                compression="gzip",
                compression_opts=9)
            # Labels
            base = os.path.basename(img)
            #diagnosis = labels["Finding Labels"][labels["Image Index"] == base].values[0]
            yset = groupY.create_dataset(
                name='Y'+str(i+len(images_normal)),
                shape=(1,),
                maxshape=(None,),
                compression="gzip",
                compression_opts=9)
            #if disease in finding:
            diagnosis = 1
            #yset = diagnosis
            #else:
            #diagnosis = 0
            yset = diagnosis
            end=dt.datetime.now()
            print("\r", i+len(images_normal), ": ", (end-start).seconds, "seconds", end="")

In [30]:
proc_images_all()

 5855 :  110 seconds

In [16]:
data_complete = h5py.File('data_complete.h5', "r")
def get_all(name):
   print(name)

#data_complete.visit(get_all)
with h5py.File("data_complete.h5","r") as hdf:
    hdf.visit(get_all)

X
X/X0
X/X1
X/X10
X/X100
X/X1000
X/X1001
X/X1002
X/X1003
X/X1004
X/X1005
X/X1006
X/X1007
X/X1008
X/X1009
X/X101
X/X1010
X/X1011
X/X1012
X/X1013
X/X1014
X/X1015
X/X1016
X/X1017
X/X1018
X/X1019
X/X102
X/X1020
X/X1021
X/X1022
X/X1023
X/X1024
X/X1025
X/X1026
X/X1027
X/X1028
X/X1029
X/X103
X/X1030
X/X1031
X/X1032
X/X1033
X/X1034
X/X1035
X/X1036
X/X1037
X/X1038
X/X1039
X/X104
X/X1040
X/X1041
X/X1042
X/X1043
X/X1044
X/X1045
X/X1046
X/X1047
X/X1048
X/X1049
X/X105
X/X1050
X/X1051
X/X1052
X/X1053
X/X1054
X/X1055
X/X1056
X/X1057
X/X1058
X/X1059
X/X106
X/X1060
X/X1061
X/X1062
X/X1063
X/X1064
X/X1065
X/X1066
X/X1067
X/X1068
X/X1069
X/X107
X/X1070
X/X1071
X/X1072
X/X1073
X/X1074
X/X1075
X/X1076
X/X1077
X/X1078
X/X1079
X/X108
X/X1080
X/X1081
X/X1082
X/X1083
X/X1084
X/X1085
X/X1086
X/X1087
X/X1088
X/X1089
X/X109
X/X1090
X/X1091
X/X1092
X/X1093
X/X1094
X/X1095
X/X1096
X/X1097
X/X1098
X/X1099
X/X11
X/X110
X/X1100
X/X1101
X/X1102
X/X1103
X/X1104
X/X1105
X/X1106
X/X1107
X/X1108
X/X1109
X/X111
X/X1110
X/X1

X/X4314
X/X4315
X/X4316
X/X4317
X/X4318
X/X4319
X/X432
X/X4320
X/X4321
X/X4322
X/X4323
X/X4324
X/X4325
X/X4326
X/X4327
X/X4328
X/X4329
X/X433
X/X4330
X/X4331
X/X4332
X/X4333
X/X4334
X/X4335
X/X4336
X/X4337
X/X4338
X/X4339
X/X434
X/X4340
X/X4341
X/X4342
X/X4343
X/X4344
X/X4345
X/X4346
X/X4347
X/X4348
X/X4349
X/X435
X/X4350
X/X4351
X/X4352
X/X4353
X/X4354
X/X4355
X/X4356
X/X4357
X/X4358
X/X4359
X/X436
X/X4360
X/X4361
X/X4362
X/X4363
X/X4364
X/X4365
X/X4366
X/X4367
X/X4368
X/X4369
X/X437
X/X4370
X/X4371
X/X4372
X/X4373
X/X4374
X/X4375
X/X4376
X/X4377
X/X4378
X/X4379
X/X438
X/X4380
X/X4381
X/X4382
X/X4383
X/X4384
X/X4385
X/X4386
X/X4387
X/X4388
X/X4389
X/X439
X/X4390
X/X4391
X/X4392
X/X4393
X/X4394
X/X4395
X/X4396
X/X4397
X/X4398
X/X4399
X/X44
X/X440
X/X4400
X/X4401
X/X4402
X/X4403
X/X4404
X/X4405
X/X4406
X/X4407
X/X4408
X/X4409
X/X441
X/X4410
X/X4411
X/X4412
X/X4413
X/X4414
X/X4415
X/X4416
X/X4417
X/X4418
X/X4419
X/X442
X/X4420
X/X4421
X/X4422
X/X4423
X/X4424
X/X4425
X/X4426
X/X4427
X/X44

Y/Y1775
Y/Y1776
Y/Y1777
Y/Y1778
Y/Y1779
Y/Y178
Y/Y1780
Y/Y1781
Y/Y1782
Y/Y1783
Y/Y1784
Y/Y1785
Y/Y1786
Y/Y1787
Y/Y1788
Y/Y1789
Y/Y179
Y/Y1790
Y/Y1791
Y/Y1792
Y/Y1793
Y/Y1794
Y/Y1795
Y/Y1796
Y/Y1797
Y/Y1798
Y/Y1799
Y/Y18
Y/Y180
Y/Y1800
Y/Y1801
Y/Y1802
Y/Y1803
Y/Y1804
Y/Y1805
Y/Y1806
Y/Y1807
Y/Y1808
Y/Y1809
Y/Y181
Y/Y1810
Y/Y1811
Y/Y1812
Y/Y1813
Y/Y1814
Y/Y1815
Y/Y1816
Y/Y1817
Y/Y1818
Y/Y1819
Y/Y182
Y/Y1820
Y/Y1821
Y/Y1822
Y/Y1823
Y/Y1824
Y/Y1825
Y/Y1826
Y/Y1827
Y/Y1828
Y/Y1829
Y/Y183
Y/Y1830
Y/Y1831
Y/Y1832
Y/Y1833
Y/Y1834
Y/Y1835
Y/Y1836
Y/Y1837
Y/Y1838
Y/Y1839
Y/Y184
Y/Y1840
Y/Y1841
Y/Y1842
Y/Y1843
Y/Y1844
Y/Y1845
Y/Y1846
Y/Y1847
Y/Y1848
Y/Y1849
Y/Y185
Y/Y1850
Y/Y1851
Y/Y1852
Y/Y1853
Y/Y1854
Y/Y1855
Y/Y1856
Y/Y1857
Y/Y1858
Y/Y1859
Y/Y186
Y/Y1860
Y/Y1861
Y/Y1862
Y/Y1863
Y/Y1864
Y/Y1865
Y/Y1866
Y/Y1867
Y/Y1868
Y/Y1869
Y/Y187
Y/Y1870
Y/Y1871
Y/Y1872
Y/Y1873
Y/Y1874
Y/Y1875
Y/Y1876
Y/Y1877
Y/Y1878
Y/Y1879
Y/Y188
Y/Y1880
Y/Y1881
Y/Y1882
Y/Y1883
Y/Y1884
Y/Y1885
Y/Y1886
Y/Y1887
Y/Y1888
Y/Y18

Y/Y5825
Y/Y5826
Y/Y5827
Y/Y5828
Y/Y5829
Y/Y583
Y/Y5830
Y/Y5831
Y/Y5832
Y/Y5833
Y/Y5834
Y/Y5835
Y/Y5836
Y/Y5837
Y/Y5838
Y/Y5839
Y/Y584
Y/Y5840
Y/Y5841
Y/Y5842
Y/Y5843
Y/Y5844
Y/Y5845
Y/Y5846
Y/Y5847
Y/Y5848
Y/Y5849
Y/Y585
Y/Y5850
Y/Y5851
Y/Y5852
Y/Y5853
Y/Y5854
Y/Y5855
Y/Y586
Y/Y587
Y/Y588
Y/Y589
Y/Y59
Y/Y590
Y/Y591
Y/Y592
Y/Y593
Y/Y594
Y/Y595
Y/Y596
Y/Y597
Y/Y598
Y/Y599
Y/Y6
Y/Y60
Y/Y600
Y/Y601
Y/Y602
Y/Y603
Y/Y604
Y/Y605
Y/Y606
Y/Y607
Y/Y608
Y/Y609
Y/Y61
Y/Y610
Y/Y611
Y/Y612
Y/Y613
Y/Y614
Y/Y615
Y/Y616
Y/Y617
Y/Y618
Y/Y619
Y/Y62
Y/Y620
Y/Y621
Y/Y622
Y/Y623
Y/Y624
Y/Y625
Y/Y626
Y/Y627
Y/Y628
Y/Y629
Y/Y63
Y/Y630
Y/Y631
Y/Y632
Y/Y633
Y/Y634
Y/Y635
Y/Y636
Y/Y637
Y/Y638
Y/Y639
Y/Y64
Y/Y640
Y/Y641
Y/Y642
Y/Y643
Y/Y644
Y/Y645
Y/Y646
Y/Y647
Y/Y648
Y/Y649
Y/Y65
Y/Y650
Y/Y651
Y/Y652
Y/Y653
Y/Y654
Y/Y655
Y/Y656
Y/Y657
Y/Y658
Y/Y659
Y/Y66
Y/Y660
Y/Y661
Y/Y662
Y/Y663
Y/Y664
Y/Y665
Y/Y666
Y/Y667
Y/Y668
Y/Y669
Y/Y67
Y/Y670
Y/Y671
Y/Y672
Y/Y673
Y/Y674
Y/Y675
Y/Y676
Y/Y677
Y/Y678
Y/Y679
Y/Y68
Y/Y680
Y

In [148]:
def proc_images_single_dataset():
    """
    Saves compressed, resized images as HDF5 datsets
    Returns
        data.h5, where each dataset is an image or class label
        e.g. X23,y23 = image and corresponding class label
    """
    start = dt.datetime.now()
    # ./source_dataset/
    PATH = os.path.abspath(os.path.join('.', 'small_dataset'))
    
    # ../source_dataset/*/PNEUMONIA/
    SOURCE_IMAGES_PNEUMONIA = os.path.join(PATH, "*", "PNEUMONIA")
    # ../source_dataset/*/PNEUMONIA/*.png
    images_pneumonia = glob(os.path.join(SOURCE_IMAGES_PNEUMONIA, "*.jpeg"))
    
    # ../source_dataset/*/NORMAL/
    SOURCE_IMAGES_NORMAL = os.path.join(PATH, "*", "NORMAL")
    # ../source_dataset/*/NORMAL/*.png
    images_normal = glob(os.path.join(SOURCE_IMAGES_NORMAL, "*.jpeg"))
    
    # Load labels
    #labels = pd.read_csv('../input/sample_labels.csv')
       
    # Set the disease type you want to look for
    disease="PNEUMONIA"
    
    # Size of data
    NUM_IMAGES = len(images_normal) + len(images_pneumonia)
    HEIGHT = 256
    WIDTH = 256
    CHANNELS = 3
    SHAPE = (HEIGHT, WIDTH, CHANNELS)
    
    images_normal_resized = []
    images_pneumonia_resized = []
    
    with h5py.File('./datasets/data_complete.h5', 'w') as hf:
        #groupX = hf.create_group("X")
        #groupY = hf.create_group("Y")
        for i,img in enumerate(images_normal):            
            # Images
            image = cv2.imread(img)
            #image = cv2.resize(image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC)
            #cv2.imwrite(os.path.join("./resized_dataset/", img , ".jpeg")
            images_normal_resized.append(image)
            end=dt.datetime.now()
            print("\r", i, ": ", (end-start).seconds, "seconds", end="")    
        for i,img in enumerate(images_pneumonia):            
            image = cv2.imread(img)
            #image = cv2.resize(image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC)
            images_pneumonia_resized.append(image)
            end=dt.datetime.now()
            print("\r", i+len(images_normal), ": ", (end-start).seconds, "seconds", end="")
        
        # Dataset de X_norm
        num_norm = len(images_normal_resized)        
        Xset = hf.create_dataset(
            name='X_norm',
            data=[images_normal_resized],
            shape=(num_norm, HEIGHT, WIDTH, CHANNELS),
            )
        #base = os.path.basename(img)
        
        # Dataset de Y_norm
        yset = hf.create_dataset(
            name='Y_norm',
            data=np.zeros(num_norm),
            shape=(num_norm,1)
            )
        #diagnosis = 0
        #yset = diagnosis
        
        
        # Dataset de X_pneum
        num_pneum = len(images_pneumonia_resized)
        Xset = hf.create_dataset(
            name='X_pneum',
            data=[images_pneumonia_resized],
            shape=(num_pneum, HEIGHT, WIDTH, CHANNELS),
            )
        #base = os.path.basename(img)
        
        # Dataset de Y_pneum
        yset = hf.create_dataset(
            name='Y_pneum',
            data=np.ones(num_pneum),
            shape=(num_pneum,1)
            )
        #diagnosis = 1
        #yset = diagnosis
        hf.close()

In [149]:
proc_images_single_dataset()

 5855 :  3 seconds

In [150]:
with h5py.File("./datasets/data_complete.h5","r") as hdf:
    tmp = np.array(hdf["X_pneum"],np.ubyte)
    hdf.close()

In [20]:
tmp

array([[[[ 49,  49,  49, ..., 190, 190, 184],
         [184, 184, 191, ..., 131, 118, 118],
         [118, 123, 123, ..., 132, 132, 132]],

        [[123, 123, 123, ..., 135, 135, 129],
         [129, 129, 125, ..., 143, 136, 136],
         [136, 135, 135, ..., 123, 123, 123]],

        [[123, 123, 123, ..., 117, 117, 117],
         [117, 117, 115, ..., 170, 171, 171],
         [171, 178, 178, ...,  34,  34,  34]],

        ..., 
        [[  8,   8,   8, ..., 210, 210, 207],
         [207, 207, 210, ...,   0,   1,   1],
         [  1,   6,   6, ..., 201, 201, 201]],

        [[200, 200, 200, ..., 210, 210, 160],
         [160, 160, 154, ..., 176, 176, 176],
         [176, 177, 177, ..., 115, 115, 115]],

        [[100, 100, 100, ...,  97,  97,  80],
         [ 80,  80,  68, ..., 186, 188, 188],
         [188, 187, 187, ...,   0,   0,   0]]],


       [[[  0,   0,   0, ..., 212, 212, 212],
         [212, 212, 212, ...,  16,  26,  26],
         [ 26,  35,  35, ..., 180, 180, 180]],

    

In [151]:
data = h5py.File('datasets/data_complete.h5', "r")
data_x_norm = np.array(data["X_norm"],np.ubyte)
print("data_x_norm: ",data_x_norm.shape)
data_x_pneum = np.array(data["X_pneum"],np.ubyte)
print("data_x_pneum: ",data_x_pneum.shape)
data_x = np.concatenate((data_x_norm,data_x_pneum), axis=0)
print("data_x: ",data_x.shape)
data_y_norm = np.array(data["Y_norm"],np.ubyte)
print("data_y_norm: ",data_y_norm.shape)
data_y_pneum = np.array(data["Y_pneum"],np.ubyte)
print("data_y_pneum: ",data_y_pneum.shape)
data_y = np.concatenate((data_y_norm,data_y_pneum), axis=0)
print("data_y: ",data_y.shape)
data.close()

data_x_norm:  (1583, 256, 256, 3)
data_x_pneum:  (4273, 256, 256, 3)
data_x:  (5856, 256, 256, 3)
data_y_norm:  (1583, 1)
data_y_pneum:  (4273, 1)
data_y:  (5856, 1)


In [128]:
ax1 = data_x.reshape(data_x.shape[0],256*256*3)
ax2 = data_y
print("ax1: ", ax1.shape)
print("ax2: ", ax2.shape)
try1 = np.hstack((ax1,ax2))
try1.shape

ax1:  (5856, 196608)
ax2:  (5856, 1)


(5856, 196609)

In [158]:
#print("data_x: ", data_x)
print("data_y: ", data_y)

s = np.arange(data_x.shape[0])
np.random.shuffle(s)

shuffled_x = data_x[s]
shuffled_y = data_y[s]
print("data_x: ", shuffled_x)
print("data_y: ", shuffled_y)

data_y:  [[0]
 [0]
 [0]
 ..., 
 [1]
 [1]
 [1]]
data_x:  [[[[214 214 214]
   [211 211 211]
   [209 209 209]
   ..., 
   [187 187 187]
   [198 198 198]
   [202 202 202]]

  [[211 211 211]
   [208 208 208]
   [207 207 207]
   ..., 
   [180 180 180]
   [192 192 192]
   [201 201 201]]

  [[211 211 211]
   [206 206 206]
   [204 204 204]
   ..., 
   [177 177 177]
   [191 191 191]
   [206 206 206]]

  ..., 
  [[ 49  49  49]
   [ 16  16  16]
   [  0   0   0]
   ..., 
   [  0   0   0]
   [  0   0   0]
   [  0   0   0]]

  [[ 46  46  46]
   [ 14  14  14]
   [  0   0   0]
   ..., 
   [  0   0   0]
   [  0   0   0]
   [  0   0   0]]

  [[ 45  45  45]
   [ 13  13  13]
   [  0   0   0]
   ..., 
   [  0   0   0]
   [  0   0   0]
   [  0   0   0]]]


 [[[  1   1   1]
   [  0   0   0]
   [  0   0   0]
   ..., 
   [  0   0   0]
   [  0   0   0]
   [  0   0   0]]

  [[  1   1   1]
   [  0   0   0]
   [  0   0   0]
   ..., 
   [  0   0   0]
   [  0   0   0]
   [  0   0   0]]

  [[  1   1   1]
   [  0   0  

In [159]:
train_set_x_orig, test_set_x_orig = np.split(shuffled_x.sample(frac=1, random_state=1729),[int(0.9 * len(shuffled_x))])

AttributeError: 'numpy.ndarray' object has no attribute 'sample'