# In [3]:
from text_processing.NGramCounter import NGramCounter
from text_processing.NGramVectorBuilder import NGramVectorBuilder
from data_module.models import Person, Examination, Description, ImageSeries
from data_module.models import Image as Retina_Image
from random import shuffle
import random, re, PIL, csv, cv2, os, time, sys
import numpy as np
from skimage.transform import resize
np.random.seed(7)
import h5py

# In [309]:
class Data_preparer():
    """Build an HDF5 file of preprocessed grayscale images with 2-element labels.

    The pipeline reads three rows of examination ids (train / validation /
    test) from a text file, looks up the image series and images for those
    examinations through the project's models, preprocesses every image
    (plus a horizontally flipped copy with the label swapped) and writes the
    arrays, labels and per-sample metadata to an HDF5 file.
    """

    def __init__(self):
        self.load_image_resolution()

    def _read_id_row(self, f):
        """Read one line from *f* and return its ids as an int64 array.

        The line is parsed as comma-separated values and the last field is
        dropped (the code assumes each line ends with a trailing comma, which
        leaves an empty final field -- confirm against the file writer).
        """
        row = next(csv.reader(f.readline().split('\n'), delimiter=','), [])
        # int64 instead of uint8: ids above 255 must not wrap or overflow.
        return np.array([int(value) for value in row[:-1]], dtype=np.int64)

    def samples_from_file(self, f):
        """Read the train, validation and test id rows from *f*.

        Args:
            f: an open text file object positioned at the first of three
                comma-separated lines. It is closed before returning, even
                when parsing fails.

        Returns:
            Tuple ``(train_samples, val_samples, test_samples)`` of 1-D
            integer numpy arrays.

        Raises:
            ValueError: if a field cannot be parsed as an integer.
        """
        try:
            train_samples = self._read_id_row(f)
            val_samples = self._read_id_row(f)
            test_samples = self._read_id_row(f)
        finally:
            f.close()
        return train_samples, val_samples, test_samples

    def load_image_resolution(self):
        """Set ``img_size_1`` / ``img_size_2``, the target image shape.

        ``img_size_2`` is fixed at 100 and ``img_size_1`` is scaled by the
        height/width ratio of a 1388x1038 frame (presumably the native
        resolution of the source images -- confirm).
        """
        width = 1388
        height = 1038

        self.img_size_2 = 100
        self.img_size_1 = int(self.img_size_2 * (height / width))

    def get_images_metadata(self, examinations, do_shuffle = True, file=None):
        """Collect one metadata record per image, plus one per flipped image.

        Series whose name ends with ``"after_registration"`` are skipped.
        Series whose name starts with ``"left"`` get label ``[1, 0]``, all
        others ``[0, 1]``; the flipped copy of an image gets the opposite
        label.

        Args:
            examinations: iterable of values accepted by
                ``ImageSeries.objects.filter(examination=...)``.
            do_shuffle: shuffle the records in place before returning.
            file: unused; kept for backward compatibility.

        Returns:
            List of dicts with keys ``series``, ``image_id``, ``y_train``,
            ``image_name`` and ``invert``.
        """
        data = []
        for examin in examinations:
            for series in ImageSeries.objects.filter(examination=examin):
                if series.name.endswith("after_registration"):
                    continue
                if series.name.startswith("left"):
                    y_train, y_train_inv = [1, 0], [0, 1]
                else:
                    y_train, y_train_inv = [0, 1], [1, 0]
                for img_model in Retina_Image.objects.filter(image_series=series):
                    for invert, label in ((False, y_train), (True, y_train_inv)):
                        data.append({'series': series.id, 'image_id': img_model.id, 'y_train': label, 'image_name': img_model.name, 'invert': invert})
        # Honour the flag: previously the list was shuffled unconditionally.
        if do_shuffle:
            shuffle(data)
        return data

    def prepare_image(self,_id, invert):
        """Load image *_id*, convert to grayscale, optionally flip, preprocess.

        Args:
            _id: primary key of the image model to load.
            invert: when true, mirror the image left-to-right first.

        Returns:
            The array produced by :meth:`preprocess_image`.
        """
        # A plain ``import PIL`` does not load the ``Image`` submodule;
        # import it explicitly so ``PIL.Image`` is guaranteed to resolve.
        import PIL.Image

        img = Retina_Image.objects.get(id=_id)
        img = PIL.Image.open(img.image).convert('L')
        if invert:
            img = img.transpose(PIL.Image.FLIP_LEFT_RIGHT)
        return self.preprocess_image(img)

    def standardization(self,image):
        """Return *image* shifted to zero mean and scaled to unit std.

        A constant image (std of 0) yields all zeros instead of NaN.
        """
        centered = image - np.mean(image)
        std = np.std(image)
        if std == 0:
            # Every deviation from the mean is zero; avoid dividing by zero.
            return centered
        return centered / std

    def normalize(self,image):
        """Min-max rescale *image* to the range [0, 1].

        A constant image (max equal to min) yields all zeros instead of NaN.
        """
        min_val = np.amin(image)
        value_range = np.amax(image) - min_val
        if value_range == 0:
            return np.zeros(np.shape(image), dtype=np.float64)
        return (image - min_val) / value_range

    def preprocess_image(self,image):
        """Standardize, resize to ``(img_size_1, img_size_2, 1)``, normalize.

        Args:
            image: anything ``np.array`` can convert to a numeric array.

        Returns:
            The resized array with values rescaled by :meth:`normalize`.
        """
        image = np.array(image)
        image = self.standardization(image)
        image = resize(image, (self.img_size_1, self.img_size_2, 1))
        return self.normalize(image)

    def create_dataset_and_store(self,hdf5_file, name, metadata):
        """Write images, labels and metadata for one split into *hdf5_file*.

        Creates:
            * ``<name>_x``: float32, shape ``(N, img_size_1, img_size_2, 1)``
            * ``<name>_y``: int8, shape ``(N, 2)``
            * ``<name>_metadata/<i>``: one group per sample holding every
              key/value pair of the sample's metadata dict.

        Args:
            hdf5_file: writable HDF5 file object.
            name: prefix for the dataset and group names.
            metadata: sequence of dicts as returned by
                :meth:`get_images_metadata`.
        """
        sample_count = len(metadata)
        x_shape = (sample_count, self.img_size_1, self.img_size_2, 1)
        x_dataset = hdf5_file.create_dataset(name+'_x', x_shape, np.float32)
        y_dataset = hdf5_file.create_dataset(name+'_y', (sample_count, 2), np.int8)

        meta_name = name+'_metadata'
        hdf5_file.create_group(meta_name)
        for i, meta_elem in enumerate(metadata):
            # save X data
            x_dataset[i] = self.get_image(meta_elem)

            # save Y data
            y_dataset[i] = meta_elem['y_train']

            # save metadata
            gr = hdf5_file.create_group(meta_name+'/'+str(i))
            for k, v in meta_elem.items():
                gr[k] = v

    def get_image(self,meta_elem):
        """Return the preprocessed image described by one metadata record."""
        return self.prepare_image(meta_elem['image_id'], meta_elem['invert'])

    def store_all_data_in_h5py_file(self):
        """Build ``./lr-data-size100.hdf5`` from the split in ``splited_data.txt``.

        Prints ``success`` when all three splits were written. On any error
        prints ``fail``, closes and deletes the partial HDF5 file, and
        re-raises the original exception.
        """
        # samples_from_file closes the file itself; the context manager also
        # covers failures that happen before it gets that far.
        with open('splited_data.txt') as f:
            train_samples, val_samples, test_samples = self.samples_from_file(f)

        splits = (
            ('train_data', self.get_images_metadata(train_samples)),
            ('val_data', self.get_images_metadata(val_samples)),
            ('test_data', self.get_images_metadata(test_samples)),
        )

        hdf5_path = './lr-data-size100.hdf5'
        hdf5_file = h5py.File(hdf5_path, mode='w')

        try:
            for split_name, split_metadata in splits:
                self.create_dataset_and_store(hdf5_file, split_name, split_metadata)
            hdf5_file.close()
            print('success')
        except BaseException:
            # Cleanup-and-re-raise: never leave a half-written file behind,
            # including on KeyboardInterrupt.
            print('fail')
            hdf5_file.close()
            os.remove(hdf5_path)
            raise
        


# In [None]:
# Driver cell: instantiate the preparer and run its
# store_all_data_in_h5py_file() entry point.
data_preparer = Data_preparer()
data_preparer.store_all_data_in_h5py_file()