preprocessing

In [1]:
%%writefile ./preprocessing2.py

import os
import re
from PIL import Image

# mat삭제
def delete_mat(data_list):
    for i, data in enumerate(data_list):
        print(data)
        basename = os.path.basename(data)
        # os.path.basename 입력받은 경로의 기본이름을 반환 >> 경로중 파일명만 얻기
        
        _, file = basename.split(".")
        
        if file == "mat":
            del data_list[i]
    return data_list

# 4channel 삭제
def delete_4_channel(data_list):
    for i, data in enumerate(data_list):
        image_data = Image.open(data)
        mode = image_data.mode
        
        if mode != "RGB":
            del data_list[i]
    return data_list

# 라벨 인코딩
def label_encoding(data_list):
    # 방법1
    class_list = []
    for data in data_list:
        basename = os.path.basename(data) 
        label = os.path.splitext(basename)[0] # 파일명에서 확장자만 따로 분리됨
        label = re.sub("_\d+","",label)  #숫자를 공백으로?
        
        if label in class_list:
            continue
        else:
            class_list.append(label)
            
    class_to_index = {cls:i for i, cls in enumerate(class_list)} # 이름:인덱스 형태의 dict생성?
    return class_to_index
            
## label 숫자제거시 기존 이름과 중복될경우는?  car1, car2작업시    

Overwriting ./preprocessing2.py


In [2]:
%%writefile ./tf_record2.py

import os
import re
from glob import glob
from PIL import Image
import tensorflow as tf

class MakeTFRecord:
    
    IMG_SIZE = 224
    
    def __init__(self, data_list, tfr_path, data_class):
        self.data_list = data_list
        self.tfr_path = tfr_path
        self.data_class = data_class
        
    def _make_tf_writer(self):
        '''
        TF writer를 만드는 tf함수
        '''
        writer = tf.io.TFRecordWriter(self.tfr_path)
        return writer
    
    #  The following functions can be used to convert a value to a type compatible
    # with tf.Example >> 값을 호환가능한 유형으로 변환 가능
    @staticmethod
    # 클래스에서 바로 접근가능, self인자 없음
    def _bytes_feature(value):
        if isinstance(value, type(tf.constant(0))):
            # value가 상수이면?
            value = value.numpy() # bytelist won't unpack a string from an EagerTensor
        return tf.train.Feature(bytes_list=tf.train.ByteList(value=[value])) # tf.train.ByteList - string, byte값으로부터 mapping
                                
    @staticmethod
    def _float_feature(value):
        return tf.train.Feature(float_list=tf.train.Floatlist(value=[value])) # float(32), double(float64)값으로부터 mapping
    
    @staticmethod
    def _int64_feature(value):
        return tf.train.Feature(int64_list = tf.train.Int64List(value=[value]))
                                
    def _make_tfrecord(self):
        writer = self._make_tf_writer()
        n = 0
        
        for data in self.data_list:
            image = Image.open(data)
            image = image.resize((self.IMG_SIZE, self.IMG_SIZE)) # 가로(width) 세로(height)
            # tf record byte로 되어있음
            image_to_byte = image.tobyte()
            
            basename = os.path.basename(data)
            label = os.path.split(basename)[0]
            label = re.sub("_\d+", "", label)
            label_num = self.data_class[label]
            # tf.train.Example객체 생성후 인자로 features에 TFRecord에 저장될 값의 목록을 dict로 저장
            example = tf.train.Example(features=tf.train.Features(features={
                "image" : self._bytes_feature(image_to_byte),
                "label" : self._int64_feature(label_num)
            }))
                                
            writer.write(example.SerializeToString())
            # 데이터를 tf.train.Feature로 변환후 tf.Example 정의 후,SerializeToString()이용 직렬화
            n += 1
        writer.close()
        print(f"{n}개의 데이터, TFRecord완성")
                                
    @classmethod # cls라는 인자 필요
    def change_img_size(cls, image_size):
        cls.IMG_SIZE = image_size
    
    # class의 인스턴스를 함수처럼 호출가능
    def __call__(self): 
        print("tfrecord 만들기 시작")
        self._make_tfrecord()
    

Overwriting ./tf_record2.py


In [3]:
%%writefile ./dataloader2.py

import math
import tensorflow as tf

class TFRecordLoader:
    
    def __init__(self, tf_record_path, img_size, n_class, train_size_rate, batch_size):
        self.tfrecord = tfrecord_path
        self.img_size = img_szie
        self.n_class = n_class
        self.train_szie_rate = train_size_rate
        self.batch_size = batch_size
        
    ## tfrecord file을 data로 parsing
    def _parse_function(self, tfrecord_serialized):
        features = {'image' : tf.io.FixedLenFeature([], tf.string),
                   'label' : tf.io.FixedLenFeature([], tf.int64)}
        parsed_features = tf.io.parse_single_example(tfrecord_serialized, features)
        
        image = tf.io.decode_raw(parsed_features['image'], tf.uint8)
        image = tf.reshape(image, [self.img_size, self.img_size, 3])
        # iamge = tf.cast(image, tf.float32)/255.
        
        label = tf.cast(parsed_features['label'], tf.int64)
        label = tf.one_hot(label, self.n_class)

        return image, label
    
    def make_dataset(self):

        dataset = tf.data.TFRecordDataset(self.tfrecord)
        dataset = dataset.map(
            self._parse_function, 
            num_parallel_calls=tf.data.experimental.AUTOTUNE
                    )
        
        train_size = int(float(self.train_size_rate * len(list(dataset))))
        val_size = int(float((1 - self.train_size_rate) * len(list(dataset))))

        buffer_size = len(list(dataset))
        dataset = dataset.shuffle(buffer_size)

        train = dataset.take(train_size)
        train = train.batch(self.batch_size)
        train = train.repeat()
        train = train.prefetch(tf.data.experimental.AUTOTUNE)

        # 수정 필요
        # train dataset 만큼 스킵
        dataset = dataset.skip(train_size)
        # validation 크기만큼 데이터를 가져옴
        valid = dataset.take(val_size)
        # batch dataset으로 만들기
        valid = valid.batch(self.batch_size)

        steps = math.floor(buffer_size / self.batch_size)

        return train, valid, steps 

    def __call__(self):
        return self.make_dataset()

Overwriting ./dataloader2.py


In [4]:
%%writefile main2.py

import os
import argparse
from glob import glob
import tensorflow as tf
from tf_record2 import MakeTFRecord
from preprocessing2 import delete_mat, delete_4_channel, label_encoding

def preprocessing_1(data_path):
    date_path = data_path + "*"
    data_list = glob(data_path)
    
    # 전처리
    data_list = delete_mat(data_list)
    data_list = delete_4_channel(data_list)
    
    data_class = label_encoding(data_list)
    return data_list, data_class

# __name__ :  해당모듈의 이름 출력
#argparse : 사용법메시지 출력?
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=['tfr', 'train', 'test'], help="TFRecord 만들기 or 모델 학습 or 모델 테스트")
    parser.add_argument("--data_path", type=str, default="./", help="데이터가 들어있는 디렉토리 경로")
    parser.add_argument("--tfr_path", type=str, default="./", help="tfrecord가 저장될 디렉토리")
    parser.add_argument("--img_size", type=int, default=224, help="이미지 사이즈 입력")
    args = parser.parse_args()
    
    if args.mode == 'tfr':
        data_list, data_class = preprocessing_1(args.data_path)
        
        IMG_size = args.img_size
        tfrecord = MakeTFRecord(data_list = data_list,
                               tfr_path = args.tfr_path,
                               data_class = data_class)
        
        if args.img_size !=224:
            tf.record.change_img_size(args.img_size)
            
        #tf record 만들기
        tfrecord()

Overwriting main2.py


In [9]:
os.getenv('HOME')

'/home/ssac26'

In [8]:
# let's test
import os
path_1 = os.getenv('HOME')+'/workplace/coding_master/visions/images/'
!python main2.py \
--mode='tfr' \
#--data_path='../visions/images/' \
--data_path=path_1 \
--tfr_path='./tfrecord_data2.tfr'


./
Traceback (most recent call last):
  File "main2.py", line 31, in <module>
    data_list, data_class = preprocessing_1(args.data_path)
  File "main2.py", line 14, in preprocessing_1
    data_list = delete_mat(data_list)
  File "/home/ssac26/workplace/coding_master/prac/preprocessing2.py", line 13, in delete_mat
    _, file = basename.split(".")
ValueError: not enough values to unpack (expected 2, got 1)


In [6]:
pwd

'/home/ssac26/workplace/coding_master/prac'

In [11]:
os.getcwd()

'/home/ssac26/workplace/coding_master/prac'