In [None]:
# Install AutoKeras with the %pip magic so the package lands in the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install autokeras

In [None]:
import tensorflow as tf
import autokeras as ak
import keras_tuner as kt
import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend


# **대규모 데이터셋 배치 단위로 불러오기**

## **이미지 데이터셋(mnist)**

In [None]:
# Download the MNIST image archive and unpack it; the following cells read
# the images from the "data" directory.
!!wget https://github.com/datamllab/automl-in-action-notebooks/raw/master/data/mnist.tar.gz
!!tar xzf mnist.tar.gz

In [None]:
import os
import autokeras as ak

# Directory that contains the image folders.
parent_dir = "data"

# Batch size and image dimensions shared by the image-loading calls.
batch_size = 32
img_height, img_width = 28, 28

# image_dataset_from_directory returns a tf.data.Dataset object that holds
# the data.
test_data = ak.image_dataset_from_directory(
    os.path.join(parent_dir, "test"),
    image_size=(img_height, img_width),
    batch_size=batch_size,
    color_mode="grayscale",  # black-and-white images, so a single channel
    seed=42,
)

# Peek at one batch and report the shape and dtype of images and labels.
for images, labels in test_data.take(1):
    for tensor in (images, labels):
        print(tensor.shape, tensor.dtype)

Found 10000 files belonging to 10 classes.
(32, 28, 28, 1) <dtype: 'float32'>
(32,) <dtype: 'string'>


In [None]:
# Note: this loader appears to work like keras.utils.image_dataset_from_directory.
# `subset` is one of "training" or "validation" and is only used when
# `validation_split` is set.
train_dir = os.path.join(parent_dir, "train")

# Options common to both subsets. The seed is kept identical for the training
# and validation calls (presumably so the two splits stay consistent - confirm).
image_split_options = dict(
    validation_split=0.2,
    seed=123,
    color_mode="grayscale",
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

train_data = ak.image_dataset_from_directory(
    train_dir, subset="training", **image_split_options
)

validation_data = ak.image_dataset_from_directory(
    train_dir, subset="validation", **image_split_options
)

Found 60000 files belonging to 10 classes.
Using 48000 files for training.
Found 60000 files belonging to 10 classes.
Using 12000 files for validation.


In [None]:
import tensorflow as tf

# Enable prefetching: while one batch is being used for training or inference,
# the next batch is loaded into memory ahead of time.
train_data, validation_data, test_data = (
    split.prefetch(tf.data.AUTOTUNE)
    for split in (train_data, validation_data, test_data)
)

In [None]:
# Single-trial search over image classifiers, trained for one epoch on the
# batched datasets built above.
clf = ak.ImageClassifier(max_trials=1, overwrite=True)
clf.fit(train_data, validation_data=validation_data, epochs=1)

# evaluate() returns a list of metric values - presumably [loss, accuracy];
# confirm against the compiled metrics.
test_metrics = clf.evaluate(test_data)
print(test_metrics)

Trial 1 Complete [00h 02m 44s]
val_loss: 0.06728985905647278

Best val_loss So Far: 0.06728985905647278
Total elapsed time: 00h 02m 44s
[0.051990289241075516, 0.982200026512146]


## **텍스트 데이터셋(imdb)**

In [None]:
# Download the IMDB review archive and unpack it; the next cell reads from
# "imdb/train" and "imdb/test".
!!wget https://github.com/datamllab/automl-in-action-notebooks/raw/master/data/imdb.tar.gz
!!tar xzf imdb.tar.gz

[]

In [None]:
import os
import autokeras as ak
import tensorflow as tf

# Options common to the training and validation subsets of "imdb/train".
text_split_options = dict(
    validation_split=0.2,
    seed=123,
    max_length=1000,
    batch_size=32,
)

train_data = ak.text_dataset_from_directory(
    "imdb/train", subset="training", **text_split_options
).prefetch(1000)

validation_data = ak.text_dataset_from_directory(
    "imdb/train", subset="validation", **text_split_options
).prefetch(1000)

# The test directory is loaded whole: no split and no explicit batch size.
test_data = ak.text_dataset_from_directory("imdb/test", max_length=1000).prefetch(1000)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
# Single-trial search over text classifiers, trained for two epochs on the
# directory-backed datasets.
clf = ak.TextClassifier(max_trials=1, overwrite=True)
clf.fit(train_data, validation_data=validation_data, epochs=2)

# evaluate() returns a list of metric values - presumably [loss, accuracy];
# confirm against the compiled metrics.
test_metrics = clf.evaluate(test_data)
print(test_metrics)

Trial 1 Complete [00h 04m 39s]
val_loss: 0.344009131193161

Best val_loss So Far: 0.344009131193161
Total elapsed time: 00h 04m 39s
Epoch 1/2
Epoch 2/2
[0.34922268986701965, 0.8521199822425842]


## **범용 사용법**

In [None]:
# Same IMDB download/unpack commands as in the text-dataset section, repeated
# here so this section has the "imdb" directory it reads from.
!!wget https://github.com/datamllab/automl-in-action-notebooks/raw/master/data/imdb.tar.gz
!!tar xzf imdb.tar.gz

[]

In [None]:
import numpy as np
import os
import autokeras as ak
import tensorflow as tf

# Root directory of the IMDB data; joined with "train" / "test" further down.
parent_dir = "imdb"

def load_data(path):
    """Collect labelled file paths from the "pos" and "neg" folders of `path`.

    Args:
        path: Directory containing one sub-directory per class label
            ("pos" and "neg").

    Returns:
        A NumPy string array with one (file_path, class_label) row per file
        found, shuffled along the first axis with NumPy's global random
        state. If both folders are empty the array is empty.
    """
    labelled_paths = [
        (os.path.join(path, sentiment, entry), sentiment)
        for sentiment in ("pos", "neg")
        for entry in os.listdir(os.path.join(path, sentiment))
    ]
    samples = np.array(labelled_paths)
    # Shuffles rows in place, so each path stays paired with its label.
    np.random.shuffle(samples)
    return samples

def get_generator(data):
    """Build a zero-argument generator function over labelled text files.

    Args:
        data: Iterable of (file_path, class_label) pairs.

    Returns:
        A callable taking no arguments. Each call returns a fresh generator
        that yields (text, class_label), where `text` is the full content of
        the file at `file_path`.
    """
    def data_generator():
        for file_path, class_label in data:
            # The context manager closes the handle even when read() raises.
            # The explicit encoding keeps decoding independent of the platform
            # default (assumes the files are UTF-8 - confirm for other corpora).
            with open(file_path, "r", encoding="utf-8") as text_file:
                text = text_file.read()
            yield text, class_label

    return data_generator

# Shuffled array of (file_path, class_label) rows for the "train" directory.
all_train_np = load_data(os.path.join(parent_dir, "train"))

def np_to_dataset(data_np):
    """Wrap (file_path, class_label) rows in a batched tf.data.Dataset.

    Files are read by the generator from get_generator, so text is loaded as
    the dataset is iterated.

    Args:
        data_np: Rows of (file_path, class_label), as produced by load_data.

    Returns:
        A tf.data.Dataset yielding (texts, labels) pairs of string tensors in
        batches of 32, with 5 batches prefetched.
    """
    return (
        tf.data.Dataset.from_generator(
            get_generator(data_np),
            # Each generator item is a length-2 string tensor [text, label].
            # output_signature replaces the deprecated output_types /
            # output_shapes pair, which emitted a deprecation warning.
            output_signature=tf.TensorSpec(shape=(2,), dtype=tf.string),
        )
        # Split the length-2 tensor into a (text, label) pair.
        .map(lambda x: (x[0], x[1]))
        .batch(32)
        .prefetch(5)
    )

# The first 20000 shuffled rows are used for training, the rest for validation.
train_rows, validation_rows = all_train_np[:20000], all_train_np[20000:]
train_data = np_to_dataset(train_rows)
validation_data = np_to_dataset(validation_rows)

test_np = load_data(os.path.join(parent_dir, "test"))
test_data = np_to_dataset(test_np)

# Peek at one batch and print the shape of the texts and of the labels.
for texts, labels in train_data.take(1):
    for batch_part in (texts, labels):
        print(batch_part.shape)

Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead


(32,)
(32,)


In [None]:
# Single-trial search over text classifiers, trained for two epochs on the
# generator-backed datasets.
clf = ak.TextClassifier(max_trials=1, overwrite=True)
clf.fit(train_data, validation_data=validation_data, epochs=2)

# evaluate() returns a list of metric values - presumably [loss, accuracy];
# confirm against the compiled metrics.
test_metrics = clf.evaluate(test_data)
print(test_metrics)

Trial 1 Complete [00h 03m 58s]
val_loss: 0.2792929410934448

Best val_loss So Far: 0.2792929410934448
Total elapsed time: 00h 03m 58s
Epoch 1/2
Epoch 2/2
[0.281268447637558, 0.8878800272941589]


# **병렬화(다중 GPU)**

병렬화는 3가지 범주로 나눌 수 있다.
1. 데이터 병렬화
2. 모델 병렬화
3. 튜닝 병렬화


In [None]:
# autokeras의 데이터 병렬화
import tensorflow as tf
from tensorflow.keras.datasets import mnist
import autokeras as ak

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data parallelism in AutoKeras: pass a tf.distribute strategy to the task.
mirrored_strategy = tf.distribute.MirroredStrategy()
clf = ak.ImageClassifier(
    distribution_strategy=mirrored_strategy,
    max_trials=1,
    overwrite=True,
)
clf.fit(x_train, y_train, epochs=1)

Trial 1 Complete [00h 02m 26s]
val_loss: 0.06569601595401764

Best val_loss So Far: 0.06569601595401764
Total elapsed time: 00h 02m 26s


<keras.src.callbacks.History at 0x7bdbb24e6800>

In [None]:
# keras tuner의 데이터 병렬화
import keras_tuner as kt

def build_model(hp):
    """Build and compile a small dense classifier for a KerasTuner trial.

    Args:
        hp: Hyperparameter container; supplies the integer "units"
            (32 to 512 in steps of 32) for the hidden layer width.

    Returns:
        A compiled tf.keras.Sequential model: Flatten, a ReLU Dense layer
        with the tuned width, and a 10-way softmax output, using the Adam
        optimizer and sparse categorical cross-entropy loss.
    """
    hidden_units = hp.Int("units", min_value=32, max_value=512, step=32)
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(units=hidden_units, activation="relu"),
            tf.keras.layers.Dense(10, activation="softmax"),
        ]
    )
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
    return model

# Data parallelism in KerasTuner: the distribution strategy is set on the tuner.
search_strategy = tf.distribute.MirroredStrategy()

tuner = kt.RandomSearch(
    build_model,
    max_trials=1,
    objective="val_loss",
    distribution_strategy=search_strategy,
    directory="my_dir",
    project_name="dist_helloworld",
)

# The test split doubles as validation data for this demonstration.
validation_pair = (x_test, y_test)
tuner.search(x_train, y_train, validation_data=validation_pair, epochs=1)


# **Transfer learning(전이 학습 / Warm Start)**

In [None]:
import tensorflow as tf
import autokeras as ak

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Chain the AutoKeras blocks in order; the ResNet block uses pretrained
# weights (pretrained=True).
input_node = ak.ImageInput()
output_node = input_node
for block in (
    ak.Normalization(),
    ak.ImageAugmentation(),
    ak.ResNetBlock(pretrained=True),
    ak.ClassificationHead(),
):
    output_node = block(output_node)

model = ak.AutoModel(
    inputs=input_node,
    outputs=output_node,
    overwrite=True,
    max_trials=2,
)

# Only the first 100 training samples are used for the search.
sample = slice(100)
model.fit(x_train[sample], y_train[sample], epochs=1)
model.evaluate(x_test, y_test)

[3.3877246379852295, 0.1005999967455864]

In [None]:
import tensorflow as tf
import keras_tuner as kt


def build_model(hp):
    """Build a ResNet50-based 10-class classifier for a KerasTuner trial.

    Args:
        hp: Hyperparameter container; supplies two booleans:
            "pretrained" - use ImageNet weights instead of random init;
            "freeze" - make the ResNet50 backbone non-trainable.

    Returns:
        A tf.keras.Model taking (32, 32, 3) inputs and producing 10 softmax
        scores per sample, compiled with sparse categorical cross-entropy
        loss (no optimizer is passed to compile).
    """
    if hp.Boolean("pretrained"):
        weights = "imagenet"
    else:
        weights = None
    # pooling="avg" global-average-pools the backbone's final feature map.
    # Without it the Dense head is applied to a 4-D tensor and the
    # predictions keep the spatial axes instead of being (batch, 10).
    resnet = tf.keras.applications.ResNet50(
        include_top=False, weights=weights, pooling="avg"
    )
    if hp.Boolean("freeze"):
        resnet.trainable = False

    input_node = tf.keras.Input(shape=(32, 32, 3))
    output_node = resnet(input_node)
    output_node = tf.keras.layers.Dense(10, activation="softmax")(output_node)
    model = tf.keras.Model(inputs=input_node, outputs=output_node)
    model.compile(loss="sparse_categorical_crossentropy")
    return model


(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Random search over the "pretrained" / "freeze" choices defined in build_model.
tuner = kt.RandomSearch(
    build_model,
    max_trials=4,
    objective="val_loss",
    directory="result_dir",
    project_name="pretrained",
    overwrite=True,
)

# Only the first 100 samples of each split are used for the search.
sample = slice(100)
tuner.search(
    x_train[sample],
    y_train[sample],
    validation_data=(x_test[sample], y_test[sample]),
    epochs=1,
)

Trial 4 Complete [00h 00m 09s]
val_loss: 4.698379039764404

Best val_loss So Far: 4.698379039764404
Total elapsed time: 00h 01m 03s
