<a href="https://colab.research.google.com/github/Blueprint-GitHub/Study_Note/blob/main/Deep_learning_with_keras/Deep_learning_with_keras_chapter_12_1_Text_Gen_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2023-11-02 11:01:06--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2023-11-02 11:01:14 (10.4 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
import tensorflow as tf
from tensorflow import keras

# Stream every text file under aclImdb/ as batches of 256 raw strings.
# label_mode=None: only the text is yielded, no labels.
dataset = keras.utils.text_dataset_from_directory(
    directory="aclImdb", label_mode=None, batch_size=256)
# Replace the HTML line-break tag with a space. The pattern must include the
# leading "<"; matching only "br />" leaves a stray "<" in every review.
dataset = dataset.map(lambda x: tf.strings.regex_replace(x, "<br />", " "))

Found 100006 files belonging to 1 classes.


In [None]:
from tensorflow.keras.layers import TextVectorization

# Vectorizer configuration: each review becomes a fixed-length row of
# integer token ids drawn from a vocabulary of at most `vocab_size` entries.
sequence_length = 100  # number of token ids produced per review
vocab_size = 15000     # upper bound on the vocabulary size

text_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the review dataset.
text_vectorization.adapt(dataset)

In [None]:
def prepare_lm_dataset(text_batch):
    """Turn a batch of raw strings into (input, target) token-id pairs.

    The batch is vectorized with the module-level ``text_vectorization``
    layer. The target is the same sequence shifted one step to the left, so
    position ``t`` of the target holds the token that follows position ``t``
    of the input.

    Args:
        text_batch: Batch of strings accepted by ``text_vectorization``.

    Returns:
        Tuple ``(inputs, targets)``: the vectorized batch without its last
        column, and the vectorized batch without its first column.
    """
    token_ids = text_vectorization(text_batch)
    model_inputs = token_ids[:, :-1]  # everything except the final token
    next_tokens = token_ids[:, 1:]    # everything except the first token
    return model_inputs, next_tokens

# Apply prepare_lm_dataset to every batch, processing up to 4 batches in parallel.
lm_dataset = dataset.map(prepare_lm_dataset, num_parallel_calls = 4)

In [None]:
from tensorflow.keras import layers

class PositionalEmbedding(layers.Layer):
    """Custom layer that adds a learned position embedding to token embeddings.

    Args:
        sequence_length: Number of distinct positions the position embedding
            table can represent.
        input_dim: Size of the token vocabulary (rows of the token table).
        output_dim: Dimensionality of both embedding tables.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can serialize the constructor arguments.
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        # One table indexed by token id, one indexed by position.
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)

    def call(self, inputs):
        """Return token embeddings plus position embeddings for ``inputs``."""
        # Positions 0..L-1, where L is the size of the last axis of `inputs`.
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(num_positions)
        token_vectors = self.token_embeddings(inputs)
        position_vectors = self.position_embeddings(position_ids)
        # The position vectors broadcast over the leading (batch) axis.
        return token_vectors + position_vectors

    def compute_mask(self, inputs, mask=None):
        """Mark every entry whose token id is non-zero as valid (0 = padding)."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

In [None]:
class TransformerDecoder(layers.Layer):
    """Transformer decoder block.

    Applies, in order: causal self-attention over ``inputs``, attention of the
    result over ``encoder_outputs``, and a two-layer feed-forward projection.
    Each of the three steps is followed by a residual connection and layer
    normalization.

    Args:
        embed_dim: Used as ``key_dim`` of both attention layers and as the
            output width of the feed-forward projection.
        dense_dim: Width of the hidden ReLU layer of the feed-forward projection.
        num_heads: Number of heads in both attention layers.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        # Let Keras pass an incoming mask to call() and propagate it onward.
        self.supports_masking = True

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build a lower-triangular attention mask for ``inputs``.

        Returns:
            int32 tensor of shape ``(batch, seq, seq)`` whose entry ``[b, i, j]``
            is 1 when ``i >= j`` (position ``i`` may attend to ``j``) and 0
            otherwise, where ``batch`` and ``seq`` are the first two
            dimensions of ``inputs``.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (seq, seq) mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: Target-sequence tensor; its first two dimensions are read
                as (batch, sequence length).
            encoder_outputs: Tensor used as key and value of the second
                attention layer.
            mask: Optional rank-2 mask (indexed as ``mask[:, newaxis, :]``);
                presumably the (batch, seq) padding mask propagated by Keras
                from an upstream layer.

        Returns:
            The layer-normalized output of the feed-forward step.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            # Attend only where both the padding mask and the causal mask allow it.
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # Without an incoming mask there is nothing to hide in the second
            # attention layer. Leaving the name unassigned here would raise
            # UnboundLocalError below.
            padding_mask = None
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [None]:
""" 트랜스포머 기반 언어 생성 모델을 만든다, 트랜스포머는 소스 시퀀스를
인코더에 주입하고, 인코딩된 시퀀스와 타깃 시퀀스를 디코더에 주입해서 한 스텝
이후의 시퀀스를 예측했으나 텍스트 생성 모델에는 소스 시퀀스가 없으므로
transformer의 위치임베딩과 디코더만 가져온다."""

# Decoder-only language model: there is no source sequence and therefore no
# encoder, so only the positional embedding and the decoder block are used.
embed_dim = 256
latent_dim = 2048  # passed as dense_dim: hidden width of the decoder's feed-forward block
num_heads = 2

inputs = keras.Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(
    sequence_length=sequence_length,
    input_dim=vocab_size,
    output_dim=embed_dim,
)(inputs)
# With no encoder, the embedded target sequence is passed both as the decoder
# input and in place of the encoder outputs.
x = TransformerDecoder(
    embed_dim=embed_dim,
    dense_dim=latent_dim,
    num_heads=num_heads,
)(x, x)
# One softmax distribution over the vocabulary per sequence position.
outputs = layers.Dense(units=vocab_size, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")

In [None]:
import numpy as np

# Lookup table from integer token id to vocabulary string.
tokens_index = {
    token_id: token
    for token_id, token in enumerate(text_vectorization.get_vocabulary())
}

def sample_next(predictions, temperature=1.0):
    """Sample an index from a probability vector after temperature scaling.

    The distribution is reweighted as ``p ** (1 / temperature)`` and
    renormalized, then one index is drawn from it. Lower temperatures
    concentrate the mass on the most likely entries; higher temperatures
    flatten the distribution.

    Args:
        predictions: 1-D array-like of probabilities.
        temperature: Softmax temperature. ``0`` selects the most likely index
            deterministically (the limit of the reweighting).

    Returns:
        The sampled index (as returned by ``np.argmax``).
    """
    predictions = np.asarray(predictions).astype("float64")
    if temperature == 0:
        # Dividing by zero below would turn every probability into NaN.
        return np.argmax(predictions)
    # log(0) is -inf, which correctly maps back to probability 0; silence
    # only the divide-by-zero warning it triggers.
    with np.errstate(divide="ignore"):
        scaled_logits = np.log(predictions) / temperature
    # Shift so the largest logit is 0. The result is mathematically unchanged,
    # but at low temperatures this keeps exp() from underflowing every entry
    # to 0, which would make the normalization 0/0.
    scaled_logits -= np.max(scaled_logits)
    exp_preds = np.exp(scaled_logits)
    probabilities = exp_preds / np.sum(exp_preds)
    # One multinomial draw yields a one-hot row; its argmax is the sample.
    draw = np.random.multinomial(1, probabilities, 1)
    return np.argmax(draw)

class TextGenerator(keras.callbacks.Callback):
    """Custom callback that prints text sampled from the model after epochs.

    Starting from ``prompt``, the model is asked for the next token
    repeatedly; each sampled token is appended to the sentence, which is then
    re-vectorized and fed back in.

    Args:
        prompt: Text the generated sentence starts with.
        generate_length: Maximum number of tokens to append to the prompt.
        model_input_length: Maximum number of tokens the model is fed;
            generation stops once the sentence fills this window.
        temperatures: Softmax temperatures to sample with; one sentence is
            generated and printed per temperature.
        print_freq: Generate every ``print_freq`` epochs.
    """

    def __init__(self,
                 prompt,
                 generate_length,
                 model_input_length,
                 temperatures=(1.,),
                 print_freq=1):
        super().__init__()
        self.prompt = prompt
        self.generate_length = generate_length
        self.model_input_length = model_input_length
        self.temperatures = temperatures
        self.print_freq = print_freq

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print one sentence per configured temperature."""
        if (epoch + 1) % self.print_freq != 0:
            return
        for temperature in self.temperatures:
            print("== Generating with temperature", temperature)
            sentence = self.prompt
            for _ in range(self.generate_length):
                tokenized_sentence = text_vectorization([sentence])
                # Padding is id 0, so the non-zero count is the number of
                # real tokens currently in the sentence.
                num_tokens = int(
                    np.count_nonzero(np.asarray(tokenized_sentence[0])))
                window = min(self.model_input_length,
                             int(tokenized_sentence.shape[-1]))
                if num_tokens == 0 or num_tokens >= window:
                    # Nothing to condition on, or the input window is full.
                    break
                predictions = self.model(tokenized_sentence)
                # The output at position t is the distribution of the token
                # that follows input position t, so the next word comes from
                # the last real token. The prompt already occupies the first
                # positions, so the loop counter alone is not the right index.
                next_token = sample_next(
                    predictions[0, num_tokens - 1, :], temperature)
                sampled_token = tokens_index[next_token]
                sentence += " " + sampled_token
            print(sentence)

# Seed text for generation, and the callback that samples 50 tokens from it at
# several softmax temperatures (higher temperature = more surprising text).
prompt = "This movie"
text_gen_callback = TextGenerator(
    prompt=prompt,
    generate_length=50,
    model_input_length=sequence_length,
    temperatures=(0.2, 0.5, 0.7, 1.0, 1.5),
)

In [None]:
# Train for up to 100 epochs; text_gen_callback prints sampled text at the end of each epoch.
model.fit(lm_dataset, epochs=100, callbacks=[text_gen_callback])

""" 예시:
This movie movie is is a a great great movie movie that i is have so seen bad it it is is so so bad bad that that i you have can to say be that a i bad dont that think i its have not ever a seen bad the that
== Generating with temperature 0.5
This movie movie is was a so very bad bad that that i it can is be just bad plain its awful not the funny acting the was only absolutely thing pathetic that the i worst have movie ever the seen only on thing the that first the of movie the i
== Generating with temperature 0.7
This movie is is a not true one story of of the the all heart time of of my the heart most warming popular movie television ever i seeing think the yet times like if this you is should not be that able is to the see [UNK] the that story brought
== Generating with temperature 1.0
This movie is deserves definitely cause a of challenge say everything [UNK] else completely it breathtaking looks the like average monkeys cinemas instructions first is part about is nothing just of filled time with or and the generally dern not if your something average whatsoever viewer and that interesting anyone likable who
== Generating with temperature 1.5
This movie tedious fantastically little gripping establishes supernatural mary from i paz lived witness anthony on horton thrilled absorb wings pretentious sarah proverbial my farscape baby another owner [UNK] flynn movie apparently appeared danny guys amiable mutual haphazard conspiracy pun myth wanders buttons which way fists because cartoon shes network under seem
391/391 [==============================] - 57s 145ms/step - loss: 5.0615
Epoch 8/100
 36/391 [=>............................] - ETA: 42s - loss: 5.0410
 """