In [11]:
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, losses, callbacks

In [3]:
# 데이터셋은 http://oreil.ly/laNUt 에서 직접 다운로드 받았습니다.
with open('./full_format_recipes.json') as json_data:
    recipe_data = json.load(json_data)
    
filtered_data = [
    'Recipe for ' + x['title'] + " | " + ''.join(x['directions']) for x in recipe_data
    if 'title' in x and x['title'] is not None and 'directions' in x and x['directions'] is not None
]

In [4]:
n_recipes = len(filtered_data)
print('{}개 레시피 로드'.format(n_recipes))

example = filtered_data[19]
print(example)

20111개 레시피 로드
Recipe for Tuna, Asparagus, and New Potato Salad with Chive Vinaigrette and Fried Capers  | Puree first 5 ingredients in blender until smooth. With machine running, gradually add vegetable oil, then olive oil. Season to taste with salt and pepper. DO AHEAD: Can be made 1 day ahead. Cover and chill.Cook asparagus in large skillet of boiling salted water until just tender, 4 to 5 minutes. Transfer asparagus to 13x9x2-inch pan of ice water to cool. Drain asparagus and pat dry. DO AHEAD: Can be made 8 hours ahead. Wrap in paper towels, then plastic, and chill.Place potatoes in large saucepan. Add enough water to cover potatoes by 1 inch. Sprinkle with salt. Bring to boil and cook until potatoes are tender, 10 to 15 minutes, depending on size of potatoes. Drain; let cool 5 minutes. Place in medium bowl. Add 1/4 cup vinaigrette; toss to coat. Season to taste with salt and pepper.Heat olive oil in small skillet over medium-high heat. Add capers and fry until capers are crisp and

### 토큰화(tokenization)

토큰화는 텍스트를 단어나 문자와 같은 개별 단위로 나누는 작업.  
텍스트 토큰화는 다양한 방법이 존재하고 방법에 따라 이후 모델의 출력에 많은 영향을 미칩니다.

In [5]:
def pad_punctuation(s):
    s = re.sub(f'([{string.punctuation}])', r' \1', s)
    s = re.sub(' +', ' ', s)
    return s

text_data = [pad_punctuation(x) for x in filtered_data]
text_ds = tf.data.Dataset.from_tensor_slices(text_data).batch(32).shuffle(1000)

vectorize_layer = layers.TextVectorization(
    standardize = 'lower',
    max_tokens = 10000,
    output_mode = 'int',
    output_sequence_length = 200 + 1,
)

vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

2024-07-23 19:47:18.065800: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-23 19:47:18.177051: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-23 19:47:18.177083: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-23 19:47:18.179398: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-23 19:47:18.179431: I tensorflow/compile

In [6]:
for i, word in enumerate(vocab[:10]):
    print('{}: {}'.format(i, word))

0: 
1: [UNK]
2: ,
3: .
4: and
5: to
6: in
7: the
8: with
9: a


In [8]:
example_data = text_data[9]
example_tokenized = vectorize_layer(example_data)
print(example_tokenized.numpy())

2024-07-19 20:13:54.905456: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


[  24   14  571    1    8  306  340  186    4 1103  508   25  342  224
  233  256    5  615   11  129   20  357    3  342   40  256    4  694
    4   66    8  166    4   74    6    9   60   72    2  110    2   56
   12  333    2   85  674   18   33    6    9   28   19    4   63  535
   11  158    3  350  166   95    9  373  679  303   22   19    8  674
    4   38    5  988    3   61    8   21    4   30    3  107   19    6
  173  176 1324    4   55    5  134  105    2   44  721  571    8  278
  233    4  253  285 1039    3  114  674   26   68    4  102   10  107
    2   50  198   11  168  433  103  489    2  306    2  188    2   11
   67  138   21    2    4   11   67  138   30    6    9   27   19    3
   38    6  353    2 3758    2    4  145 1519  508    8 1347    2   31
    2   11   53  138   30    2    4   11   67  138   21    6    9  286
  184    5    9  418  608 3298  508    2   41  340  186    2   18  571
    3    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [7]:
# 훈련 세트 만들기
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


train_ds = text_ds.map(prepare_inputs)

### LSTM 만들기

모델의 입력 = 정수 토큰의 시퀀스,  
모델의 출력 = 10,000개 단어의 어휘 사전에서 시퀀스 다음에 나올 단어의 확률

In [8]:
# 파라미터
VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_SIZE = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

In [19]:
inputs = layers.Input(shape=(None,), dtype='int32')
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_SIZE)(inputs)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
outputs = layers.Dense(VOCAB_SIZE, activation='softmax')(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         1000000   
                                                                 
 lstm_1 (LSTM)               (None, None, 128)         117248    
                                                                 
 dense_1 (Dense)             (None, None, 10000)       1290000   
                                                                 
Total params: 2407248 (9.18 MB)
Trainable params: 2407248 (9.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile('adam', loss_fn)

In [21]:
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }
    
    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs
    
    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [self.word_to_index.get(x, 1) for x in start_prompt.split()]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({'prompt': start_prompt, 'word_probs': probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + ' ' + self.index_to_word[sample_token]
        print(f"\n생성된 텍스트:\n{start_prompt}\n")
        return info
    
    def on_epoch_end(self, epoch, logs=None):
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [22]:
text_generator = TextGenerator(vocab)

In [23]:
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

In [24]:
lstm.fit(train_ds, epochs=EPOCHS, callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator])

Epoch 1/25
생성된 텍트스:
recipe for ice warm and heavy room )prepare skillet | whisk blend , pound and sugar until taste from 

Epoch 2/25
생성된 텍트스:
recipe for roasted wild eggs | together bourbon , , cilantro , and lemon juice in a fork crust until dissolved . caraway cooking the broth from onions ; sprinkle with salt , egg and snow chop green mixture until it mixture snap through 12 cups to pleating through and cool gently while holds heated through and serving to a bowl with a slotted spoon , stir , flour just mushrooms and simmer gently turn while rice comes out ) . when just sambal will -iron the ) into the bowl and cook until )preheat to the (it are desired

Epoch 3/25
생성된 텍트스:
recipe for summer sauce teriyaki warm streusel | in a bowl with both sides of the pie ring with parchment of the best cream into the freeze . chill sheet rest for about 2 minutes .place the slice nuts in a rimmed baking flat . garnish with an electric mixer tightly until fluffy stick cream , if desired . halve l

<keras.src.callbacks.History at 0x7fd7e8fb2990>

In [25]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\n프롬프트: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [26]:
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=1.0
)


생성된 텍트스:
recipe for roasted vegetables | chop 1 / 3 stalk



In [27]:
print_probs(info, vocab)


프롬프트: recipe for roasted vegetables | chop 1 /
3:   	19.66%
2:   	15.81%
1:   	11.09%
4:   	10.61%
5:   	5.07%
--------


프롬프트: recipe for roasted vegetables | chop 1 / 3
/4:   	54.28%
-inch:   	10.58%
inches:   	6.09%
-:   	4.59%
-quart:   	4.04%
--------



In [28]:
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=0.2
)


생성된 텍트스:
recipe for roasted vegetables | chop 1 / 3 /4



In [29]:
print_probs(info, vocab)


프롬프트: recipe for roasted vegetables | chop 1 /
3:   	69.4%
2:   	23.36%
1:   	3.97%
4:   	3.17%
5:   	0.08%
--------


프롬프트: recipe for roasted vegetables | chop 1 / 3
/4:   	99.97%
-inch:   	0.03%
inches:   	0.0%
-:   	0.0%
-quart:   	0.0%
--------



In [30]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=1.0
)
print_probs(info, vocab)


생성된 텍트스:
recipe for chocolate ice cream | simmer


프롬프트: recipe for chocolate ice cream |
in:   	11.1%
combine:   	9.29%
bring:   	7.49%
stir:   	6.64%
put:   	5.62%
--------



In [31]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=0.2
)
print_probs(info, vocab)


생성된 텍트스:
recipe for chocolate ice cream | in


프롬프트: recipe for chocolate ice cream |
in:   	58.09%
combine:   	23.93%
bring:   	8.12%
stir:   	4.44%
put:   	1.93%
--------

