In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import CategoryEncoding # Import CategoryEncoding

# 1. Dữ liệu nhỏ (English → Vietnamese)
data = [
    ("hello", "xin chào"),
    ("how are you", "bạn khỏe không"),
    ("thank you", "cảm ơn bạn"),
    ("good night", "chúc bạn ngủ ngon"),
    ("see you later", "hẹn gặp lại"),
]

# 2. Tách thành 2 list
eng_texts, vi_texts = zip(*data)

# 3. Tokenize mỗi ngôn ngữ
tokenizer_eng = Tokenizer(oov_token="<UNK>")
tokenizer_eng.fit_on_texts(eng_texts)
tokenizer_vi = Tokenizer(oov_token="<UNK>")
tokenizer_vi.fit_on_texts(vi_texts)

seqs_eng = tokenizer_eng.texts_to_sequences(eng_texts)
seqs_vi  = tokenizer_vi.texts_to_sequences(vi_texts)

max_len_eng = max(len(s) for s in seqs_eng)
max_len_vi  = max(len(s) for s in seqs_vi)

seqs_eng = pad_sequences(seqs_eng, maxlen=max_len_eng, padding='post')
seqs_vi  = pad_sequences(seqs_vi,  maxlen=max_len_vi,  padding='post')

# 4. Xây mô hình Encoder-Decoder
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_vi  = len(tokenizer_vi.word_index)  + 1

latent_dim = 64

# Encoder
encoder_inputs = Input(shape=(None,), dtype='int64') # Specify dtype
enc_emb = CategoryEncoding(num_tokens=vocab_size_eng, output_mode="one_hot")(encoder_inputs) # Use CategoryEncoding
enc_emb = Dense(latent_dim, activation='relu')(enc_emb) # Add Dense layer after one-hot encoding
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,), dtype='int64') # Specify dtype
dec_emb = CategoryEncoding(num_tokens=vocab_size_vi, output_mode="one_hot")(decoder_inputs) # Use CategoryEncoding
dec_emb = Dense(latent_dim, activation='relu')(dec_emb) # Add Dense layer after one-hot encoding
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_vi, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# 5. Chuẩn bị dữ liệu mục tiêu (shifted decoder input)
decoder_target = np.zeros_like(seqs_vi)
decoder_target[:, :-1] = seqs_vi[:, 1:]

# 6. Huấn luyện
model.fit([seqs_eng, seqs_vi], np.expand_dims(decoder_target, -1),
          batch_size=2, epochs=300, verbose=0)

# 7. Dự đoán thử
def translate_sentence(input_sentence):
    seq = tokenizer_eng.texts_to_sequences([input_sentence])
    seq = pad_sequences(seq, maxlen=max_len_eng, padding='post')
    # tạo input decoder bắt đầu (ví dụ chỉ token bắt đầu)
    dec_seq = np.zeros((1, max_len_vi))
    # dự đoán
    preds = model.predict([seq, dec_seq])
    pred_ids = np.argmax(preds[0], axis=-1)
    words = [tokenizer_vi.index_word.get(pid, '') for pid in pred_ids]
    return ' '.join(words).strip()

print("Input: how are you")
print("Output:", translate_sentence("how are you"))

Input: how are you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337ms/step
Output: khỏe không
