In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku 
import tensorflow as tf
import string
import os
import re
import numpy as np

2025-10-13 05:07:38.865905: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760332059.315391      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760332059.472917      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
data_path = '/kaggle/input/10000-vietnamese-books/output'

# **1. Data Preprocessing**

In [3]:
def preprocess(text):
    #Lower text
    text = text.lower() 
    #Remove url 
    text = re.sub(r"https?://\S+", '', text)
    #Remove punctuation
    text = re.sub(r"[^a-zA-ZÀ-ỹ\s]", " ", text)
    #Strip extra space 
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [4]:
#Build corpus with 10 files
corpus = []
max_files = 10
for i, file_name in enumerate(os.listdir(data_path)):
    if i >= max_files:
        break

    file_path = os.path.join(data_path, file_name)
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for line in file:
            sentences = line.split(".")
            for sentence in sentences:
                cleaned = preprocess(sentence)
                if cleaned:                        
                    corpus.append(cleaned)

print("Corpus built successfully!")
print("Number of sentences:", len(corpus))
print("Example:", corpus[:5])

Corpus built successfully!
Number of sentences: 11393
Example: ['mạc can', 'nhà ảo thuật', 'có một cậu bé muốn học vài trò ảo thuật nhưng không tìm đâu ra trường và thầy dạy', 'một hôm rỗi rảnh người cha chở cậu con trai nhỏ xíu khoảng tuổi ngồi háo hức sau yên xe honda', 'vòng quanh đường phố một lúc đang bình thường thì tự nhiên chiếc xe khục khịch ho khan dừng lại đầu hẻm như là có hẹn trước ngay chóc tiệm thuốc tây và cái tủ bán thuốc lá']


In [5]:
corpus = list(filter(None, corpus))
text = " ".join(corpus)
#Tokenization 
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1
print("Vocabulary size:", total_words)

Vocabulary size: 7064


# **2. LSTM for Text Generation**

In [6]:
#Input senquence for next word prediction 
input_sequence = []
for line in corpus: 
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequence.append(n_gram_sequence)

print("Total sequences:", len(input_sequence))

Total sequences: 291487


In [7]:
#Pad sequence 
max_sequence_len = max([len(x) for x in input_sequence])
input_sequence = np.array(pad_sequences(input_sequence, maxlen=max_sequence_len, padding='pre'))

X = input_sequence[:, :-1]
y = input_sequence[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

print("Max sequence length:", max_sequence_len)
print("X shape:", X.shape)
print("y shape:", y.shape)

Max sequence length: 321
X shape: (291487, 320)
y shape: (291487, 7064)


In [8]:
#Build LSTM model
model = Sequential([
    Embedding(input_dim=total_words, output_dim=128),
    LSTM(150),
    Dropout(0.2),
    Dense(total_words, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_sequence_len - 1))
model.summary()

I0000 00:00:1760332192.976708      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1760332192.977389      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [9]:
history = model.fit(X, y, epochs=30, batch_size=128, verbose=1)

Epoch 1/30


I0000 00:00:1760332231.696009      99 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m2278/2278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 31ms/step - accuracy: 0.0270 - loss: 6.7785
Epoch 2/30
[1m2278/2278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 34ms/step - accuracy: 0.1199 - loss: 5.7117
Epoch 3/30
[1m2278/2278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 34ms/step - accuracy: 0.1683 - loss: 5.2168
Epoch 4/30
[1m2278/2278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 34ms/step - accuracy: 0.1894 - loss: 4.9321
Epoch 5/30
[1m2278/2278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 34ms/step - accuracy: 0.2043 - loss: 4.7359
Epoch 6/30
[1m2278/2278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 34ms/step - accuracy: 0.2156 - loss: 4.5793
Epoch 7/30
[1m2278/2278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 34ms/step - accuracy: 0.2269 - loss: 4.4448
Epoch 8/30
[1m2278/2278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 34ms/step - accuracy: 0.2359 - loss: 4.3417
Epoch 9/30
[1m2278

In [17]:
#Generate text
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]

        for word, index in tokenizer.word_index.items():
            if index == predicted:
                seed_text += " " + word
                break
    return seed_text

print("\nGenerated Text Example:")
print(generate_text("tôi là", 30, model, max_sequence_len))


Generated Text Example:
tôi là một người hùng theo lối sống trong khi đó có một người dân chúng ở đây có thể có thể dùng những người khác nhau để có thể tự thân
