In [1]:
# -*- coding: utf-8 -*-
# https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/text/text_generation.ipynb#scrollTo=UK-hmKjYVoll
# https://colab.research.google.com/drive/11ujHkZmwOY1oj55Dad2C88h9WkIshGHs?authuser=1#scrollTo=tGMHNyz6RpBD
import tensorflow as tf
import numpy as np
import json
import os
import time 
import string
from underthesea import sent_tokenize
from tensorflow.keras.layers import Embedding, GRU, Dense, LSTM


In [2]:
# Characters stripped in addition to ASCII punctuation: a backslash followed by
# curly quotes and the ellipsis. The backslash is written as an explicit escape
# ('\\') because '\“' is not a valid escape sequence.
_EXTRA_PUNCTUATION = '\\“”…'
# Guarded so that re-running this cell does not keep growing the stdlib constant.
if not string.punctuation.endswith(_EXTRA_PUNCTUATION):
    string.punctuation = string.punctuation + _EXTRA_PUNCTUATION

# Punctuation that preprocessing_data leaves in place.
_KEPT_PUNCTUATION = ".-/:"


def preprocessing_data(text):
    """Strip punctuation from ``text`` and normalise its whitespace.

    Every character of ``string.punctuation`` (as extended above) is deleted,
    except for ``.``, ``-``, ``/`` and ``:``. Runs of whitespace are then
    collapsed to single spaces and leading/trailing whitespace is dropped.
    Letter case is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    removable = "".join(
        ch for ch in string.punctuation if ch not in _KEPT_PUNCTUATION
    )
    text = text.translate(str.maketrans('', '', removable))
    # split() with no argument splits on any whitespace run, so the join
    # both collapses repeated whitespace and trims the ends.
    return " ".join(text.split())

# Hyperparameters passed to build_model().
# NOTE(review): these presumably have to match the configuration the loaded
# checkpoint was trained with — confirm before changing any of them.
# Length of the vocabulary in chars
vocab_size = 159 # = len(vocab); hard-coded because vocab is only loaded further down in this cell
# The embedding dimension
embedding_dim = 128
# Number of RNN units
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: Number of distinct characters; used as the embedding input
            size and as the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch dimension of the input; the sequence length
            dimension is left as ``None``.

    Returns:
        An un-compiled ``tf.keras.Sequential`` model whose Dense output has no
        activation, i.e. one raw score per vocabulary entry at every timestep.
    """
    char_embedding = Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # stateful=True keeps the recurrent state between successive calls.
    recurrent = LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    scores = Dense(vocab_size)
    return tf.keras.Sequential([char_embedding, recurrent, scores])

# charvocab.txt is parsed as JSON and holds the character vocabulary. The
# encoding is given explicitly so that reading the non-ASCII (Vietnamese)
# characters does not depend on the platform's default locale encoding.
with open("charvocab.txt", "r", encoding="utf-8") as fp:
    vocab = json.load(fp)
print(vocab)

# Sort so that each character always receives the same index.
vocab = sorted(vocab)
# Mapping from unique characters to indices ...
char2idx = {u: i for i, u in enumerate(vocab)}
# ... and the inverse lookup: position i holds the character with index i.
idx2char = np.array(vocab)

[' ', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', 'Á', 'Â', 'Ô', '×', 'Ù', 'Ú', 'Ý', 'à', 'á', 'â', 'ã', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ù', 'ú', 'ý', 'ă', 'Đ', 'đ', 'ĩ', 'ũ', 'ơ', 'Ư', 'ư', 'ạ', 'Ả', 'ả', 'Ấ', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ', 'Ắ', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'ẹ', 'ẻ', 'ẽ', 'ế', 'ề', 'ể', 'ễ', 'ệ', 'ỉ', 'ị', 'ọ', 'ỏ', 'Ố', 'ố', 'ồ', 'Ổ', 'ổ', 'ỗ', 'ộ', 'ớ', 'ờ', 'Ở', 'ở', 'ỡ', 'ợ', 'ụ', 'Ủ', 'ủ', 'Ứ', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'ỳ', 'ỵ', 'ỷ', 'ỹ', '\u200b', '–', '‘', '’']


In [3]:
# Text is generated one sequence at a time, so the network is rebuilt with a
# batch dimension of 1 and the saved weights are loaded into it.
INFERENCE_BATCH_SIZE = 1
model1 = build_model(vocab_size, embedding_dim, rnn_units, batch_size=INFERENCE_BATCH_SIZE)

model1.load_weights("ckpt_100")
# Use the same batch dimension the model was constructed with.
model1.build(tf.TensorShape([INFERENCE_BATCH_SIZE, None]))

model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (1, None, 128)            20352     
_________________________________________________________________
lstm (LSTM)                  (1, None, 1024)           4722688   
_________________________________________________________________
dense (Dense)                (1, None, 159)            162975    
Total params: 4,906,015
Trainable params: 4,906,015
Non-trainable params: 0
_________________________________________________________________


In [15]:
def generate_text(model, start_string, num_generate=500, temperature=0.1):
    """Sample a continuation of ``start_string`` and return its leading sentences.

    The seed is fed through the model once, then the model is run one
    character at a time, each sampled character becoming the next input.
    Generation always runs for ``num_generate`` steps; cutting the result
    down to sentences is done afterwards by ``sent_tokenize``.

    Args:
        model: The network to sample from. It is called with a batch of one
            sequence and its recurrent state is reset before sampling starts.
        start_string: Non-empty seed text; every character must be a key of
            ``char2idx``.
        num_generate: Number of characters to sample.
        temperature: Positive divisor applied to the model output before
            sampling. Lower values make the sampling more predictable,
            higher values more surprising.

    Returns:
        A list with at most the first two sentences of
        ``start_string`` + generated text, as split by ``sent_tokenize``.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character that is not in the
            vocabulary.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    unknown = sorted({ch for ch in start_string if ch not in char2idx})
    if unknown:
        raise KeyError(
            "start_string contains characters missing from the vocabulary: "
            + repr(unknown)
        )

    # Converting our start string to numbers (vectorizing), with a leading
    # batch dimension of 1.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Only the sample for the last timestep is used as the next character.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # The sampled character is the model's next input; earlier context is
        # carried by the model's recurrent state.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    # Return (up to) the first two sentences.
    return sent_tokenize(start_string + ''.join(text_generated))[:2]


# INPUT TEXT HERE
# Named seed_text rather than `input` so the builtin input() is not shadowed.
seed_text = "Yamaha Exciter"
headline = generate_text(model1, start_string=seed_text)
# generate_text returns at most two sentences; joining the list also copes
# with the case where fewer than two come back.
sentence = " ".join(headline)
print(sentence)


Yamaha Exciter 150 theo phong cách môtô đua này có giá 76 triệu đồng cho bản ABS cá tính. Nguyên bản chiếc Honda Cross Cub 110 sẽ có mặt tại đại lý từ ngày 20/10.Tại thị trường Việt Nam BMW Motorrad Việt Nam còn phân phối phiên bản R của Latte là 790 mm trong khi các mẫu xe tay ga được bán chính hãng tại Việt Nam.
