In [1]:
# If running with Google Colab: mount Google Drive at /content/drive.
# The Colab-only import means this cell is expected to fail outside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Folder that becomes the working directory; the relative file names used
# later in this notebook ('model1.h5', 'name_data_mapping.pkl') are opened
# after this chdir, so they are looked up inside this folder.
dataFolder = '/content/drive/MyDrive/Colab Notebooks/LM'
import os
os.chdir(dataFolder)

In [3]:
from pickle import load
%tensorflow_version 1.x
import tensorflow as tf
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.backend.tensorflow_backend import set_session
import numpy as np
import re

TensorFlow 1.x selected.


Using TensorFlow backend.


In [4]:
## TensorFlow 1.x session setup (uses the TF1-style tf.ConfigProto / tf.Session API)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
sess = tf.Session(config=config)
# Hand the configured session to the Keras TensorFlow backend.
set_session(sess)
# Report the GPU device name and the number of physical GPUs TensorFlow lists.
print("use-gpu:", tf.test.gpu_device_name())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

use-gpu: /device:GPU:0
Num GPUs Available:  1


In [5]:
# Length of the encoded character window; passed to encode_string, which
# forwards it to pad_sequences as maxlen.
SEQ_LENGTH = 16
# In correct_one_mistake_multi_choice_with_proba, a typed character whose
# predicted probability is above this value is accepted as typed.
CORRECT_THRESHOLD = 0.001

In [6]:
def text_cleaner(text):
    """Normalise raw text to lower-case words separated by single spaces.

    Steps, in order:
      1. lower-case the text;
      2. drop a possessive ``'s`` that ends a word;
      3. replace every character that is not an ASCII letter or one of the
         listed Vietnamese letters with a space;
      4. collapse runs of whitespace and trim both ends.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string (empty when no letters remain).
    """
    lowered = text.lower()
    without_possessive = re.sub(r"'s\b", "", lowered)
    letters_only = re.sub(
        "[^a-zA-ZạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ]",
        " ",
        without_possessive,
    )
    # split() without arguments never yields empty tokens, so joining its
    # result keeps every word and leaves no leading/trailing whitespace.
    return " ".join(letters_only.split())

def encode_string(mapping, seq_length, in_text):
    """Encode a string as one fixed-length row of integer character codes.

    Args:
        mapping: dict from character to integer code; every character of
            ``in_text`` must be a key (a missing one raises ``KeyError``).
        seq_length: value forwarded to ``pad_sequences`` as ``maxlen``.
        in_text: the string to encode.

    Returns:
        The result of ``pad_sequences`` for a single sequence, truncated with
        ``truncating='pre'`` (per the Keras documentation this drops codes
        from the front of an over-long sequence; padding uses the defaults).
    """
    char_codes = list(map(mapping.__getitem__, in_text))
    return pad_sequences([char_codes], maxlen=seq_length, truncating='pre')

def decode_string(mapping, in_text):
    """Turn a sequence of integer codes back into text.

    Args:
        mapping: dict from character to integer code.
        in_text: iterable of (hashable) integer codes.

    Returns:
        The concatenation of the characters for each code. Codes that do not
        occur in ``mapping`` are skipped. If several characters share a code,
        the one that comes first in ``mapping`` is used.
    """
    # Build the reverse lookup once instead of scanning the whole mapping for
    # every code; setdefault keeps the first character seen for each code.
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)
    return "".join(index_to_char[code] for code in in_text if code in index_to_char)

def insert(source_str, insert_str, pos):
    """Return ``source_str`` with ``insert_str`` spliced in before index ``pos``.

    ``pos`` follows normal slice semantics, so negative or out-of-range values
    are clamped the way slicing clamps them.
    """
    head, tail = source_str[:pos], source_str[pos:]
    return head + insert_str + tail

def replace(source_str, insert_str, start_pos):
    """Overwrite characters of ``source_str`` with ``insert_str`` from ``start_pos``.

    Args:
        source_str: the original string.
        insert_str: the characters to write over ``source_str``.
        start_pos: index of the first character to overwrite.

    Returns:
        The new string. When ``start_pos`` is greater than the length of
        ``source_str`` the input is returned unchanged. When the overwrite
        runs past the end of ``source_str`` the surplus characters are
        appended (previously this raised ``IndexError``).
    """
    chars = list(source_str)
    if start_pos > len(chars):
        return source_str
    for offset, new_char in enumerate(insert_str):
        target = start_pos + offset
        if target < len(chars):
            chars[target] = new_char
        else:
            # Writing past the current end: grow the string instead of failing.
            chars.append(new_char)
    return ''.join(chars)

In [7]:
# load the model
model = load_model('model1.h5')

# load the mapping (character -> integer code)
# NOTE: unpickling can execute arbitrary code; only load a mapping file that
# comes from a trusted source.
with open('name_data_mapping.pkl', 'rb') as mapping_file:
    # The context manager closes the file handle once the mapping is read.
    mapping = load(mapping_file)

# vocabulary size
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

print("Number of left layers: %d" % len(model.layers))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Vocabulary Size: 97
Number of left layers: 4


In [8]:
# Reverse lookup: integer code -> character.
idx_char_mapping = {index: char for char, index in mapping.items()}

In [9]:
def next_char_predict(model, mapping, seq_length, seed_text, num=5):
    """Return the ``num`` most probable next characters for ``seed_text``.

    Args:
        model: model exposing ``predict_proba`` for an encoded sequence.
        mapping: dict from character to integer code.
        seq_length: window length forwarded to ``encode_string``.
        seed_text: raw text; it is normalised with ``text_cleaner`` first.
        num: how many candidates to return (default 5).

    Returns:
        A list of ``(character, probability)`` tuples ordered from most to
        least probable. The list is empty when ``num`` is not positive.
    """
    if num <= 0:
        return []

    cleaned = text_cleaner(seed_text)
    encoded = encode_string(mapping, seq_length, cleaned)
    probabilities = model.predict_proba(encoded)[0]

    # argsort is ascending: reverse it, then keep the first ``num`` entries.
    top_indices = np.argsort(probabilities)[::-1][:num]

    # Derive the reverse lookup from the mapping that was passed in, so the
    # function does not depend on a notebook-level global.
    index_to_char = {index: char for char, index in mapping.items()}
    return [(index_to_char[idx], probabilities[idx]) for idx in top_indices]

In [10]:
# Interactive demo: read a text prefix, print the candidates returned by
# next_char_predict, and repeat until the user types 'Exit'.
while True:
    inputText = input('Input: ')
    output = inputText
    if (inputText == 'Exit'): break
    recs = next_char_predict(model, mapping, SEQ_LENGTH, inputText, num=5)
    # Each candidate character is appended to the raw input as typed
    # (not to the cleaned text that next_char_predict feeds to the model).
    for idx, res in enumerate(recs):
      print(f"Output {idx+1} :", output + res[0],"\t Probability:", res[1])
    print('\n')

Input: chiều nay khôn
Output 1 : chiều nay không 	 Probability: 0.9999974
Output 2 : chiều nay khôn  	 Probability: 2.474856e-06
Output 3 : chiều nay khônh 	 Probability: 1.0089005e-07
Output 4 : chiều nay khônở 	 Probability: 3.1766674e-09
Output 5 : chiều nay khônc 	 Probability: 4.1441098e-10


Input: một chiều th
Output 1 : một chiều thu 	 Probability: 0.30719784
Output 2 : một chiều thứ 	 Probability: 0.11248338
Output 3 : một chiều tha 	 Probability: 0.08818023
Output 4 : một chiều thư 	 Probability: 0.06692345
Output 5 : một chiều the 	 Probability: 0.05865804


Input: chiều chủ nhậ
Output 1 : chiều chủ nhật 	 Probability: 0.9629889
Output 2 : chiều chủ nhận 	 Probability: 0.027300702
Output 3 : chiều chủ nhậm 	 Probability: 0.0095649995
Output 4 : chiều chủ nhập 	 Probability: 9.889818e-05
Output 5 : chiều chủ nhậu 	 Probability: 4.2497508e-05


Input: hoa rơi cửa ph
Output 1 : hoa rơi cửa phủ 	 Probability: 0.38630292
Output 2 : hoa rơi cửa phư 	 Probability: 0.13650617
Output

In [None]:
def correct_one_mistake_multi_choice_with_proba(model, mapping, input_text, num=5):
    """Suggest replacements for the single most suspicious character of a text.

    The input is normalised with ``text_cleaner``. For every position ``i``
    from index 5 up to, but not including, the last character, the model is
    given the prefix ``in_text[:i]`` and asked for the distribution of the
    next character. A position is flagged when the typed character is not the
    model's top choice and its probability is at most ``CORRECT_THRESHOLD``.
    Among flagged positions the one whose typed character has the lowest
    probability is kept (the earliest one on ties).

    Args:
        model: model exposing ``predict_proba`` for an encoded sequence.
        mapping: dict from character to integer code; every character of the
            cleaned text must be a key.
        input_text: raw text to check.
        num: number of replacement candidates to return (default 5).

    Returns:
        ``(results, proba_results)``. ``results`` is a list of candidate
        strings in which the flagged character is replaced by each of the
        ``num`` most probable characters, most probable first;
        ``proba_results`` holds the matching probabilities. Both are empty
        when no position is flagged.
    """
    first_checked = 5  # the first five characters are always kept as typed
    in_text = text_cleaner(input_text)

    # Derive the reverse lookup from the mapping that was passed in, so the
    # function does not depend on a notebook-level global.
    index_to_char = {index: char for char, index in mapping.items()}

    results = []
    proba_results = []
    lowest_typed_proba = 1.0

    # The last character is deliberately left out of the scan, as before.
    for i in range(first_checked, len(in_text) - 1):
        prefix = in_text[:i]
        encoded = encode_string(mapping, SEQ_LENGTH, prefix)
        proba = model.predict_proba(encoded)[0]

        typed_index = mapping[in_text[i]]
        # The arg-max of the distribution is the predicted class, so a second
        # forward pass through predict_classes is not needed.
        if int(np.argmax(proba)) == typed_index:
            continue

        typed_proba = proba[typed_index]
        if typed_proba > CORRECT_THRESHOLD:
            # Plausible enough: keep the character as typed.
            continue
        if not (typed_proba < lowest_typed_proba):
            # An earlier position is at least as suspicious.
            continue

        lowest_typed_proba = typed_proba
        # Pick the ``num`` most probable characters, then order them from
        # most to least probable.
        top_indices = np.argpartition(proba, -num)[-num:]
        top_indices = top_indices[np.argsort(proba[top_indices])][::-1]

        proba_results = proba[top_indices]
        results = [prefix + index_to_char[idx] + in_text[i + 1:] for idx in top_indices]

    return results, proba_results

In [None]:
# Interactive demo: read a sentence, print the correction candidates returned
# by correct_one_mistake_multi_choice_with_proba together with their
# probabilities, and repeat until the user types 'Exit'.
while True:
    inputText = input('Input: ')
    if (inputText == 'Exit'): break
    # results: candidate sentences; c: the probability of each candidate.
    results, c = correct_one_mistake_multi_choice_with_proba(model, mapping, inputText)
    for idx, out in enumerate(results):
      print(f"output {idx+1}: {out} -- with probality: {c[idx]}")

Input: trên cánh cây chim hòt
output 1: trên cánh cây chim hót -- with probality: 0.5316782593727112
output 2: trên cánh cây chim hút -- with probality: 0.15933409333229065
output 3: trên cánh cây chim hảt -- with probality: 0.06863003969192505
output 4: trên cánh cây chim hát -- with probality: 0.06276942789554596
output 5: trên cánh cây chim hàt -- with probality: 0.03417752683162689
Input: chiếu cà mau cằm
output 1: chiếu cà mau chm -- with probality: 0.355904221534729
output 2: chiếu cà mau cúm -- with probality: 0.11574748158454895
output 3: chiếu cà mau cóm -- with probality: 0.10173878818750381
output 4: chiếu cà mau cám -- with probality: 0.06571568548679352
output 5: chiếu cà mau cưm -- with probality: 0.04660026356577873
Input: chiếc chiếu mời
output 1: chiếc chiến mời -- with probality: 0.9097655415534973
output 2: chiếc chiếc mời -- with probality: 0.08846404403448105
output 3: chiếc chiếm mời -- with probality: 0.0013353683752939105
output 4: chiếc chiếu mời -- with probal