# Definición del problema

Se desea entrenar un modelo que sea capaz de 'completar' una frase a medio escribir mediante generación del siguiente caracter en la secuencia.

El sistema será capaz de realizar estas funciones en Español.
Será entrenado a partir de una versión reducida del Principito, donde se eliminaron los acentos y se sustituyó 'ñ' por 'n' para reducir el vocabulario.

El modelo deberá inferir el siguiente caracter a partir de estudiar la distribución de probabilidad presente en el texto.

# Estructura del modelo

Se utilizará un modelo con una capa LSTM de 256 neuronas y Dropout de 0.2; seguida de una capa densa que permita analizar el resultado y generar una distribución de probabilidad del caracter siguiente a través de activación Softmax

Será optimizada mediante entropía cruzada como función de pérdida y rmsprop como algoritmo de optimización.

# Importacion de Librerias

In [7]:
import numpy as np

import keras as kr
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

# Datos de entrada

In [8]:
# Extract text from Spanish book converted into txt format
txt_file = "sanitized_text_dump.txt"

lines: list[str] = []
with open(txt_file, 'r', encoding="UTF-8") as file:
  for line in file:
    if line != "\n": #Do not include empty lines in text digest
      lines.append(line)

#As of this point, 'lines' variable should hold the full length of the txt file
print("Total line count:", len(lines))
print("Total character count:", sum([len(item) for item in lines]))

Total line count: 1
Total character count: 74156


In [9]:
# Extract used vocabulary and construct encode/decode dictionaries
full_txt_str: str = ""
vocab: list[str] = []
acc = 0 #Used to print partial progress of the operation
big_acc = 1 #Same as above
for paragraph in lines: #Do not consider index, acknowledgements, appendix, etc etc (i.e. only consider main story block for training)
  full_txt_str += paragraph
  for char in paragraph:
    if char not in vocab:
      vocab.append(char)
  #Print partial progress
  acc += 1
  if (acc >= (14346-31)*0.2):
    print(f"Vocabulary {20*big_acc}% built")
    acc = 0
    big_acc += 1
vocab.sort()
vocab_size = len(vocab)

encode_keys:dict[str,int] = {}
decode_keys: dict[int, str] = {}
for index, char in enumerate(vocab):
  encode_keys[char] = index
  decode_keys[index] = char

#As of this point, the full vocabulary should be indexed
print("Total unique characters:", vocab_size)
print("Total encode/decode dictionary keys:", len(encode_keys.keys()),"|", len(decode_keys.keys()))
print("Key list\n",encode_keys.keys())

Total unique characters: 25
Total encode/decode dictionary keys: 25 | 25
Key list
 dict_keys([' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z'])


In [10]:
# Construct input/output data for LSTM network
seq_length = 50
input_seqs: list[list[float]] = []
output_seqs: list[float] = []
acc = 0 #Used to print partial progress of the operation
big_acc = 1 #Same as above

current:list[float] = []
for char in full_txt_str:
  if (len(current) >= seq_length):
    input_seqs.append(current.copy())
    output_seqs.append(encode_keys[char])
    current = current[1:] #Remove first element
  current.append(encode_keys[char])
  #Print partial progress
  acc += 1
  if (acc >= len(full_txt_str)*0.2):
    print(f"IO data {20*big_acc}% built")
    acc = 0
    big_acc += 1

#Normalize input sequences
normal_input_seqs: list[list[float]] = []
for row in input_seqs:
  normal_input_seqs.append([item/(vocab_size - 1) for item in row])
print("Data normalization done")

#Construct arrays from data used
train_data: np.ndarray = np.array([np.array(row) for row in normal_input_seqs])
train_tags: np.ndarray = np.array([np.zeros(vocab_size) for item in output_seqs])
for index, value in enumerate(output_seqs):
  train_tags[index][value] = 1.0

#As of this point, the full set of training data has been built
print("Train shape:", train_data.shape)
print("Tags shape:", train_tags.shape)
print("First 20 steps")
acc = 0
for data,tag in zip(train_data, train_tags):
  print(acc+1)
  print(data)
  print(tag)
  acc += 1
  if (acc >= 20):
    break

IO data 20% built
IO data 40% built
IO data 60% built
IO data 80% built
Data normalization done
Train shape: (74106, 50)
Tags shape: (74106, 25)
First 20 steps
1
[0.125      0.83333333 0.04166667 0.54166667 0.16666667 0.58333333
 0.         0.20833333 0.70833333 0.04166667 0.         0.54166667
 0.375      0.54166667 0.58333333 0.         0.         0.125
 0.83333333 0.04166667 0.54166667 0.16666667 0.58333333 0.
 0.95833333 0.58333333 0.         0.79166667 0.20833333 0.54166667
 0.375      0.04166667 0.         0.75       0.20833333 0.375
 0.75       0.         0.04166667 0.54166667 0.58333333 0.75
 0.         0.875      0.375      0.         0.20833333 0.54166667
 0.         0.83333333]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
2
[0.83333333 0.04166667 0.54166667 0.16666667 0.58333333 0.
 0.20833333 0.70833333 0.04166667 0.         0.54166667 0.375
 0.54166667 0.58333333 0.         0.         0.125      0.83333333
 0.04166667 0.54166667 0.16666667 

# Construcción del modelo

In [11]:
filename = "savestate.keras"
isNew = True

vect_model = None
if (isNew):
  vect_model = Sequential()
  vect_model.add(kr.Input(shape=(seq_length,1)))
  vect_model.add(LSTM(256)) #Add dropout?
  vect_model.add(Dropout(0.2))
  vect_model.add(Dense(vocab_size, activation="softmax"))
  vect_model.compile(loss="categorical_crossentropy",optimizer="rmsprop")
else:
  kr.models.load_model(filename)

# Entrenamiento del modelo

In [None]:
#Initialize epoch count for savestates' filenames
epoch_ct:int = 0

In [28]:
#Master flag to define if the model should be trained
doTrain: False
train_length:int = 200 #Epoch duration of the training

if (doTrain):
  predict = vect_model.fit(train_data, train_tags, batch_size = 64,epochs = train_length)
  epoch_ct += train_length
  state_filename = "sanitized_state_" + str(epoch_ct) + "_28Nov.keras"
  vect_model.save(state_filename)

Epoch 1/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 0.1358
Epoch 2/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - loss: 0.1366
Epoch 3/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 0.1354
Epoch 4/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - loss: 0.1352
Epoch 5/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.1336
Epoch 6/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - loss: 0.1375
Epoch 7/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 0.1413
Epoch 8/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.1362
Epoch 9/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.1364
Epoch 10/200
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

# Evaluación

In [38]:
seed = "yo no debia hacerle caso me confeso un dia el principito nunca hay que hacer caso a las flores basta con mirarlas y ole"
result = seed
seed_data = [encode_keys[char]/(vocab_size - 1) for char in seed]
for i in range(20):
  predict_data = np.reshape(np.array(seed_data), (1, len(seed_data), 1))
  prediction = vect_model.predict(predict_data)

  index = np.argmax(prediction)
  next_char = decode_keys[index]
  result += next_char
  print("NEXT",index,next_char,sep="|")

  seed_data.append(encode_keys[char]/(vocab_size - 1))
  if (len(seed_data) > seq_length):
    seed_data = seed_data[1:] #Remove first element to move window along sequence

  if ( (next_char == " ")or(next_char == "\n") ):
    break

print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
NEXT|21|v
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
NEXT|9|i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
NEXT|12|m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
NEXT|21|v
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
NEXT|11|l
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
NEXT|20|u
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
NEXT|19|t
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
NEXT|20|u
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
NEXT|16|q
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
NEXT|17|r
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
NEXT|17|r
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
NEXT|5|e
[1m1/1[0m [32m━