# Definición del problema

Se desea entrenar un modelo que sea capaz de 'completar' una palabra a medio escribir, o proponer una corrección para una palabra ya escrita en caso de que la misma se encuentre mal escrita.

Se utilizará un algoritmo de 'hallar la palabra incorrecta' para determinar si una palabra está escrita incorrectamente, de acuerdo con un lexicón construido con palabras extraídas de la página web de la RAE (disponible en https://github.com/JorgeDuenasLerin/diccionario-espanol-txt, actualizado en mayo de 2024).

Además, se construirá una matriz de probabilidad con las palabras extraídas para que las recomendaciones de completado y corrección se realicen en función de la frecuencia de utilización de las palabras. El sistema será capaz de realizar estas funciones en español.

Para entrenar el modelo se utilizarán textos en español; en este cuaderno, un libro convertido a formato txt.

# Estructura del modelo

# Importación de librerías

In [1]:
import numpy as np

import keras as kr
from keras.models import Sequential
from keras.layers import Dense, LSTM

# Datos de entrada

In [3]:
# Read the Spanish book (previously converted to txt format) into memory
txt_file = "text_dump.txt"

with open(txt_file, 'r', encoding="UTF-8") as file:
  # Lines consisting only of a newline are left out of the text digest
  lines: list[str] = [line for line in file if line != "\n"]

# From here on, 'lines' holds every non-empty line of the txt file
print("Total line count:", len(lines))
print("Total character count:", sum(map(len, lines)))

Total line count: 14377
Total character count: 2308761


In [4]:
# Extract used vocabulary and construct encode/decode dictionaries

# Only the main story block is used for training: the index, acknowledgements,
# appendix, etc. fall outside this slice of 'lines'.
STORY_START = 31
STORY_END = 14346
story_lines = lines[STORY_START:STORY_END]

# Join once instead of growing the string paragraph by paragraph
full_txt_str: str = "".join(story_lines)

# Unique characters are gathered in a set so each membership check is
# constant-time instead of a scan over a list.
unique_chars: set[str] = set()
acc = 0 #Paragraphs processed so far (used to print partial progress)
big_acc = 1 #Next 20% progress mark to report
for paragraph in story_lines:
  unique_chars.update(paragraph)
  #Print partial progress. Integer comparison against the real slice length,
  #so the marks stay correct even if 'lines' is shorter than STORY_END.
  acc += 1
  while big_acc <= 5 and acc * 5 >= len(story_lines) * big_acc:
    print(f"Vocabulary {20*big_acc}% built")
    big_acc += 1
vocab: list[str] = sorted(unique_chars)
vocab_size = len(vocab)

# character -> index and index -> character lookups over the sorted vocabulary
encode_keys: dict[str,int] = {char: index for index, char in enumerate(vocab)}
decode_keys: dict[int, str] = dict(enumerate(vocab))

#As of this point, the full vocabulary should be indexed
print("Total unique characters:", vocab_size)
print("Total encode/decode dictionary keys:", len(encode_keys.keys()),"|", len(decode_keys.keys()))
print("Key list\n",encode_keys.keys())

Vocabulary 20% built
Vocabulary 40% built
Vocabulary 60% built
Vocabulary 80% built
Vocabulary 100% built
Total unique characters: 97
Total encode/decode dictionary keys: 97 | 97
Key list
 dict_keys(['\n', ' ', '!', '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '«', '»', '¿', 'Á', 'É', 'Í', 'Ñ', 'Ó', 'Ú', 'á', 'é', 'í', 'ñ', 'ó', 'ö', 'ú', 'ü', '—', '’', '“', '”', '…'])


In [6]:
# Construct input/output data for LSTM network
seq_length = 5
input_seqs: list[list[int]] = [] #Sliding windows of seq_length encoded characters
output_seqs: list[int] = [] #Encoded character that follows each window
total_chars = len(full_txt_str)
next_mark = 1 #Next 20% progress mark to report

current: list[int] = []
for position, char in enumerate(full_txt_str, start=1):
  code = encode_keys[char]
  if (len(current) >= seq_length):
    # 'current' is rebound to a fresh list right below, so the stored window
    # is never mutated afterwards and needs no defensive copy.
    input_seqs.append(current)
    output_seqs.append(code)
    current = current[1:] #Remove first element
  current.append(code)
  #Print partial progress. Integer comparison on the absolute position, so
  #the final 100% mark is always reached on the last character.
  while next_mark <= 5 and position * 5 >= total_chars * next_mark:
    print(f"IO data {20*next_mark}% built")
    next_mark += 1

#Normalize input sequences: character indices are scaled by (vocab_size - 1)
normal_input_seqs: list[list[float]] = [
  [item/(vocab_size - 1) for item in row] for row in input_seqs
]
print("Data normalization done")

#Construct arrays from data used
train_data: np.ndarray = np.array(normal_input_seqs)
# One-hot targets: allocate the whole matrix once and set one entry per row
train_tags: np.ndarray = np.zeros((len(output_seqs), vocab_size))
train_tags[np.arange(len(output_seqs)), np.asarray(output_seqs, dtype=int)] = 1.0

#As of this point, the full set of training data has been built
print("Train shape:", train_data.shape)
print("Tags shape:", train_tags.shape)
print("First 20 steps")
for step, (data, tag) in enumerate(zip(train_data[:20], train_tags[:20]), start=1):
  print(step)
  print(data)
  print(tag)

IO data 20% built
IO data 40% built
IO data 60% built
IO data 80% built
Data normalization done
Train shape: (2294626, 5)
Tags shape: (2294626, 97)
First 20 steps
1
[0.33333333 0.5        0.61458333 0.5        0.60416667]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
2
[0.5        0.61458333 0.5        0.60416667 0.01041667]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
3
[0.61458333 0.5        0.60416667 0.01041667 0.67708333]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

# Construcción del modelo

In [13]:
# Next-character predictor: the input is a window of seq_length timesteps with
# one value per step, and the softmax output has one unit per vocabulary entry.
vect_model = Sequential([
  kr.Input(shape=(seq_length, 1)),
  LSTM(10), #Add dropout?
  Dense(vocab_size, activation="softmax"),
])

vect_model.compile(
  optimizer="rmsprop",
  loss="categorical_crossentropy",
  metrics=["accuracy"],
)

# Entrenamiento del modelo

In [14]:
# Train on the full set of windows. The call stays as the cell's last
# expression so the returned History object is shown as the cell output.
vect_model.fit(
  x=train_data,
  y=train_tags,
  batch_size=1024,
  epochs=10,
)

Epoch 1/10
[1m2241/2241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.1415 - loss: 3.3605
Epoch 2/10
[1m2241/2241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.1733 - loss: 2.9899
Epoch 3/10
[1m2241/2241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - accuracy: 0.1837 - loss: 2.8962
Epoch 4/10
[1m2241/2241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.1873 - loss: 2.8587
Epoch 5/10
[1m2241/2241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.1933 - loss: 2.8347
Epoch 6/10
[1m2241/2241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.1966 - loss: 2.8183
Epoch 7/10
[1m2241/2241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.1985 - loss: 2.8057
Epoch 8/10
[1m2241/2241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.1988 - loss: 2.7949
Epoch 9/10
[1m22

<keras.src.callbacks.history.History at 0x7f06ce867370>

# Evaluación

In [26]:
# Complete the word started by 'seed', one predicted character at a time,
# stopping at the first word boundary (space or newline) or after 20 steps.
seed = "l"
result = seed
# Encode the seed the same way as the training inputs and keep at most the
# last seq_length characters, the window size used to build the training data.
# NOTE(review): a seed shorter than seq_length is fed as a shorter sequence
# than the model's declared input length — confirm this is acceptable or pad.
seed_data = [encode_keys[char]/(vocab_size - 1) for char in seed][-seq_length:]
for i in range(20):
  predict_data = np.reshape(np.array(seed_data), (1, len(seed_data), 1))
  prediction = vect_model.predict(predict_data)

  # Greedy decoding: take the most probable character
  index = int(np.argmax(prediction))
  next_char = decode_keys[index]
  result += next_char
  print("NEXT",index,next_char,sep="|")

  # Feed back the character that was just predicted (not a stale variable
  # from an earlier cell), then keep only the most recent seq_length items so
  # the context grows up to the trained window size and slides from there on.
  seed_data.append(encode_keys[next_char]/(vocab_size - 1))
  seed_data = seed_data[-seq_length:]

  if next_char in (" ", "\n"):
    break

print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
NEXT|1| 
l 
