In [16]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding



# **Exploration**

In [17]:
df = pd.read_csv("automatic_correction_dataset.csv")
df

Unnamed: 0,Wrong,Right
0,I WANT TO THAK YOU FOR PREPARING SUCH A GOOD...,I WANT TO THANK YOU FOR PREPARING SUCH A GOO...
1,IT CONSISTS ON INVOLVES VISITING THE LONDON F...,IT IN INVOLVES VISITING THE LONDON FASHION AN...
2,"ON THE OTHER HAND , WE COULD LEARN THE DIFFERE...","ALSO , WE COULD LEARN THE DIFFERENT WAYS TO GE..."
3,I WILL BE WRITING ANXIOUSLY FOR YOUR RESPONSE.,I WILL BE WAITING ANXIOUSLY FOR YOUR RESPONSE.
4,"AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FA...","AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FA..."
...,...,...
18686,"I'm planning the event on 15 or 22 February,...",I'm planning the event for 15 or 22 February...
18687,"I had never taken birthdays serious , both ...","I had never taken birthdays seriously , neit..."
18688,"When I was 23, I joined a NGO in Denmark and...","When I was 23, I joined an NGO in Denmark an..."
18689,The party was far beyound my expectation .,The party was far beyond my expectations .


In [18]:
for val in range(10):
    print(df.loc[val, "Wrong"],"\n", df.loc[val, "Right"], "\n\n")

I WANT TO  THAK  YOU FOR PREPARING SUCH A GOOD PROGRAMME FOR US AND ESPECIALLY FOR TAKING US  TO  THE RIVER TRIP TO GREENWICH. 
 I WANT TO  THANK  YOU FOR PREPARING SUCH A GOOD PROGRAMME FOR US AND ESPECIALLY FOR TAKING US  ON  THE RIVER TRIP TO GREENWICH. 


IT  CONSISTS ON INVOLVES VISITING THE LONDON FASHION AND LEISURE SHOW  IN  THE CENTRAL EXHIBITION HALL. 
 IT  IN INVOLVES VISITING THE LONDON FASHION AND LEISURE SHOW  AT  THE CENTRAL EXHIBITION HALL. 


ON THE OTHER HAND , WE COULD LEARN THE DIFFERENT WAYS TO GET TO THE CENTRAL EXHIBITION HALL. 
 ALSO , WE COULD LEARN THE DIFFERENT WAYS TO GET TO THE CENTRAL EXHIBITION HALL. 


I WILL BE  WRITING  ANXIOUSLY FOR YOUR RESPONSE. 
 I WILL BE  WAITING  ANXIOUSLY FOR YOUR RESPONSE. 


AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FANS WANT TO KNOW HOW THEY  ACT  IN THEIR PRIVATE LIVES, WITH THEIR FAMILY OR FRIENDS. 
 AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FANS WANT TO KNOW HOW THEY  BEHAVE  IN THEIR PRIVATE LIVES, WITH THEIR FAMILY 

# **Cleaning**

In [19]:
def cleaner(df_to_clean):
    df_to_clean['Wrong'] = df_to_clean['Wrong'].str.lower()
    df_to_clean['Right'] = df_to_clean['Right'].str.lower()
    df_to_clean['Wrong'] = df_to_clean['Wrong'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
    df_to_clean['Right'] = df_to_clean['Right'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
    return df_to_clean


In [20]:
df_cleaned = cleaner(df)
for val in range(10):
    print(df_cleaned.loc[val, "Wrong"],"\n", df_cleaned.loc[val, "Right"], "\n\n")

i want to thak you for preparing such a good programme for us and especially for taking us to the river trip to greenwich. 
 i want to thank you for preparing such a good programme for us and especially for taking us on the river trip to greenwich. 


it consists on involves visiting the london fashion and leisure show in the central exhibition hall. 
 it in involves visiting the london fashion and leisure show at the central exhibition hall. 


on the other hand , we could learn the different ways to get to the central exhibition hall. 
 also , we could learn the different ways to get to the central exhibition hall. 


i will be writing anxiously for your response. 
 i will be waiting anxiously for your response. 


as some of them are considered idols, their fans want to know how they act in their private lives, with their family or friends. 
 as some of them are considered idols, their fans want to know how they behave in their private lives, with their family or friends. 


as a re

In [21]:
df_cleaned

Unnamed: 0,Wrong,Right
0,i want to thak you for preparing such a good p...,i want to thank you for preparing such a good ...
1,it consists on involves visiting the london fa...,it in involves visiting the london fashion and...
2,"on the other hand , we could learn the differe...","also , we could learn the different ways to ge..."
3,i will be writing anxiously for your response.,i will be waiting anxiously for your response.
4,"as some of them are considered idols, their fa...","as some of them are considered idols, their fa..."
...,...,...
18686,"i'm planning the event on 15 or 22 february, f...","i'm planning the event for 15 or 22 february, ..."
18687,"i had never taken birthdays serious , both my ...","i had never taken birthdays seriously , neithe..."
18688,"when i was 23, i joined a ngo in denmark and s...","when i was 23, i joined an ngo in denmark and ..."
18689,the party was far beyound my expectation .,the party was far beyond my expectations .


# **Tockenization**

In [22]:
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(['<SOS>', '<EOS>'] + df_cleaned['Wrong'].tolist() + df_cleaned['Right'].tolist())

In [23]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'in': 6,
 'of': 7,
 'you': 8,
 'it': 9,
 'that': 10,
 'was': 11,
 'is': 12,
 'for': 13,
 'have': 14,
 'my': 15,
 '.': 16,
 'we': 17,
 ',': 18,
 'be': 19,
 'at': 20,
 'because': 21,
 'would': 22,
 'but': 23,
 'on': 24,
 'with': 25,
 'your': 26,
 'like': 27,
 'will': 28,
 'are': 29,
 'very': 30,
 'about': 31,
 'this': 32,
 'not': 33,
 'as': 34,
 'me': 35,
 'can': 36,
 'had': 37,
 'so': 38,
 'all': 39,
 'they': 40,
 'am': 41,
 'there': 42,
 'if': 43,
 'when': 44,
 'some': 45,
 'our': 46,
 'from': 47,
 'were': 48,
 'which': 49,
 'do': 50,
 'or': 51,
 'show': 52,
 'people': 53,
 'more': 54,
 'think': 55,
 'go': 56,
 'time': 57,
 'what': 58,
 'could': 59,
 'one': 60,
 'good': 61,
 'an': 62,
 'only': 63,
 'also': 64,
 'know': 65,
 'money': 66,
 'first': 67,
 'by': 68,
 'how': 69,
 'been': 70,
 'other': 71,
 'really': 72,
 'she': 73,
 "i'm": 74,
 'he': 75,
 'should': 76,
 'after': 77,
 'has': 78,
 'going': 79,
 'want': 80,
 'see': 81,
 'who': 

In [24]:
wrong_seq = tokenizer.texts_to_sequences(df_cleaned['Wrong'])
right_seq = tokenizer.texts_to_sequences(df_cleaned['Right'])

In [25]:
# Récupérer l'index max utilisé
max_id = max(tokenizer.word_index.values())

# Ajouter manuellement les tokens spéciaux
tokenizer.word_index['<SOS>'] = max_id + 1
tokenizer.word_index['<EOS>'] = max_id + 2

# Ajouter aussi dans index_word pour la reconstruction
tokenizer.index_word[max_id + 1] = '<SOS>'
tokenizer.index_word[max_id + 2] = '<EOS>'

In [26]:
sos = tokenizer.word_index['<SOS>']
eos = tokenizer.word_index['<EOS>']
sos, eos

(19474, 19475)

In [27]:
decoder_input = [[sos] + seq for seq in right_seq]     # <SOS> + phrase
decoder_target = [seq + [eos] for seq in right_seq]    # phrase + <EOS>

In [28]:
len(wrong_seq)

18691

In [29]:
wrong_seq

[[2,
  80,
  3,
  14447,
  8,
  13,
  1068,
  170,
  5,
  61,
  296,
  13,
  96,
  4,
  228,
  13,
  514,
  96,
  3,
  1,
  883,
  272,
  3,
  2463],
 [9, 3566, 24, 3288, 641, 1, 183, 180, 4, 209, 52, 6, 1, 732, 528, 1463],
 [24, 1, 71, 364, 18, 17, 59, 354, 1, 109, 840, 3, 88, 3, 1, 732, 528, 1463],
 [2, 28, 19, 102, 7019, 13, 26, 7796],
 [34,
  45,
  7,
  91,
  29,
  1626,
  7797,
  83,
  1276,
  80,
  3,
  65,
  69,
  40,
  1627,
  6,
  83,
  457,
  2154,
  25,
  83,
  377,
  51,
  597],
 [34,
  5,
  1656,
  3,
  32,
  1705,
  7,
  1517,
  733,
  930,
  205,
  53,
  141,
  4,
  407,
  6,
  105,
  109,
  2953,
  13,
  339,
  4169,
  83,
  1870,
  514,
  1295,
  51,
  7798,
  246,
  83,
  1518,
  311,
  40,
  29,
  24,
  458,
  16],
 [32, 772, 1333, 205, 666, 396, 21, 40, 29, 33, 483, 3, 14, 92, 1911],
 [1,
  100,
  164,
  117,
  12,
  403,
  276,
  40,
  29,
  2687,
  40,
  29,
  95,
  989,
  53,
  10,
  1353,
  3,
  14,
  5,
  457,
  86,
  152,
  733,
  4,
  1276,
  635,
  91,
  39,

In [30]:
decoder_input

[[19474,
  2,
  80,
  3,
  179,
  8,
  13,
  1068,
  170,
  5,
  61,
  296,
  13,
  96,
  4,
  228,
  13,
  514,
  96,
  24,
  1,
  883,
  272,
  3,
  2463],
 [19474, 9, 6, 3288, 641, 1, 183, 180, 4, 209, 52, 20, 1, 732, 528, 1463],
 [19474, 64, 18, 17, 59, 354, 1, 109, 840, 3, 88, 3, 1, 732, 528, 1463],
 [19474, 2, 28, 19, 459, 7019, 13, 26, 7796],
 [19474,
  34,
  45,
  7,
  91,
  29,
  1626,
  7797,
  83,
  1276,
  80,
  3,
  65,
  69,
  40,
  2626,
  6,
  83,
  457,
  2154,
  25,
  83,
  377,
  51,
  597],
 [19474,
  34,
  5,
  1656,
  3,
  32,
  1705,
  13,
  1517,
  733,
  930,
  205,
  53,
  141,
  4,
  407,
  6,
  105,
  109,
  2953,
  13,
  339,
  4169,
  83,
  1870,
  514,
  1295,
  51,
  7798,
  246,
  83,
  1518,
  311,
  40,
  29,
  24,
  224,
  16],
 [19474, 32, 772, 1333, 205, 53, 21, 40, 29, 33, 483, 3, 14, 92, 1911],
 [19474,
  1,
  100,
  164,
  117,
  12,
  403,
  276,
  40,
  29,
  2687,
  40,
  29,
  95,
  989,
  53,
  10,
  1353,
  3,
  14,
  5,
  457,
  86,
  152

In [31]:
decoder_target

[[2,
  80,
  3,
  179,
  8,
  13,
  1068,
  170,
  5,
  61,
  296,
  13,
  96,
  4,
  228,
  13,
  514,
  96,
  24,
  1,
  883,
  272,
  3,
  2463,
  19475],
 [9, 6, 3288, 641, 1, 183, 180, 4, 209, 52, 20, 1, 732, 528, 1463, 19475],
 [64, 18, 17, 59, 354, 1, 109, 840, 3, 88, 3, 1, 732, 528, 1463, 19475],
 [2, 28, 19, 459, 7019, 13, 26, 7796, 19475],
 [34,
  45,
  7,
  91,
  29,
  1626,
  7797,
  83,
  1276,
  80,
  3,
  65,
  69,
  40,
  2626,
  6,
  83,
  457,
  2154,
  25,
  83,
  377,
  51,
  597,
  19475],
 [34,
  5,
  1656,
  3,
  32,
  1705,
  13,
  1517,
  733,
  930,
  205,
  53,
  141,
  4,
  407,
  6,
  105,
  109,
  2953,
  13,
  339,
  4169,
  83,
  1870,
  514,
  1295,
  51,
  7798,
  246,
  83,
  1518,
  311,
  40,
  29,
  24,
  224,
  16,
  19475],
 [32, 772, 1333, 205, 53, 21, 40, 29, 33, 483, 3, 14, 92, 1911, 19475],
 [1,
  100,
  164,
  117,
  12,
  403,
  276,
  40,
  29,
  2687,
  40,
  29,
  95,
  989,
  53,
  10,
  1353,
  3,
  14,
  5,
  457,
  86,
  152,
  733,


In [32]:
MAX_LEN = max(
    max(len(x) for x in wrong_seq),
    max(len(x) for x in decoder_input),
    max(len(x) for x in decoder_target)
)

encoder_input_padded = pad_sequences(wrong_seq, maxlen=MAX_LEN, padding='post')
decoder_input_padded = pad_sequences(decoder_input, maxlen=MAX_LEN, padding='post')
decoder_target_padded = pad_sequences(decoder_target, maxlen=MAX_LEN, padding='post')

In [33]:
encoder_input_padded.shape

(18691, 98)

# **Splitting**

In [34]:
X_train_enc, X_test_enc, X_train_dec, X_test_dec, y_train, y_test = train_test_split(
    encoder_input_padded, decoder_input_padded, decoder_target_padded,
    test_size=0.2, random_state=42
)

In [35]:
X_train_enc.shape, X_test_enc.shape, X_train_dec.shape, X_test_dec.shape, y_train.shape, y_test.shape

((14952, 98), (3739, 98), (14952, 98), (3739, 98), (14952, 98), (3739, 98))

# **Training**

In [36]:
vocab_size = len(tokenizer.word_index) + 1
max_len = X_train_enc.shape[1]
vocab_size, max_len

(19476, 98)

In [37]:
# Taille des paramètres
embedding_dim = 128
latent_dim = 256

In [38]:
# === ENCODEUR ===
encoder_inputs = Input(shape=(max_len,))
enc_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]





In [39]:
# === DECODEUR ===
decoder_inputs = Input(shape=(max_len,))
dec_emb_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [40]:
# === MODELE FINAL ===
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 98)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 98)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 98, 128)              2492928   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 98, 128)              2492928   ['input_2[0][0]']             
                                                                                             

In [41]:
model.fit(
    [X_train_enc, X_train_dec], y_train,
    batch_size=64,
    epochs=20,
    validation_split=0.2,
    verbose = 2
)


Epoch 1/20




ResourceExhaustedError: Graph execution error:

Detected at node model/dense/Softmax defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\HP\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "c:\Users\HP\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "c:\Users\HP\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start

  File "c:\Users\HP\anaconda3\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\HP\anaconda3\Lib\asyncio\windows_events.py", line 321, in run_forever

  File "c:\Users\HP\anaconda3\Lib\asyncio\base_events.py", line 607, in run_forever

  File "c:\Users\HP\anaconda3\Lib\asyncio\base_events.py", line 1922, in _run_once

  File "c:\Users\HP\anaconda3\Lib\asyncio\events.py", line 80, in _run

  File "c:\Users\HP\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 534, in dispatch_queue

  File "c:\Users\HP\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 523, in process_one

  File "c:\Users\HP\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 429, in dispatch_shell

  File "c:\Users\HP\anaconda3\Lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request

  File "c:\Users\HP\anaconda3\Lib\site-packages\ipykernel\ipkernel.py", line 429, in do_execute

  File "c:\Users\HP\anaconda3\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\HP\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "c:\Users\HP\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "c:\Users\HP\anaconda3\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\HP\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "c:\Users\HP\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "c:\Users\HP\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\HP\AppData\Local\Temp\ipykernel_19360\2254445901.py", line 1, in <module>

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1807, in fit

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1401, in train_function

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1384, in step_function

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1373, in run_step

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 1150, in train_step

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\training.py", line 590, in __call__

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\functional.py", line 515, in call

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\functional.py", line 672, in _run_internal_graph

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\layers\core\dense.py", line 255, in call

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\activations.py", line 87, in softmax

  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\keras\src\backend.py", line 5448, in softmax

OOM when allocating tensor with shape[64,98,19476] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[{{node model/dense/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_13995]

In [47]:
X_train

array([[   2,   33,  441, ...,    0,    0,    0],
       [   1,  235,   31, ...,    0,    0,    0],
       [  80,   10,    2, ...,    0,    0,    0],
       ...,
       [  26,  262,    0, ...,    0,    0,    0],
       [ 271, 2827,   83, ...,    0,    0,    0],
       [3122,    2,   20, ...,    0,    0,    0]])

In [None]:
X_train

In [81]:
pred = model.predict([X_train[:1], y_train[:1]])



In [62]:
import numpy as np

In [63]:
pred = model.predict([X_test[0], np.zeros((1, max_len))])

ValueError: Data cardinality is ambiguous:
  x sizes: 98, 1
Make sure all arrays contain the same number of samples.

In [69]:
sentence = "i want to thak you for preparing such a good presentation"

In [79]:
seq = tokenizer.texts_to_sequences([sentence])
padded_seq = pad_sequences(seq, maxlen=max_len, padding='post')

pred = model.predict([padded_seq, np.zeros((1, max_len))])




In [73]:
padded_seq

array([[   2,   79,    3, 8740,    8,   12, 1009,  191,    5,   60, 2545,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [77]:
decode_sequence(padded_seq[0])

'i want to thak you for preparing such a good presentation'

In [80]:
np.argmax(pred[0], axis=-1)

array([920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920], dtype=int64)

In [71]:
pred_indices = np.argmax(pred[0], axis=-1)

In [72]:
predicted_sentence = " ".join(tokenizer.index_word.get(i, "") for i in pred_indices if i != 0)
print(predicted_sentence)

waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste


In [66]:
pred

array([[[7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        ...,
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05]]], dtype=float32)

In [57]:
X_test[1]

array([  1, 138,   6,  15, 228,  14, 359,  65,  84,  12,   1, 119, 186,
         4, 232,  39,   4,  16,  20,  25,   3, 161,   8,  42,   9,  13,
       230,   3,  94,  46, 705,   6,   1, 239,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])

In [49]:
pred

array([[[2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        ...,
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08]]], dtype=float32)

In [50]:
reverse_word_index = {v: k for k, v in tokenizer.word_index.items()}

def decode_sequence(seq):
    return ' '.join([reverse_word_index.get(idx, '') for idx in seq if idx != 0])


In [82]:
print("Wrong:", decode_sequence(X_train[0]))
print("Predicted correction:", decode_sequence(pred.argmax(axis=-1)[0]))
print("Right:", decode_sequence(y_train[0]))

Wrong: i all happened a year ago when pat and i were still best friends and used to tell each other everything
Predicted correction: it all happened a year ago when pat and i were still best friends and used to tell each other everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything eve

In [17]:
df_cleaned_2 = pd.DataFrame({"Wrong": wrong_padded, "Right": right_padded})
df_cleaned_2

ValueError: Per-column arrays must each be 1-dimensional