In [40]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding



# **Exploration**

In [25]:
df = pd.read_csv("automatic_correction_dataset.csv")
df

Unnamed: 0,Wrong,Right
0,I WANT TO THAK YOU FOR PREPARING SUCH A GOOD...,I WANT TO THANK YOU FOR PREPARING SUCH A GOO...
1,IT CONSISTS ON INVOLVES VISITING THE LONDON F...,IT IN INVOLVES VISITING THE LONDON FASHION AN...
2,"ON THE OTHER HAND , WE COULD LEARN THE DIFFERE...","ALSO , WE COULD LEARN THE DIFFERENT WAYS TO GE..."
3,I WILL BE WRITING ANXIOUSLY FOR YOUR RESPONSE.,I WILL BE WAITING ANXIOUSLY FOR YOUR RESPONSE.
4,"AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FA...","AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FA..."
...,...,...
18686,"I'm planning the event on 15 or 22 February,...",I'm planning the event for 15 or 22 February...
18687,"I had never taken birthdays serious , both ...","I had never taken birthdays seriously , neit..."
18688,"When I was 23, I joined a NGO in Denmark and...","When I was 23, I joined an NGO in Denmark an..."
18689,The party was far beyound my expectation .,The party was far beyond my expectations .


In [26]:
for val in range(10):
    print(df.loc[val, "Wrong"],"\n", df.loc[val, "Right"], "\n\n")

I WANT TO  THAK  YOU FOR PREPARING SUCH A GOOD PROGRAMME FOR US AND ESPECIALLY FOR TAKING US  TO  THE RIVER TRIP TO GREENWICH. 
 I WANT TO  THANK  YOU FOR PREPARING SUCH A GOOD PROGRAMME FOR US AND ESPECIALLY FOR TAKING US  ON  THE RIVER TRIP TO GREENWICH. 


IT  CONSISTS ON INVOLVES VISITING THE LONDON FASHION AND LEISURE SHOW  IN  THE CENTRAL EXHIBITION HALL. 
 IT  IN INVOLVES VISITING THE LONDON FASHION AND LEISURE SHOW  AT  THE CENTRAL EXHIBITION HALL. 


ON THE OTHER HAND , WE COULD LEARN THE DIFFERENT WAYS TO GET TO THE CENTRAL EXHIBITION HALL. 
 ALSO , WE COULD LEARN THE DIFFERENT WAYS TO GET TO THE CENTRAL EXHIBITION HALL. 


I WILL BE  WRITING  ANXIOUSLY FOR YOUR RESPONSE. 
 I WILL BE  WAITING  ANXIOUSLY FOR YOUR RESPONSE. 


AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FANS WANT TO KNOW HOW THEY  ACT  IN THEIR PRIVATE LIVES, WITH THEIR FAMILY OR FRIENDS. 
 AS SOME OF THEM ARE CONSIDERED IDOLS, THEIR FANS WANT TO KNOW HOW THEY  BEHAVE  IN THEIR PRIVATE LIVES, WITH THEIR FAMILY 

# **Cleaning**

In [27]:
def cleaner(df_to_clean):
    df_to_clean['Wrong'] = df_to_clean['Wrong'].str.lower()
    df_to_clean['Right'] = df_to_clean['Right'].str.lower()
    df_to_clean['Wrong'] = df_to_clean['Wrong'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
    df_to_clean['Right'] = df_to_clean['Right'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
    return df_to_clean


In [28]:
df_cleaned = cleaner(df)
for val in range(10):
    print(df_cleaned.loc[val, "Wrong"],"\n", df_cleaned.loc[val, "Right"], "\n\n")

i want to thak you for preparing such a good programme for us and especially for taking us to the river trip to greenwich. 
 i want to thank you for preparing such a good programme for us and especially for taking us on the river trip to greenwich. 


it consists on involves visiting the london fashion and leisure show in the central exhibition hall. 
 it in involves visiting the london fashion and leisure show at the central exhibition hall. 


on the other hand , we could learn the different ways to get to the central exhibition hall. 
 also , we could learn the different ways to get to the central exhibition hall. 


i will be writing anxiously for your response. 
 i will be waiting anxiously for your response. 


as some of them are considered idols, their fans want to know how they act in their private lives, with their family or friends. 
 as some of them are considered idols, their fans want to know how they behave in their private lives, with their family or friends. 


as a re

In [29]:
df_cleaned

Unnamed: 0,Wrong,Right
0,i want to thak you for preparing such a good p...,i want to thank you for preparing such a good ...
1,it consists on involves visiting the london fa...,it in involves visiting the london fashion and...
2,"on the other hand , we could learn the differe...","also , we could learn the different ways to ge..."
3,i will be writing anxiously for your response.,i will be waiting anxiously for your response.
4,"as some of them are considered idols, their fa...","as some of them are considered idols, their fa..."
...,...,...
18686,"i'm planning the event on 15 or 22 february, f...","i'm planning the event for 15 or 22 february, ..."
18687,"i had never taken birthdays serious , both my ...","i had never taken birthdays seriously , neithe..."
18688,"when i was 23, i joined a ngo in denmark and s...","when i was 23, i joined an ngo in denmark and ..."
18689,the party was far beyound my expectation .,the party was far beyond my expectations .


# **Tockenization**

In [30]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_cleaned['Wrong'].tolist() + df_cleaned['Right'].tolist())

In [31]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'in': 6,
 'of': 7,
 'you': 8,
 'it': 9,
 'that': 10,
 'was': 11,
 'for': 12,
 'is': 13,
 'have': 14,
 'my': 15,
 'we': 16,
 'be': 17,
 'because': 18,
 'at': 19,
 'would': 20,
 'but': 21,
 'me': 22,
 'on': 23,
 'with': 24,
 'like': 25,
 'your': 26,
 'are': 27,
 'will': 28,
 'this': 29,
 'very': 30,
 'not': 31,
 'about': 32,
 'all': 33,
 'as': 34,
 'there': 35,
 'so': 36,
 'can': 37,
 'had': 38,
 'show': 39,
 'they': 40,
 'am': 41,
 'if': 42,
 'when': 43,
 'time': 44,
 'people': 45,
 'some': 46,
 'our': 47,
 'do': 48,
 'from': 49,
 'were': 50,
 'which': 51,
 'or': 52,
 'more': 53,
 'think': 54,
 'go': 55,
 'one': 56,
 'what': 57,
 'could': 58,
 'money': 59,
 'good': 60,
 'also': 61,
 'know': 62,
 'life': 63,
 'first': 64,
 'an': 65,
 'only': 66,
 'them': 67,
 'shopping': 68,
 'how': 69,
 'by': 70,
 'other': 71,
 'clothes': 72,
 'been': 73,
 'she': 74,
 'really': 75,
 'he': 76,
 "i'm": 77,
 'us': 78,
 'want': 79,
 'after': 80,
 'should': 

In [32]:
wrong_seq = tokenizer.texts_to_sequences(df_cleaned['Wrong'])
right_seq = tokenizer.texts_to_sequences(df_cleaned['Right'])

In [33]:
len(wrong_seq)

18691

In [34]:
wrong_seq

[[2,
  79,
  3,
  8740,
  8,
  12,
  1009,
  191,
  5,
  60,
  239,
  12,
  78,
  4,
  247,
  12,
  540,
  78,
  3,
  1,
  741,
  264,
  3,
  921],
 [9, 2798, 23, 2661, 654, 1, 119, 186, 4, 232, 39, 6, 1, 710, 528, 432],
 [23, 1, 71, 249, 16, 58, 385, 1, 115, 661, 3, 97, 3, 1, 710, 528, 432],
 [2, 28, 17, 111, 4913, 12, 26, 1414],
 [34,
  46,
  7,
  67,
  27,
  1436,
  5606,
  91,
  1045,
  79,
  3,
  62,
  69,
  40,
  1289,
  6,
  91,
  487,
  337,
  24,
  91,
  310,
  52,
  151],
 [34,
  5,
  1414,
  3,
  29,
  1437,
  7,
  128,
  647,
  847,
  216,
  45,
  101,
  4,
  304,
  6,
  113,
  115,
  661,
  12,
  241,
  3199,
  91,
  693,
  540,
  1057,
  52,
  4129,
  284,
  91,
  1070,
  336,
  40,
  27,
  23,
  402],
 [29, 630, 1207, 216, 682, 337, 18, 40, 27, 31, 515, 3, 14, 98, 1122],
 [1,
  109,
  176,
  125,
  13,
  10,
  306,
  40,
  27,
  216,
  40,
  27,
  107,
  922,
  45,
  10,
  1230,
  3,
  14,
  5,
  487,
  63,
  171,
  647,
  4,
  1045,
  631,
  67,
  33,
  1,
  44,
  4,
  

In [35]:
MAX_LEN = max(max(len(x) for x in wrong_seq), max(len(x) for x in right_seq))
wrong_padded = pad_sequences(wrong_seq, padding='post', maxlen=MAX_LEN)
right_padded = pad_sequences(right_seq, padding='post', maxlen=MAX_LEN)

In [36]:
wrong_padded.shape

(18691, 98)

# **Splitting**

In [38]:
X_train, X_test, y_train, y_test = train_test_split(wrong_padded, right_padded, test_size=0.2, random_state=42)

In [39]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14952, 98), (3739, 98), (14952, 98), (3739, 98))

# **Training**

In [41]:
vocab_size = len(tokenizer.word_index) + 1
max_len = X_train.shape[1]
vocab_size, max_len

(13193, 98)

In [42]:
# Taille des paramètres
embedding_dim = 128
latent_dim = 256

In [43]:
# === ENCODEUR ===
encoder_inputs = Input(shape=(max_len,))
enc_emb = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]





In [44]:
# === DECODEUR ===
decoder_inputs = Input(shape=(max_len,))
dec_emb_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [45]:
# === MODELE FINAL ===
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 98)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 98)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 98, 128)              1688704   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 98, 128)              1688704   ['input_2[0][0]']             
                                                                                             

In [46]:
model.fit(
    [X_train, y_train],
    y_train,
    batch_size=64,
    epochs=20,
    validation_split=0.2,
    verbose = 2
)


Epoch 1/20


187/187 - 431s - loss: 6.4320 - accuracy: 0.0604 - val_loss: 5.6484 - val_accuracy: 0.0590 - 431s/epoch - 2s/step
Epoch 2/20
187/187 - 414s - loss: 4.9203 - accuracy: 0.1658 - val_loss: 4.1411 - val_accuracy: 0.3616 - 414s/epoch - 2s/step
Epoch 3/20
187/187 - 384s - loss: 3.2912 - accuracy: 0.5262 - val_loss: 2.7779 - val_accuracy: 0.6266 - 384s/epoch - 2s/step
Epoch 4/20
187/187 - 394s - loss: 2.2650 - accuracy: 0.7072 - val_loss: 2.0371 - val_accuracy: 0.7501 - 394s/epoch - 2s/step
Epoch 5/20
187/187 - 398s - loss: 1.6971 - accuracy: 0.7872 - val_loss: 1.6219 - val_accuracy: 0.8036 - 398s/epoch - 2s/step
Epoch 6/20
187/187 - 402s - loss: 1.3615 - accuracy: 0.8276 - val_loss: 1.3547 - val_accuracy: 0.8416 - 402s/epoch - 2s/step
Epoch 7/20
187/187 - 410s - loss: 1.1202 - accuracy: 0.8633 - val_loss: 1.1431 - val_accuracy: 0.8738 - 410s/epoch - 2s/step
Epoch 8/20
187/187 - 413s - loss: 0.9188 - accuracy: 0.8934 - val_loss: 0.9653 - val_accuracy: 0.9031 - 413s/epoch - 2s/ste

<keras.src.callbacks.History at 0x1967f222990>

In [47]:
X_train

array([[   2,   33,  441, ...,    0,    0,    0],
       [   1,  235,   31, ...,    0,    0,    0],
       [  80,   10,    2, ...,    0,    0,    0],
       ...,
       [  26,  262,    0, ...,    0,    0,    0],
       [ 271, 2827,   83, ...,    0,    0,    0],
       [3122,    2,   20, ...,    0,    0,    0]])

In [None]:
X_train

In [81]:
pred = model.predict([X_train[:1], y_train[:1]])



In [62]:
import numpy as np

In [63]:
pred = model.predict([X_test[0], np.zeros((1, max_len))])

ValueError: Data cardinality is ambiguous:
  x sizes: 98, 1
Make sure all arrays contain the same number of samples.

In [69]:
sentence = "i want to thak you for preparing such a good presentation"

In [79]:
seq = tokenizer.texts_to_sequences([sentence])
padded_seq = pad_sequences(seq, maxlen=max_len, padding='post')

pred = model.predict([padded_seq, np.zeros((1, max_len))])




In [73]:
padded_seq

array([[   2,   79,    3, 8740,    8,   12, 1009,  191,    5,   60, 2545,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [77]:
decode_sequence(padded_seq[0])

'i want to thak you for preparing such a good presentation'

In [80]:
np.argmax(pred[0], axis=-1)

array([920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
       920, 920, 920, 920, 920, 920, 920], dtype=int64)

In [71]:
pred_indices = np.argmax(pred[0], axis=-1)

In [72]:
predicted_sentence = " ".join(tokenizer.index_word.get(i, "") for i in pred_indices if i != 0)
print(predicted_sentence)

waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste waste


In [66]:
pred

array([[[7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        ...,
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05],
        [7.0848240e-05, 6.8962945e-05, 7.1865303e-05, ...,
         8.0895559e-05, 8.2913939e-05, 7.5476943e-05]]], dtype=float32)

In [57]:
X_test[1]

array([  1, 138,   6,  15, 228,  14, 359,  65,  84,  12,   1, 119, 186,
         4, 232,  39,   4,  16,  20,  25,   3, 161,   8,  42,   9,  13,
       230,   3,  94,  46, 705,   6,   1, 239,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])

In [49]:
pred

array([[[2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        ...,
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08],
        [2.8436728e-10, 7.5260746e-06, 2.4960020e-06, ...,
         1.7173313e-08, 4.3347072e-08, 3.2653368e-08]]], dtype=float32)

In [50]:
reverse_word_index = {v: k for k, v in tokenizer.word_index.items()}

def decode_sequence(seq):
    return ' '.join([reverse_word_index.get(idx, '') for idx in seq if idx != 0])


In [82]:
print("Wrong:", decode_sequence(X_train[0]))
print("Predicted correction:", decode_sequence(pred.argmax(axis=-1)[0]))
print("Right:", decode_sequence(y_train[0]))

Wrong: i all happened a year ago when pat and i were still best friends and used to tell each other everything
Predicted correction: it all happened a year ago when pat and i were still best friends and used to tell each other everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything everything eve

In [17]:
df_cleaned_2 = pd.DataFrame({"Wrong": wrong_padded, "Right": right_padded})
df_cleaned_2

ValueError: Per-column arrays must each be 1-dimensional