# Text Correction Model

## Check if i am using the gpu

In [1]:
from tensorflow.test import is_gpu_available

In [2]:
is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

## Load the data

In [3]:
from datasets import load_dataset

dataset = load_dataset("jhu-clsp/jfleg")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 755
    })
    test: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 748
    })
})

In [5]:
import pandas as pd
train_df = pd.DataFrame(dataset['validation'])
test_df = pd.DataFrame(dataset['test'])

In [6]:
train_df.head()

Unnamed: 0,sentence,corrections
0,So I think we can not live if old people could...,[So I think we would not be alive if our ances...
1,For not use car .,"[Not for use with a car . , Do not use in the ..."
2,Here was no promise of morning except that we ...,"[Here was no promise of morning , except that ..."
3,Thus even today sex is considered as the least...,"[Thus , even today , sex is considered as the ..."
4,image you salf you are wark in factory just to...,[Imagine yourself you are working in factory j...


In [7]:
wrap = train_df.iloc[0]
wrap

sentence       So I think we can not live if old people could...
corrections    [So I think we would not be alive if our ances...
Name: 0, dtype: object

In [8]:
print(f"Sentence: ", wrap[0])
print(f"Corrections:")
for i in wrap[1]:
    print(i)

Sentence:  So I think we can not live if old people could not find siences and tecnologies and they did not developped . 
Corrections:
So I think we would not be alive if our ancestors did not develop sciences and technologies . 
So I think we could not live if older people did not develop science and technologies . 
So I think we can not live if old people could not find science and technologies and they did not develop . 
So I think we can not live if old people can not find the science and technology that has not been developed . 


  print(f"Sentence: ", wrap[0])
  for i in wrap[1]:


In [9]:
train_df

Unnamed: 0,sentence,corrections
0,So I think we can not live if old people could...,[So I think we would not be alive if our ances...
1,For not use car .,"[Not for use with a car . , Do not use in the ..."
2,Here was no promise of morning except that we ...,"[Here was no promise of morning , except that ..."
3,Thus even today sex is considered as the least...,"[Thus , even today , sex is considered as the ..."
4,image you salf you are wark in factory just to...,[Imagine yourself you are working in factory j...
...,...,...
750,The government also should try to reduce the s...,[The government should also try to reduce the ...
751,Alot of memories with enogh time to remember w...,"[A lot of memories , with enough time to remem..."
752,Sceene of violence can affect on them .,[A scene of violence can have an effect on the...
753,While the communities in general have reckoned...,[The communities in general have reckoned that...


In [10]:
train_df['text'] = [' '.join([sentence, *corrections]) for sentence, corrections in zip(train_df['sentence'], train_df['corrections'])]


In [11]:
train_df

Unnamed: 0,sentence,corrections,text
0,So I think we can not live if old people could...,[So I think we would not be alive if our ances...,So I think we can not live if old people could...
1,For not use car .,"[Not for use with a car . , Do not use in the ...",For not use car . Not for use with a car . D...
2,Here was no promise of morning except that we ...,"[Here was no promise of morning , except that ...",Here was no promise of morning except that we ...
3,Thus even today sex is considered as the least...,"[Thus , even today , sex is considered as the ...",Thus even today sex is considered as the least...
4,image you salf you are wark in factory just to...,[Imagine yourself you are working in factory j...,image you salf you are wark in factory just to...
...,...,...,...
750,The government also should try to reduce the s...,[The government should also try to reduce the ...,The government also should try to reduce the s...
751,Alot of memories with enogh time to remember w...,"[A lot of memories , with enough time to remem...",Alot of memories with enogh time to remember w...
752,Sceene of violence can affect on them .,[A scene of violence can have an effect on the...,Sceene of violence can affect on them . A sce...
753,While the communities in general have reckoned...,[The communities in general have reckoned that...,While the communities in general have reckoned...


In [12]:
wrap = train_df.iloc[0]
wrap['text']

'So I think we can not live if old people could not find siences and tecnologies and they did not developped .  So I think we would not be alive if our ancestors did not develop sciences and technologies .  So I think we could not live if older people did not develop science and technologies .  So I think we can not live if old people could not find science and technologies and they did not develop .  So I think we can not live if old people can not find the science and technology that has not been developed . '

In [13]:
max_length = train_df['sentence'].str.len().max()
max_length

411

In [14]:
from tensorflow.keras.layers import TextVectorization
vocab_size = 25000
seq_len = 500

text_vectorizer = TextVectorization(
    max_tokens = vocab_size,
    output_mode = 'int',
    output_sequence_length = seq_len
)

In [15]:
text_vectorizer.adapt(train_df['text'])

In [16]:
test_wrap = test_df.iloc[0]

In [17]:
text_vectorizer(test_wrap['sentence'])

<tf.Tensor: shape=(500,), dtype=int64, numpy=
array([  67,    4,   67,  422,   61,  143, 1502,    3,    2,  172,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [18]:
import numpy as np
np.array(wrap['corrections']).shape

(4,)

In [19]:
for i in range(4): 
    train_df[f'correction_{i+1}'] = train_df['corrections'].apply(lambda x: x[i] if i < len(x) else None)


In [20]:
train_df

Unnamed: 0,sentence,corrections,text,correction_1,correction_2,correction_3,correction_4
0,So I think we can not live if old people could...,[So I think we would not be alive if our ances...,So I think we can not live if old people could...,So I think we would not be alive if our ancest...,So I think we could not live if older people d...,So I think we can not live if old people could...,So I think we can not live if old people can n...
1,For not use car .,"[Not for use with a car . , Do not use in the ...",For not use car . Not for use with a car . D...,Not for use with a car .,Do not use in the car .,Car not for use .,Can not use the car .
2,Here was no promise of morning except that we ...,"[Here was no promise of morning , except that ...",Here was no promise of morning except that we ...,"Here was no promise of morning , except that w...","Here , there was no promise of morning , excep...",Here was no promise of morning except that we ...,There was no promise of morning except when we...
3,Thus even today sex is considered as the least...,"[Thus , even today , sex is considered as the ...",Thus even today sex is considered as the least...,"Thus , even today , sex is considered as the l...","Thus , even today , sex is considered the leas...","Thus , even today , sex is considered the leas...","Thus , even today sex is considered as the lea..."
4,image you salf you are wark in factory just to...,[Imagine yourself you are working in factory j...,image you salf you are wark in factory just to...,Imagine yourself you are working in factory ju...,Imagine that you work in a factory and do just...,image you salf you are wark in factory just to...,Imagine yourself working in a factory. You are...
...,...,...,...,...,...,...,...
750,The government also should try to reduce the s...,[The government should also try to reduce the ...,The government also should try to reduce the s...,The government should also try to reduce the s...,The government should also try to reduce the s...,The government should also try to reduce the s...,The government should also try to reduce the s...
751,Alot of memories with enogh time to remember w...,"[A lot of memories , with enough time to remem...",Alot of memories with enogh time to remember w...,"A lot of memories , with enough time to rememb...",Many memories with enough time to remember wil...,"A lot of memories , with enough time to rememb...",A lot of memories with enough time to remember...
752,Sceene of violence can affect on them .,[A scene of violence can have an effect on the...,Sceene of violence can affect on them . A sce...,A scene of violence can have an effect on them .,Scenes of violence can have an affect on them .,Scenes of violence can have an effect on them .,Scenes of violence can affect them .
753,While the communities in general have reckoned...,[The communities in general have reckoned that...,While the communities in general have reckoned...,The communities in general have reckoned that ...,While the communities in general have reckoned...,While the communities in general have recogniz...,While the communities in general believe that ...


## Model 1: For one correction

In [21]:
import tensorflow as tf

In [72]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['text'])

In [74]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

3167

In [75]:
X_train = tokenizer.texts_to_sequences(train_df['sentence'])
y_train = tokenizer.texts_to_sequences(train_df['correction_1'])

In [77]:
X_train[0]

[33,
 10,
 57,
 23,
 22,
 19,
 119,
 28,
 111,
 15,
 100,
 19,
 151,
 2427,
 3,
 2428,
 3,
 11,
 228,
 19,
 2429]

In [78]:
max_length = max(max(len(seq) for seq in X_train), max(len(seq) for seq in y_train))
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
y_train = pad_sequences(y_train, maxlen=max_length, padding='post')

In [80]:
X_train[0]

array([  33,   10,   57,   23,   22,   19,  119,   28,  111,   15,  100,
         19,  151, 2427,    3, 2428,    3,   11,  228,   19, 2429,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [82]:
from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape
from tensorflow.keras.models import Model
from tensorflow import clip_by_value

inputs = Input(shape=(max_length,), dtype='int32')
x = Embedding(vocab_size, 256)(inputs)
x = GRU(128, return_sequences=True)(x)
x = Dense(vocab_size, activation = 'softmax')(x)  
model = Model(inputs, x)

In [83]:
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 76)]              0         
                                                                 
 embedding_11 (Embedding)    (None, 76, 256)           810752    
                                                                 
 gru_21 (GRU)                (None, 76, 128)           148224    
                                                                 
 dense_11 (Dense)            (None, 76, 3167)          408543    
                                                                 
Total params: 1,367,519
Trainable params: 1,367,519
Non-trainable params: 0
_________________________________________________________________


In [86]:
model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

In [88]:
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e7921fd420>

In [99]:
new_sentence = "Hello, is everything alright?"
new_sequence = tokenizer.texts_to_sequences([new_sentence])
padded_sequence = pad_sequences(new_sequence, maxlen=max_length, padding='post')
predictions = model.predict(padded_sequence)
predicted_sentence = []
for timestep in predictions[0]:
    predicted_word_index = np.argmax(timestep)
    predicted_word = tokenizer.index_word.get(predicted_word_index, "")
    predicted_sentence.append(predicted_word)

# Join the predicted words to form the corrected sentence
corrected_sentence = " ".join(predicted_sentence)

corrected_sentence



'the a                                                                          '