In [1]:
from datasets import load_dataset
from transformers import create_optimizer , T5TokenizerFast , DataCollatorForSeq2Seq , TFT5ForConditionalGeneration , TFAutoModelForSeq2SeqLM




In [2]:
# Identifier of the grammar-correction dataset handed to `load_dataset` in the next cell.
dataset_id = "leslyarun/c4_200m_gec_train100k_test25k"

In [3]:
# Load the dataset; later cells use its 'train' and 'test' splits.
dataset = load_dataset(dataset_id)

In [4]:
# Peek at one raw record: a dict pairing an 'input' sentence with its corrected 'output'.
dataset['train'][0]

{'input': 'Bitcoin is for $7,094 this morning, which CoinDesk says.',
 'output': 'Bitcoin goes for $7,094 this morning, according to CoinDesk.'}

In [5]:
# Checkpoint name shared by the tokenizer loaded here and the TF model loaded further down.
model_id = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_id)



In [6]:
def preprocess_function(example, max_length=128):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        example: Batch mapping with an ``'input'`` column (sentences to be
            corrected) and an ``'output'`` column (reference corrections), as
            supplied by ``Dataset.map(..., batched=True)``.
        max_length: Truncation limit passed to the tokenizer call. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch; the
        targets are handed to it through ``text_target``.
    """
    # Materialize plain lists so the tokenizer always receives list-of-str batches.
    sources = list(example['input'])
    targets = list(example['output'])
    return tokenizer(sources, text_target=targets, max_length=max_length, truncation=True)

In [7]:
# Tokenize every split in batches and drop the raw text columns so that only the
# tokenizer's outputs remain.
tokenized_dataset = dataset.map(preprocess_function, batched=True,remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
# Display the tokenized DatasetDict: split names, remaining columns and row counts.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25000
    })
})

In [9]:
# Spot-check one tokenized training row (index 1000).
tokenized_dataset['train'][1000]

{'input_ids': [6844, 3, 18, 3, 26, 51, 226, 3, 14198, 169, 53, 39, 1339, 5, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [6844, 3, 26, 51, 226, 3, 14198, 3, 18, 150, 1339, 831, 5, 1]}

In [10]:
# Load the TensorFlow seq2seq model for the same checkpoint as the tokenizer, then
# build the collator that assembles batches as TensorFlow tensors (`return_tensors='tf'`).
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_id)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model,return_tensors='tf')




All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [11]:
# Batch size kept small because the machine used for this notebook does not have enough memory.
BATCH_SIZE = 16
# NOTE(review): MAX_LENGTH is not referenced by the visible cells; the tokenization
# and generation calls hard-code 128 instead.
MAX_LENGTH = 128

In [12]:
# Batched, shuffled TensorFlow dataset over the training split; the seq2seq
# collator assembles each batch.
tf_train_dataset = tokenized_dataset['train'].to_tf_dataset(
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
  )

In [13]:
# Validation pipeline over the 'test' split: same batch size and collator as training,
# with no `shuffle` argument passed.
valid_dataset = tokenized_dataset['test'].to_tf_dataset(batch_size=BATCH_SIZE,collate_fn=data_collator)

In [14]:
# Pull a single batch from the validation pipeline and print it to check the
# tensor shapes and padding produced by the collator.
for batch in valid_dataset.take(1):
    print(batch)

{'input_ids': <tf.Tensor: shape=(16, 64), dtype=int64, numpy=
array([[9139,  138, 2253, ...,    0,    0,    0],
       [9627,   37, 5791, ...,    0,    0,    0],
       [ 116,   27, 4839, ...,    0,    0,    0],
       ...,
       [  37,  304,   63, ...,    0,    0,    0],
       [  94,  808,  140, ...,    0,    0,    0],
       [6465,   13,  306, ...,    0,    0,    0]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(16, 64), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int64)>, 'labels': <tf.Tensor: shape=(16, 63), dtype=int64, numpy=
array([[   37, 13477,  2253, ...,  -100,  -100,  -100],
       [ 9627,    37,  5791, ...,  -100,  -100,  -100],
       [    3,    99,    27, ...,  -100,  -100,  -100],
       ...,
       [   37,   304,    63, ...,  -100,  -100,  -100],
       [   94,   808,   140, .

In [15]:
# Print the layer and parameter-count overview of the loaded model.
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  16449536  
                                                                 
 encoder (TFT5MainLayer)     multiple                  35330816  
                                                                 
 decoder (TFT5MainLayer)     multiple                  41625344  
                                                                 
Total params: 60506624 (230.81 MB)
Trainable params: 60506624 (230.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
# A single epoch keeps training time and resource usage down.
num_epochs = 1
# `tf_train_dataset` is already batched, so its length is the number of optimizer
# steps per epoch (Keras reports 6250 steps for one epoch). Dividing by BATCH_SIZE
# again would make the learning-rate schedule finish after only a small fraction
# of the training run.
num_train_steps = len(tf_train_dataset) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps
)

model.compile(optimizer=optimizer)


In [17]:
# Training takes a lot of time and compute power.
# It was not run to completion: the recorded run below was interrupted after 24 of 6250 steps.
model.fit(
    tf_train_dataset,
    validation_data=valid_dataset,
    epochs=num_epochs
)


Cause: for/else statement not yet supported
Cause: for/else statement not yet supported

  24/6250 [..............................] - ETA: 9:02:45 - loss: 1.2718


KeyboardInterrupt



In [None]:
#input_text ="Haves you every played soccer under the rain ,with your friends?"
#tokenized = tokenizer([input_text],return_tensors='tf')
#output =model.generate(**tokenized,max_length=128)
#print(output)

In [17]:
# Deliberately ungrammatical sentences used to try out the correction model.
wrong = [
    "He go to the store yesterday.",
    "She don't likes ice cream.",
    "The book are on the table.",
    "We is going to the park tomorrow.",
    "He don't have no money.",
    "She is a good cooker.",
    "I seen him yesterday.",
    "They was playing football.",
    "She sing good.",
    "The dog barks loud.",
    "He speak three languages good.",
]

In [None]:
#for i in range(len(wrong)):
# print(wrong[i]+"----"+tokenizer.decode(output[i],skip_special_tokens=True))


# **Since training requires a lot of time and compute resources, I use a pretrained model instead**

In [18]:
# Since training requires a lot of time and compute resources, use an already
# fine-tuned grammar-correction checkpoint instead (loaded with the PyTorch auto class).
# NOTE(review): the following cells tokenize and decode with the t5-small tokenizer
# loaded earlier, not one loaded from this checkpoint — presumably the vocabularies
# match; confirm.
from transformers import AutoModelForSeq2SeqLM
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained("juancavallotti/t5-base-gec")



In [19]:
# Tokenize all test sentences as one batch of PyTorch tensors, padded to the
# longest sentence, then generate corrections with `max_length=128`.
tokenized = tokenizer(
    wrong,
    padding="longest",
    truncation=True,
    return_tensors='pt'
)
output=pretrained_model.generate(**tokenized,max_length=128)


In [20]:
# Raw generated token ids, one row per input sentence; in the output below every
# row starts with 0 and shorter rows are filled with trailing 0s.
print(output)

tensor([[    0,   216,   877,    12,     8,  1078,  4981,     3,     5,     1,
             0,     0],
        [    0,   451,   744,    31,    17,   114,     3,   867,  3022,     3,
             5,     1],
        [    0,    37,  1335,    33,    30,     8,   953,     5,     1,     0,
             0,     0],
        [    0,   101,    33,   352,    12,     8,  2447,  5721,     3,     5,
             1,     0],
        [    0,   216,   744,    31,    17,    43,   136,   540,     5,     1,
             0,     0],
        [    0,   451,    19,     3,     9,   207,  3989,     3,     5,     1,
             0,     0],
        [    0,    27,  1509,   376,  4981,     3,     5,     1,     0,     0,
             0,     0],
        [    0,   328,   130,  1556,  3370,     3,     5,     1,     0,     0,
             0,     0],
        [    0,   451, 10159,     7,   168,     5,     1,     0,     0,     0,
             0,     0],
        [    0,    37,  1782, 21696,     7,  8002,   120,     3,     5,  

In [21]:
# First list the original (ungrammatical) sentences as they were given.
for sentence in wrong:
    print(sentence)

print("\n<----------Corrected Version------------->\n")

# Then decode each generated row back to text, dropping special tokens.
# `output` holds one row of token ids per sentence in `wrong`, in the same order.
for generated_ids in output:
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))


He go to the store yesterday.
She don't likes ice cream.
The book are on the table.
We is going to the park tomorrow.
He don't have no money.
She is a good cooker.
I seen him yesterday.
They was playing football.
She sing good.
The dog barks loud.
He speak three languages good.

<----------Corrected Version------------->

He went to the store yesterday.
She doesn't like ice cream.
The books are on the table.
We are going to the park tomorrow.
He doesn't have any money.
She is a good cook.
I saw him yesterday.
They were playing football.
She sings well.
The dog barks loudly.
He speaks three languages well.
