In [None]:
!pip install transformers datasets evaluate
!pip install sentencepiece

In [None]:
!pip install sacrebleu

In [None]:
import tensorflow as tf
import numpy as np
import io
import os
import re
import matplotlib.pyplot as plt
import string
import evaluate
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from datasets import load_dataset
from transformers import create_optimizer,T5TokenizerFast,DataCollatorForSeq2Seq,TFT5ForConditionalGeneration,TFAutoModelForSeq2SeqLM,AutoModelForSeq2SeqLM,TFT5ForConditionalGeneration

In [None]:
# Shared settings: examples per batch for the tf.data pipelines, and the
# token limit applied when tokenizing.
BATCH_SIZE, MAX_LENGTH = 64, 128

# Data Preparation

## Downloading

In [None]:
# Identifier of the dataset that is handed to load_dataset.
dataset_id = "leslyarun/c4_200m_gec_train100k_test25k"

In [None]:
# Load the dataset referenced by dataset_id.
dataset = load_dataset(path=dataset_id)

In [None]:
# Display the loaded dataset object to inspect its contents.
dataset

In [None]:
# Peek at the first example of the 'train' split.
dataset['train'][0]

## Tokenization

In [None]:
# Checkpoint name; used here for the tokenizer and further down for the model.
model_id = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_id)

In [None]:
def preprocess_function(examples):
  """Tokenize one batch of examples for sequence-to-sequence training.

  Args:
    examples: Batch mapping that provides an 'input' and an 'output' entry,
      each an iterable of texts (presumably strings -- confirm against the
      dataset schema).

  Returns:
    The tokenizer's output for the 'input' texts, with the 'output' texts
    passed as `text_target`; truncation is enabled with a limit of
    MAX_LENGTH.
  """
  source_texts = list(examples['input'])
  target_texts = list(examples['output'])

  return tokenizer(
      source_texts,
      text_target=target_texts,
      max_length=MAX_LENGTH,
      truncation=True,
  )

In [None]:
# Run preprocess_function over the dataset in batches and drop the columns
# that the 'train' split originally had.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [None]:
# Display the tokenized dataset object to check the result of the mapping.
tokenized_dataset

In [None]:
# Inspect the tokenized form of the training example at index 1000.
tokenized_dataset['train'][1000]

In [None]:
# Load the model for the same checkpoint as the tokenizer, then build the
# collator that is passed as collate_fn to the tf.data pipelines.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_id)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
)

In [None]:
# Training pipeline: shuffled batches built from the 'train' split.
train_dataset = tokenized_dataset["train"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
    shuffle=True,
)

In [None]:
# Validation pipeline: built from the 'test' split, kept in original order.
val_dataset = tokenized_dataset["test"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
    shuffle=False,
)

In [None]:
# Sanity check: print a single batch from the validation pipeline.
for sample_batch in val_dataset.take(1):
  print(sample_batch)

# Model

In [None]:
# Print the summary of the loaded model.
model.summary()

# Training

In [None]:
num_epochs = 5
# Total optimizer steps = number of batches in train_dataset times epochs;
# this is the schedule length handed to create_optimizer.
num_train_steps = num_epochs * len(train_dataset)

optimizer, schedule = create_optimizer(
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
    init_lr=2e-5,
)
# Only the optimizer is passed to compile(); no loss argument is given here
# (presumably the model supplies its own loss -- confirm).
model.compile(optimizer=optimizer)

In [None]:
# Train for num_epochs, evaluating on val_dataset; the returned history is
# plotted in the next cell.
history = model.fit(
    train_dataset,
    epochs=num_epochs,
    validation_data=val_dataset,
)

In [None]:
# Draw the training curve first and the validation curve second, so the
# order matches the legend entries below.
for curve_key in ('loss', 'val_loss'):
  plt.plot(history.history[curve_key])

plt.title('model_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# The weights are written to Google Drive, so Drive has to be mounted first;
# mounting here keeps this cell working on a fresh runtime instead of relying
# on a manual mount done outside the notebook.
drive.mount('/content/drive')

weights_path = '/content/drive/MyDrive/nlp/gec/t5-small.h5'
# Create the target folder when it does not exist yet so that save_weights
# does not fail on a Drive that lacks the nlp/gec directory.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
model.save_weights(weights_path)

# Evaluation

In [None]:
# Metric object used by the evaluation loop to score predictions.
metric = evaluate.load(path="sacrebleu")

In [None]:
all_preds = []
all_labels = []

# Only the first five validation batches are scored.
for batch in val_dataset.take(5):
  # max_length is passed explicitly (as in the testing cells): without it
  # generate() falls back to its default length limit, which can cut the
  # predictions short relative to references of up to MAX_LENGTH tokens and
  # distort the BLEU score.
  predictions = model.generate(
      input_ids=batch["input_ids"],
      attention_mask=batch["attention_mask"],
      max_length=MAX_LENGTH,
  )
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  labels = batch["labels"].numpy()
  # Label positions equal to -100 are replaced by the pad token id so that
  # the tokenizer can decode them (they are then skipped as special tokens).
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  all_preds.extend(decoded_preds)
  all_labels.extend(decoded_labels)

# sacreBLEU takes a list of reference texts per prediction, so each single
# reference is wrapped in its own list.
result = metric.compute(
    predictions=all_preds,
    references=[[label] for label in all_labels],
)
print(result)

In [None]:
# Display the decoded predictions; decoded_preds is reassigned on every loop
# iteration above, so this shows the last evaluated batch only.
decoded_preds

In [None]:
# Display the decoded reference texts of the last evaluated batch, for
# side-by-side comparison with decoded_preds.
decoded_labels

# Testing

In [None]:
# Hand-written faulty sentences used to try out the fine-tuned model.
wrong_english = [
    "Dady hav'e eateing her foot",
    "DJ Sorryyouwastedyourmoneytobehere",
    "i used to like to swimming",
    "maybe we should organized a meetin with the people from unesco",
    "when are we goinge to start play football",
    "many a time rain fall in my city",
]

# Tokenize the whole list at once, padded to the longest sentence, and
# request TensorFlow tensors for the TF model.
tokenized = tokenizer(
    wrong_english,
    return_tensors='tf',
    truncation=True,
    max_length=MAX_LENGTH,
    padding="longest",
)
out = model.generate(**tokenized, max_length=128)
print(out)

In [None]:
# Print every input sentence next to the text decoded from its generated ids.
for idx, sentence in enumerate(wrong_english):
  corrected = tokenizer.decode(out[idx], skip_special_tokens=True)
  print(sentence+"------------>"+corrected)

In [None]:
# Load a separate, already published checkpoint (presumably to compare its
# output with the model fine-tuned above -- confirm intent).
pretrained_checkpoint = "juancavallotti/t5-base-gec"
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_checkpoint)

In [None]:
# Same faulty sentences as in the fine-tuned model test above.
wrong_english = [
    "Dady hav'e eateing her foot",
    "DJ Sorryyouwastedyourmoneytobehere",
    "i used to like to swimming",
    "maybe we should organized a meetin with the people from unesco",
    "when are we goinge to start play football",
    "many a time rain fall in my city",
]

# This time PyTorch tensors are requested ('pt') because they are fed to
# pretrained_model. NOTE(review): the tokenizer is still the one loaded for
# model_id, not one loaded from the pretrained checkpoint -- confirm that
# this is intended.
tokenized = tokenizer(
    wrong_english,
    return_tensors='pt',
    truncation=True,
    max_length=MAX_LENGTH,
    padding="longest",
)
out = pretrained_model.generate(**tokenized, max_length=128)
print(out)

In [None]:
# Print every input sentence next to the text decoded from the ids that
# pretrained_model generated for it.
for idx, sentence in enumerate(wrong_english):
  corrected = tokenizer.decode(out[idx], skip_special_tokens=True)
  print(sentence+"------------>"+corrected)