In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained "t5-small" tokenizer and model; the model is moved to `device`.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to(device)


In [None]:
from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an indexable
            batch of values; every field is returned for each example.
        labels: Mapping that must contain an ``input_ids`` entry holding the
            tokenized targets; only that entry is used.

    Each item is a dict with one tensor per field of ``encodings`` plus a
    ``labels`` tensor taken from ``labels['input_ids']``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning for every item;
        ``clone().detach()`` is the recommended copy for tensors, while plain
        Python sequences still go through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        # The number of examples is defined by the number of target sequences.
        return len(self.labels['input_ids'])


In [None]:
# Quietly install/upgrade the `datasets` package that provides `load_dataset`, used in the following cells.
!pip install -qU datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset
import csv

# JFLEG Dataset: the "validation" split is bound to train_dataset and the
# "test" split to eval_dataset.
train_dataset, eval_dataset = (
    load_dataset("jfleg", split=split_spec)
    for split_spec in ('validation[:]', 'test[:]')
)

# Show the dataset summary, then the first sentence and its reference corrections.
print(train_dataset)
for column in ('sentence', 'corrections'):
    print(train_dataset[column][0])

README.md:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/141k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/755 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/748 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 755
})
So I think we can not live if old people could not find siences and tecnologies and they did not developped . 
['So I think we would not be alive if our ancestors did not develop sciences and technologies . ', 'So I think we could not live if older people did not develop science and technologies . ', 'So I think we can not live if old people could not find science and technologies and they did not develop . ', 'So I think we can not live if old people can not find the science and technology that has not been developed . ']


In [None]:
# Ordered (pattern, replacement) pairs applied one after another by
# remove_excess_spaces. The first group re-attaches punctuation that is
# separated from the preceding word by a space; the second group repairs a
# few specific space-separated numbers and one quoted phrase.
replacements = [
    (" .", "."),
    (" ,", ","),
    (" '", "'"),
    (" ?", "?"),
    (" !", "!"),
    # Colons and semicolons keep their own character (they were previously
    # rewritten to "!").
    (" :", ":"),
    (" ;", ";"),
    (" n't", "n't"),
    # NOTE: the former (" v", "n't") rule was removed; it rewrote every word
    # starting with "v" (e.g. "is very" -> "isn'tery").
    ("2 0 0 6", "2006"),
    ("5 5", "55"),
    ("4 0 0", "400"),
    ("1 7-5 0", "1750"),
    ("2 0 %", "20%"),
    ("5 0", "50"),
    ("1 2", "12"),
    ("1 0", "10"),
    ('" ballast water', '"ballast water'),
]


def remove_excess_spaces(text):
    """Apply every rule in ``replacements`` to ``text``, in list order.

    Args:
        text: The string to clean.

    Returns:
        The string after all literal substring replacements were applied.
    """
    for pattern, replacement in replacements:
        text = text.replace(pattern, replacement)
    return text

def generate_csv(csv_path, dataset):
    """Write one (input, target) CSV row per sentence/correction pair.

    Args:
        csv_path: Destination file path; an existing file is overwritten.
        dataset: Iterable of records exposing a ``"sentence"`` string and a
            ``"corrections"`` iterable of strings.

    The input column is the sentence prefixed with ``"grammar: "``; both
    columns are passed through ``remove_excess_spaces``. Pairs whose sentence
    or correction is blank are skipped.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            sentence = case["sentence"]
            # Check the raw sentence: once the task prefix is added the input
            # is never empty, so testing the prefixed text would skip nothing.
            if not sentence.strip():
                continue
            # Adding the task's prefix to input
            input_text = remove_excess_spaces("grammar: " + sentence)
            for correction in case["corrections"]:
                correction = remove_excess_spaces(correction)
                # a few of the cases contain blank strings.
                if correction.strip():
                    writer.writerow([input_text, correction])

# Generate train and eval for JFLEG Dataset
import os

# exist_ok=True keeps this cell re-runnable; intermediate directories are
# created as needed, so "Dataset" does not need a separate step.
os.makedirs("Dataset/JFLEG", exist_ok=True)
generate_csv("Dataset/JFLEG/train.csv", train_dataset)
generate_csv("Dataset/JFLEG/eval.csv", eval_dataset)

In [None]:
# C4_200M Dataset, opened in streaming mode.
c4_dataset = load_dataset(
    "liweili/c4_200m",
    streaming=True,
)

# One iterator over the 'train' split; it is consumed by c4_generate_csv below.
iterator = iter(c4_dataset['train'])

def c4_generate_csv(csv_path, iterator, num_examples):
    """Write up to ``num_examples`` records from ``iterator`` to a CSV file.

    Args:
        csv_path: Destination file path; an existing file is overwritten.
        iterator: Iterator of records exposing ``"input"`` and ``"output"``
            strings. It is advanced in place.
        num_examples: Maximum number of records to consume from ``iterator``.

    The input column is the record's ``"input"`` prefixed with ``"grammar: "``;
    both columns are passed through ``remove_excess_spaces``. Records whose
    input or output is blank are skipped. If the iterator runs out early, the
    file simply contains fewer rows.
    """
    from itertools import islice

    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        # islice stops cleanly on a short stream instead of letting
        # StopIteration escape from a bare next().
        for data in islice(iterator, num_examples):
            source = data["input"]
            correction = remove_excess_spaces(data["output"])
            # Check the raw input: the prefixed text is never empty.
            if not source.strip() or not correction.strip():
                continue
            input_text = remove_excess_spaces("grammar: " + source)
            writer.writerow([input_text, correction])

# Generate first 3500 examples from C4_200M dataset
import os

# Creates parent directories too and does not fail when the cell is re-run.
os.makedirs("Dataset/C4_200M", exist_ok=True)
c4_generate_csv("Dataset/C4_200M/c4data.csv", iterator, num_examples=3500)

README.md:   0%|          | 0.00/937 [00:00<?, ?B/s]

c4_200m.py:   0%|          | 0.00/2.79k [00:00<?, ?B/s]

The repository for liweili/c4_200m contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/liweili/c4_200m.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
import pandas as pd

# Read the three generated CSV files (JFLEG train, JFLEG eval, C4 sample).
jfleg_train, jfleg_eval, c4_data = (
    pd.read_csv(csv_file)
    for csv_file in (
        "Dataset/JFLEG/train.csv",
        "Dataset/JFLEG/eval.csv",
        "Dataset/C4_200M/c4data.csv",
    )
)

# Stack them into a single frame with a fresh 0..n-1 index and persist it.
combined_data = pd.concat(
    [jfleg_train, jfleg_eval, c4_data],
    ignore_index=True,
)
combined_data.to_csv("combined_data.csv", index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GroupShuffleSplit

combined_data = pd.read_csv("combined_data.csv")

# The same "input" sentence appears once per reference correction. A row-level
# random split would place copies of one sentence in both train and eval, so
# split by group instead: all rows sharing an input land on the same side.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, eval_idx = next(
    splitter.split(combined_data, groups=combined_data["input"])
)
train_data = combined_data.iloc[train_idx]
eval_data = combined_data.iloc[eval_idx]

# Guard against leakage between the two sets.
assert set(train_data["input"]).isdisjoint(eval_data["input"])

train_data.to_csv("train_data.csv", index=False)
eval_data.to_csv("eval_data.csv", index=False)

In [None]:
import pandas as pd

# Reload both splits from the CSV files written by the split cell.
train_data, eval_data = (
    pd.read_csv(split_csv)
    for split_csv in ("train_data.csv", "eval_data.csv")
)

In [None]:
def _encode(texts):
    """Tokenize a list of strings into padded, truncated PyTorch tensors."""
    return tokenizer(texts, return_tensors="pt", padding=True, truncation=True)


def _encode_labels(texts):
    """Tokenize target strings and mask padding positions with -100.

    Label positions set to -100 are ignored by the loss, so padding added to
    equalize sequence lengths does not contribute to training.
    """
    encoded = _encode(texts)
    label_ids = encoded["input_ids"]
    encoded["input_ids"] = label_ids.masked_fill(
        label_ids == tokenizer.pad_token_id, -100
    )
    return encoded


# Raw strings from the CSV columns.
train_texts = train_data["input"].tolist()
train_targets = train_data["target"].tolist()

eval_texts = eval_data["input"].tolist()
eval_targets = eval_data["target"].tolist()

# Tokenized model inputs and (pad-masked) labels.
train_inputs = _encode(train_texts)
train_labels = _encode_labels(train_targets)

eval_inputs = _encode(eval_texts)
eval_labels = _encode_labels(eval_targets)

train_dataset = TextDataset(train_inputs, train_labels)
eval_dataset = TextDataset(eval_inputs, eval_labels)


In [None]:
# Training configuration. Mixed precision (fp16) is enabled only when a CUDA
# device is present, so the notebook also runs on the CPU fallback chosen when
# the model was loaded.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    num_train_epochs=30,
    # Two accumulation steps per optimizer update.
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# eval_dataset is attached to the Trainer; no evaluation schedule is
# configured in the arguments above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

In [None]:
# Run fine-tuning with the configured Trainer; the value returned by train() is shown as the cell output.
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # Access 'input_ids'
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,0.4038
1000,0.0649
1500,0.0599
2000,0.0561
2500,0.0544
3000,0.0516
3500,0.05
4000,0.0488
4500,0.0475
5000,0.0467


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # Access 'input_ids'
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # Access 'input_ids'
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # Access 'input_ids'
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # Access 'input_ids'
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # Access 'input_ids'
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # Access 'input_ids'
  item = {key: torch.t

TrainOutput(global_step=7140, training_loss=0.07537648464117397, metrics={'train_runtime': 9815.3809, 'train_samples_per_second': 23.238, 'train_steps_per_second': 0.727, 'total_flos': 3.087011149774848e+16, 'train_loss': 0.07537648464117397, 'epoch': 30.0})

In [None]:
# Every training input was written with the "grammar: " task prefix, so the
# prompt used at inference time must carry the same prefix.
test_text = "grammar: can your help me please."

# generate() does not switch the module to evaluation mode by itself; call
# eval() so dropout is disabled and the output is deterministic.
model.eval()

test_inputs = tokenizer(test_text, return_tensors="pt").to(model.device)
# Allow outputs longer than the default generation length.
outputs = model.generate(**test_inputs, max_new_tokens=64)

predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(predicted_text)

Can your help me please.


In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
save_dir = "grammar_correction_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('grammar_correction_model/tokenizer_config.json',
 'grammar_correction_model/special_tokens_map.json',
 'grammar_correction_model/spiece.model',
 'grammar_correction_model/added_tokens.json')

In [None]:
# Bundle the files directly under grammar_correction_model/ into grammar_correction_model.zip.
!zip grammar_correction_model.zip grammar_correction_model/*

  adding: grammar_correction_model/added_tokens.json (deflated 83%)
  adding: grammar_correction_model/config.json (deflated 62%)
  adding: grammar_correction_model/generation_config.json (deflated 29%)
  adding: grammar_correction_model/model.safetensors (deflated 9%)
  adding: grammar_correction_model/special_tokens_map.json (deflated 85%)
  adding: grammar_correction_model/spiece.model (deflated 48%)
  adding: grammar_correction_model/tokenizer_config.json (deflated 94%)
