In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
import time
from tqdm import tqdm
import nltk
from nltk.translate.gleu_score import sentence_gleu
import numpy as np

nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/j/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load dataset
df = pd.read_csv("Cleaned_Lang8.csv", header=None)
df.columns = ["0", "1"]

# Split data into train and eval sets
train_df, eval_df = train_test_split(df, test_size=0.3, random_state=9, shuffle=True)

# Convert to lists
train_texts = train_df["0"].tolist()
train_labels = train_df["1"].tolist()
eval_texts = eval_df["0"].tolist()
eval_labels = eval_df["1"].tolist()

In [5]:
# Setting batch size and number of epochs of each run
batch_size = 16
epochs = 5 

In [6]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")  # Replace "t5-base" with your desired model

# Tokenize and encode the training and evaluation sets
train_encodings = tokenizer(train_df["0"].tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt")
train_labels = tokenizer(train_df["1"].tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

eval_encodings = tokenizer(eval_df["0"].tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt")
eval_labels = tokenizer(eval_df["1"].tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

In [7]:
# Prepare TensorDatasets
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], train_labels["input_ids"])
eval_dataset = TensorDataset(eval_encodings["input_ids"], eval_encodings["attention_mask"], eval_labels["input_ids"])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

In [8]:
# Load model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# Move the model to the mps device
model.to("mps" if torch.backends.mps.is_available() else "cpu")

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [9]:
# Set up optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [10]:
def train_model(model, train_loader, optimizer, epochs, startepoch=0, checkpoint_path=None):
    # Create the mps device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    # Set the mode of the model to training mode
    model.train()
    start_time = time.time()

    for epoch in range(startepoch, epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        total_loss = 0

        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Average Loss: {avg_loss:.4f}")

        # Save a checkpoint after each epoch, if checkpoint path is provided
        if checkpoint_path:
            torch.save({
                'epoch': epoch + 1,  # Save the current epoch
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
            }, checkpoint_path)
            print(f"Checkpoint saved at epoch {epoch + 1}")

    end_time = time.time()
    print(f"Training completed in {(end_time - start_time) // 3600}h {(end_time - start_time) % 3600 // 60}m {(end_time - start_time) % 60}s")


In [11]:
# Initial train of the model
train_model(model, train_loader, optimizer, epochs, 0, "t5model-16batch_checkpoint.pth")

Epoch 1/5


  0%|                                                  | 0/8750 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|█████████████████████████████████████| 8750/8750 [1:30:12<00:00,  1.62it/s]


Average Loss: 0.1068
Checkpoint saved at epoch 1
Epoch 2/5


100%|█████████████████████████████████████| 8750/8750 [1:29:54<00:00,  1.62it/s]


Average Loss: 0.0459
Checkpoint saved at epoch 2
Epoch 3/5


100%|█████████████████████████████████████| 8750/8750 [1:30:01<00:00,  1.62it/s]


Average Loss: 0.0389
Checkpoint saved at epoch 3
Epoch 4/5


100%|█████████████████████████████████████| 8750/8750 [1:30:42<00:00,  1.61it/s]


Average Loss: 0.0341
Checkpoint saved at epoch 4
Epoch 5/5


100%|█████████████████████████████████████| 8750/8750 [1:30:13<00:00,  1.62it/s]


Average Loss: 0.0304
Checkpoint saved at epoch 5
Training completed in 7.0h 31.0m 19.58087396621704s


In [12]:
def evaluate_model_with_gleu(model, eval_loader, tokenizer):
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    model.to(device)
    model.eval()
    gleu_scores = []

    start_time = time.time()

    with torch.no_grad():
        for batch in tqdm(eval_loader):
            # Move tensors individually to device
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            # Generate predictions
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)

            # Decode predictions and references
            predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            references = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

            # Calculate GLEU for each sentence
            for pred, ref in zip(predictions, references):
                ref_tokens = [ref.split()]
                pred_tokens = pred.split()
                gleu_score = sentence_gleu(ref_tokens, pred_tokens)
                gleu_scores.append(gleu_score)

    avg_gleu_score = np.mean(gleu_scores)
    end_time = time.time()

    print(f"Evaluation completed in {(end_time - start_time) // 3600}h {(end_time - start_time) % 3600 // 60}m {(end_time - start_time) % 60}s")
    print(f"Average GLEU Score: {avg_gleu_score:.4f}")


In [13]:
# Evaluate the model
evaluate_model_with_gleu(model, eval_loader, tokenizer)

100%|█████████████████████████████████████| 3751/3751 [1:53:33<00:00,  1.82s/it]

Evaluation completed in 1.0h 53.0m 33.718159914016724s
Average GLEU Score: 0.7354





In [14]:
def predict(texts, model, tokenizer, max_length=128):
    # Ensure the model is in evaluation mode
    model.eval()
    
    # Device configuration
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    model.to(device)
    
    # Tokenize the input text(s)
    encodings = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    # Generate predictions
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)

    # Decode predictions to text
    predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return predictions

In [15]:
texts = ["These is a example sentence that needs correction.", 
         "Another sentence for tests gramar corection.",
         "Hello! my name are Jassim. How aree you doing tooday?",
         "I like to code with python programming language so many. He is my favorite language."]
predicted_texts = predict(texts, model, tokenizer)

for i, prediction in enumerate(predicted_texts):
    print(f"Original: {texts[i]}")
    print(f"Corrected: {prediction}")

Original: These is a example sentence that needs correction.
Corrected: this is an example sentence that needs correction.
Original: Another sentence for tests gramar corection.
Corrected: another sentence for grammatical correction.
Original: Hello! my name are Jassim. How aree you doing tooday?
Corrected: hello! my name is Jassim. how are you doing today?
Original: I like to code with python programming language so many. He is my favorite language.
Corrected: i like to code with python so much. it is my favorite language.
