In [1]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu


Looking in indexes: https://download.pytorch.org/whl/nightly/cpu
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importing Libraries

# Libraries to import the dataset into the code and split it
import pandas as pd
from sklearn.model_selection import train_test_split

# Libraries for PyTorch model building and data processing
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm  # Import tqdm for progress bar
import time

# Libraries for tokenization
from transformers import BertTokenizer
import re

# Evaluation metric
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

In [4]:
# checking if my Mac can leverage Apple’s Metal Performance Shaders (MPS)
print("MPS Available:", torch.backends.mps.is_available())

MPS Available: True


In [5]:
# Check if MPS is available and set the device
#device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = torch.device("cpu")

In [6]:
# Preprocessing function to clean text
def preprocess_text(text):
    #Preprocess text by removing special characters.
    
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip()  # Only stripping extra spaces

In [7]:
# Load the dataset
def load_lang8_data(file_path):
    #Loads and preprocesses the dataset from a CSV file.
    #Assumes the CSV file has two columns: '0' (original) and '1' (corrected).
    
    data = pd.read_csv(file_path)
    inputs = data['0'].apply(preprocess_text).tolist()  # Original text
    targets = data['1'].apply(preprocess_text).tolist()  # Corrected text
    return inputs, targets

In [8]:
# Load and preprocess the data
inputs, targets = load_lang8_data("Cleaned_Lang8.csv")

In [9]:
inputs[0]

'the president was standing in the front row and the every female enployees were surrounding him'

In [10]:
targets[0]

'the president was standing in the front row and all the female employees were surrounding him'

In [11]:
# Split the dataset into training and validation sets with shuffling (80% train, 20% validation)
train_inputs, valid_inputs, train_targets, valid_targets = train_test_split(
    inputs[:10000], targets[:10000], test_size=0.2, random_state=9, shuffle=True
)

In [12]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

In [13]:
# Tokenize data
def tokenize_data(sentences):
    #Tokenizes sentences using the BERT tokenizer and returns the tokenized tensors.
    
    return tokenizer(sentences, padding="max_length", truncation=True, return_tensors="pt")


In [14]:
# Tokenize the training and validation data directly
train_encodings = tokenize_data(train_inputs)
train_labels = tokenize_data(train_targets)
valid_encodings = tokenize_data(valid_inputs)
valid_labels = tokenize_data(valid_targets)

In [15]:
train_encodings

{'input_ids': tensor([[  101,  1165,   178,  ...,     0,     0,     0],
        [  101, 13280,  5171,  ...,     0,     0,     0],
        [  101,  1177,  1208,  ...,     0,     0,     0],
        ...,
        [  101,  7455,  1106,  ...,     0,     0,     0],
        [  101,   178,  2023,  ...,     0,     0,     0],
        [  101,  1196,  1374,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [16]:
train_labels

{'input_ids': tensor([[ 101, 1165,  178,  ...,    0,    0,    0],
        [ 101,  178, 1156,  ...,    0,    0,    0],
        [ 101, 1177, 1208,  ...,    0,    0,    0],
        ...,
        [ 101, 7455, 1106,  ...,    0,    0,    0],
        [ 101,  178, 1108,  ...,    0,    0,    0],
        [ 101, 1107,  170,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [17]:
# Model 1: Custom Sequence-to-Sequence Model (RNN-based Encoder-Decoder)
class Seq2SeqModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(Seq2SeqModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, src, tgt):
        embedded_src = self.embedding(src)
        encoder_output, (hidden, cell) = self.encoder(embedded_src)
        
        embedded_tgt = self.embedding(tgt)
        decoder_output, _ = self.decoder(embedded_tgt, (hidden, cell))
        
        output = self.fc(decoder_output)
        return output

seq2seq_model = Seq2SeqModel(vocab_size=len(tokenizer.vocab), embed_size=256, hidden_size=512)

In [18]:
# Model 2: Custom Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, ff_hidden_size, max_len, device="cpu"):
        super(TransformerModel, self).__init__()
        self.device = device  # Store device to move tensors accordingly
        self.embedding = nn.Embedding(vocab_size, embed_size).to(self.device)
        self.positional_encoding = self.get_positional_encoding(max_len, embed_size).to(self.device)
        
        encoder_layer = nn.TransformerEncoderLayer(embed_size, num_heads, ff_hidden_size)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers).to(self.device)
        
        decoder_layer = nn.TransformerDecoderLayer(embed_size, num_heads, ff_hidden_size)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers).to(self.device)
        
        self.fc = nn.Linear(embed_size, vocab_size).to(self.device)

    def forward(self, src, tgt):
        # Ensure input tensors are on the same device
        src = src.to(self.device)
        tgt = tgt.to(self.device)
        
        src = self.embedding(src) + self.positional_encoding[:src.size(1), :].to(self.device)
        tgt = self.embedding(tgt) + self.positional_encoding[:tgt.size(1), :].to(self.device)
        
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        
        return self.fc(output)

    def get_positional_encoding(self, max_len, embed_size):
        pe = torch.zeros(max_len, embed_size)
        for pos in range(max_len):
            for i in range(0, embed_size, 2):
                pe[pos, i] = np.sin(pos / (10000 ** (i / embed_size)))
                pe[pos, i + 1] = np.cos(pos / (10000 ** (i / embed_size)))
        return pe.unsqueeze(0)

transformer_model = TransformerModel(
    vocab_size=len(tokenizer.vocab), embed_size=128, num_heads=4, num_layers=2, ff_hidden_size=512, max_len=512, device=device
)




In [19]:
# Training Function
def train_model(model, train_encodings, train_labels, epochs=1, batch_size=64):

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    model.train()
    total_batches = (len(train_encodings['input_ids']) + batch_size - 1) // batch_size
    
    for epoch in range(epochs):
        progress_bar = tqdm(range(total_batches), desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")

        epoch_loss = 0
        for i in progress_bar:
            # Create batches manually
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, len(train_encodings['input_ids']))
            
            # Extract the batch and move tensors to MPS
            input_ids = train_encodings['input_ids'][start_idx:end_idx].to(device)
            labels = train_labels['input_ids'][start_idx:end_idx].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, input_ids)  # Move all batch items to device

            # Ensure outputs and labels are on the same device
            outputs = outputs.to(device)
            labels = labels.to(device)

            # Calculate loss and perform backpropagation
            loss = criterion(outputs.view(-1, outputs.size(-1)), labels.view(-1))
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix({"batch_loss": loss.item()})
        
        average_epoch_loss = epoch_loss / total_batches
        print(f"Epoch {epoch + 1}/{epochs} completed with Average Loss: {average_epoch_loss:.4f}")



In [20]:
# Evaluating function
def evaluate_model(model, valid_encodings, valid_labels, tokenizer, batch_size=64):
    
    # Evaluates the model using BLEU score and prints the average BLEU score.
    model.eval()

    scores = []

    # Move encodings to device
    input_ids = valid_encodings["input_ids"].to(device)
    labels = valid_labels["input_ids"].to(device)

    # Evaluate in batches
    num_batches = (input_ids.size(0) + batch_size - 1) // batch_size
    with torch.no_grad():
        for i in tqdm(range(num_batches), desc="Evaluating"):
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, input_ids.size(0))

            # Extract batch
            input_batch = input_ids[start_idx:end_idx]
            label_batch = labels[start_idx:end_idx]

            # Forward pass
            outputs = model(input_batch, input_batch)  # Both src and tgt are input_ids for simplicity
            
            # Move predictions to CPU for BLEU calculation
            predictions = outputs.argmax(dim=-1).cpu()
            label_batch = label_batch.cpu()

            # Decode predictions and labels
            output_texts = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
            label_texts = [tokenizer.decode(label, skip_special_tokens=True) for label in label_batch]

            # Calculate BLEU score for each sentence
            batch_scores = [
                sentence_bleu([label.split()], output.split())
                for label, output in zip(label_texts, output_texts)
            ]
            scores.extend(batch_scores)

    # Calculate the average BLEU score
    average_bleu = np.mean(scores)
    print("Average BLEU Score:", average_bleu)

In [21]:
# Move model to the device
seq2seq_model.to(device)

Seq2SeqModel(
  (embedding): Embedding(28996, 256)
  (encoder): LSTM(256, 512, batch_first=True)
  (decoder): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=28996, bias=True)
)

In [22]:
# Starting timer to measure time it takes to train the model
mod1train_starttime = time.time()

# Train the sequence-to-sequence model
train_model(seq2seq_model, train_encodings, train_labels, epochs=3)

# Ending timer to measure time it takes to train the model
mod1train_endtime = time.time()

# Measuring the time
mod1train_time = mod1train_endtime - mod1train_starttime
mod1train_hours = mod1train_time / 3600
mod1train_minutes = (mod1train_time % 3600) / 60
mod1train_seconds = mod1train_time % 60

Epoch 1/3: 100%|█████████| 125/125 [09:03<00:00,  4.35s/batch, batch_loss=0.151]


Epoch 1/3 completed with Average Loss: 0.4555


Epoch 2/3: 100%|█████████| 125/125 [09:05<00:00,  4.37s/batch, batch_loss=0.129]


Epoch 2/3 completed with Average Loss: 0.1449


Epoch 3/3: 100%|█████████| 125/125 [09:01<00:00,  4.33s/batch, batch_loss=0.115]

Epoch 3/3 completed with Average Loss: 0.1272





In [23]:
print (f"Time it took: {int(mod1train_hours)} hours, {int(mod1train_minutes)} minutes, and {int(mod1train_seconds)} seconds")

Time it took: 0 hours, 27 minutes, and 10 seconds


In [24]:
# Starting timer to measure time it takes to evaluate the model
mod1eval_starttime = time.time()

# Evaluate the sequence-to-sequence model
evaluate_model(seq2seq_model, valid_encodings, valid_labels, tokenizer)

# Ending timer to measure time it takes to evaluate the model
mod1eval_endtime = time.time()

# Measuring the time
mod1eval_time = mod1eval_endtime - mod1eval_starttime
mod1eval_hours = mod1eval_time / 3600
mod1eval_minutes = (mod1eval_time % 3600) / 60
mod1eval_seconds = mod1eval_time % 60

print (f"Time it took: {int(mod1eval_hours)} hours, {int(mod1eval_minutes)} minutes, and {int(mod1eval_seconds)} seconds")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Evaluating: 100%|███████████████████████████████| 32/32 [00:59<00:00,  1.87s/it]

Average BLEU Score: 0.07982769435203343
Time it took: 0 hours, 0 minutes, and 59 seconds





In [25]:
torch.save(seq2seq_model.state_dict(), "seq2seq_model_311024.pth")

In [26]:
# Move model to the device
transformer_model.to(device)

TransformerModel(
  (embedding): Embedding(28996, 128)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True

In [27]:
# Starting timer to measure time it takes to train the model
mod2train_starttime = time.time()

# Train the transformer model
train_model(transformer_model, train_encodings, train_labels, epochs=3)

# Ending timer to measure time it takes to train the model
mod2train_endtime = time.time()

# Measuring the time
mod2train_time = mod2train_endtime - mod2train_starttime
mod2train_hours = mod2train_time / 3600
mod2train_minutes = (mod2train_time % 3600) / 60
mod2train_seconds = mod2train_time % 60

Epoch 1/3: 100%|█████████| 125/125 [06:42<00:00,  3.22s/batch, batch_loss=0.169]


Epoch 1/3 completed with Average Loss: 0.7614


Epoch 2/3: 100%|█████████| 125/125 [06:44<00:00,  3.24s/batch, batch_loss=0.142]


Epoch 2/3 completed with Average Loss: 0.1594


Epoch 3/3: 100%|█████████| 125/125 [06:37<00:00,  3.18s/batch, batch_loss=0.131]

Epoch 3/3 completed with Average Loss: 0.1421





In [28]:
print (f"Time it took: {int(mod2train_hours)} hours, {int(mod2train_minutes)} minutes, and {int(mod2train_seconds)} seconds")

Time it took: 0 hours, 20 minutes, and 5 seconds


In [29]:
# Starting timer to measure time it takes to evaluate the model
mod2eval_starttime = time.time()

# Evaluate the transformer model
evaluate_model(transformer_model, valid_encodings, valid_labels, tokenizer)

# Ending timer to measure time it takes to evaluate the model
mod2eval_endtime = time.time()

# Measuring the time
mod2eval_time = mod2eval_endtime - mod2eval_starttime
mod2eval_hours = mod2eval_time / 3600
mod2eval_minutes = (mod2eval_time % 3600) / 60
mod2eval_seconds = mod2eval_time % 60

print (f"Time it took: {int(mod2eval_hours)} hours, {int(mod2eval_minutes)} minutes, and {int(mod2eval_seconds)} seconds")

Evaluating: 100%|███████████████████████████████| 32/32 [00:31<00:00,  1.01it/s]

Average BLEU Score: 0.1285129634991028
Time it took: 0 hours, 0 minutes, and 31 seconds





In [30]:
torch.save(transformer_model.state_dict(), "transformer_model_311024.pth")

In [31]:
# prepare a text for prediction by tokenizing it and moving it into the MPS
def prepare_text(text, tokenizer, device):
    # Tokenize and convert to tensor
    encoding = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    return encoding['input_ids'].to(device)

In [32]:
# generate the predicition of corrected text using the original
def generate_prediction(text, model, tokenizer, device="cpu"):

    # Prepare the input text
    input_ids = prepare_text(text, tokenizer, device)

    # Pass input through model
    with torch.no_grad():
        outputs = model(input_ids, input_ids)  # Using input_ids as both source and target
        predictions = outputs.argmax(dim=-1)

    # Decode predictions to text
    corrected_text = tokenizer.decode(predictions[0], skip_special_tokens=True)
    return corrected_text

#device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
#seq2seq_model.to(device)
#transformer_model.to(device)

text = "These is a example sentence with bad grammar. How is you today? What do you do yesterday?"
corrected_seq2seq = generate_prediction(text, seq2seq_model, tokenizer, device)
corrected_transformer = generate_prediction(text, transformer_model, tokenizer, device)

print("Seq2Seq Correction:", corrected_seq2seq)
print("Transformer Correction:", corrected_transformer)


Seq2Seq Correction: the is a a the with with with with is you today do you do do do
Transformer Correction: is a i with a to and to is you not do you do yesterday the
