In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load the pre-trained T5-small model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Define the task-specific head
task_prefix = "spelling correction: "


In [3]:
# Define the training data
training_data_samples = [
    ("Teh quik brown fox jumpd over teh lazy dog.", "The quick brown fox jumped over the lazy dog."),
    ("I hav two appels and three orngez.", "I have two apples and three oranges."),
    ("Thay went too the park.", "They went to the park."),
    ("Its a beautifull day.", "It's a beautiful day."),
    ("I cant wait for the weeknd.", "I can't wait for the weekend."),
    ("Thier house is very nice.", "Their house is very nice."),
    ("Definately going to the party.", "Definitely going to the party."),
    ("I dont no what to do.", "I don't know what to do."),
    ("Whos going to the movies?", "Who's going to the movies?"),
    ("I dint do my homework.", "I didn't do my homework."),
]

# I have a CSV file with the training data. The file has two columns: "incorrect" and "correct".
# I want to extract the "incorrect" column and use it as the input text, and the "correct" column as the target text.
# Write down the code such that I have incorrect and correct tuples in a single list.

import pandas as pd

# Load the CSV file
data = pd.read_csv('/Users/adel/adel/dev/playground/training data/auto-correct/corr_inc_training_data.csv')

# Extract the 'incorrect' and 'correct' columns and create tuples
training_data = list(zip(data['misspelling'], data['correct_word']))

# Now text_pairs contains tuples in the format (incorrect, correct)

training_data
training_data_samples



FileNotFoundError: [Errno 2] No such file or directory: '/Users/adel/adel/dev/playground/training data/auto-correct/corr_inc_training_data.csv'

In [9]:
# Preprocess the data
def preprocess_data(input_texts, target_texts):
    
    #To address the issue of potential non-string types and None values in your input_texts and target_texts
    inputs = [task_prefix + str(text) for text in input_texts if text is not None]
    target_texts = [str(text) for text in target_texts if text is not None]

    # Tokenize the inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length", return_tensors="pt")["input_ids"]
    return model_inputs, labels

# Prepare the training data
input_texts, target_texts = zip(*training_data)
train_inputs, train_labels = preprocess_data(input_texts, target_texts)


In [4]:
# Fine-tune the model - no batch processsing
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
num_epochs = 10

for epoch in range(num_epochs):
    outputs = model(input_ids=train_inputs["input_ids"], attention_mask=train_inputs["attention_mask"], labels=train_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

# Evaluate the model
print("-----Starting model eval mode - ready to make predictions")
model.eval()
print("-----Finished model eval")

NameError: name 'model' is not defined

In [2]:
#fine tuning w/ batch procerssing
from torch.utils.data import DataLoader, TensorDataset

# Assuming train_inputs and train_labels are already tensors.
# If not, you should convert them:
# train_inputs_ids = torch.tensor(train_inputs["input_ids"])
# train_attention_mask = torch.tensor(train_inputs["attention_mask"])
# train_labels = torch.tensor(train_labels)

# Create a TensorDataset
dataset = TensorDataset(train_inputs["input_ids"], train_inputs["attention_mask"], train_labels)

# Create a DataLoader
batch_size = 100  # You can adjust the batch size according to your GPU memory
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


NameError: name 'train_inputs' is not defined

In [7]:
# Fine-tune the model
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
num_epochs = 10

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        batch_input_ids, batch_attention_mask, batch_labels = batch

        # Zero the gradients on each iteration
        optimizer.zero_grad()

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Calculate average loss over the number of batches
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")


KeyboardInterrupt: 

In [None]:
# Inference method
def correct_text(input_text):
    input_ids = tokenizer.encode(task_prefix + input_text, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=512)[0]
    corrected_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    return corrected_text

In [None]:
#run inference
input_text = "I draaanks waatttere"
input_ids = tokenizer.encode(task_prefix + input_text, return_tensors="pt")
print("------Input IDs",input_ids)
output_ids = model.generate(input_ids, max_length=512)[0]
print("------Output IDs",output_ids)

corrected_text = tokenizer.decode(output_ids, skip_special_tokens=True)
print(f"Input: {input_text}")
print(f"Corrected: {corrected_text}")

------Input IDs tensor([[19519,    11, 19590, 11698,    10,    27,     3,  3515,     9,  5979,
             7,  8036,   144,    17,   449,    15,     1]])
------Output IDs tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,