# GPT-2 Text Correction Model 


In [None]:
import os
import numpy as np
import pandas as pd

# Dataset directory (a relative path, so the notebook is not tied to one machine)
DATA_DIR = "./"

# Print every CSV file found anywhere under the dataset directory
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        if filename.endswith('.csv'):
            print(os.path.join(dirname, filename))

# Expected locations of the train / validation splits
train_path = os.path.join(DATA_DIR, "train.csv")
validation_path = os.path.join(DATA_DIR, "validation.csv")

# Bind both names up front: a missing file then yields None instead of leaving
# the variable undefined (which would only surface later as a NameError).
train_df = None
val_df = None

if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
    print("✅ Train dataset loaded:", train_df.shape)
else:
    print("Train dataset not found:", train_path)

if os.path.exists(validation_path):
    val_df = pd.read_csv(validation_path)
    print("✅ Validation dataset loaded:", val_df.shape)
else:
    print("Validation dataset not found:", validation_path)

In [None]:
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Where the tokenizer files are written (a relative path, not a personal directory)
TOKENIZER_PATH = "./models/tokenizer"

# Create the output directory if it does not exist yet
os.makedirs(TOKENIZER_PATH, exist_ok=True)

# Load the GPT-2 tokenizer with explicit BOS/EOS strings.
# No separate pad_token string is passed: the pad token is set to EOS right
# below, so a dedicated '<|pad|>' token would only register an extra vocabulary
# entry that is never used.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>'
)
# Reuse EOS as the padding token so padding='max_length' has a token to pad with
tokenizer.pad_token = tokenizer.eos_token

def preprocess(example):
    """Tokenize source/target text into fixed-length inputs and labels.

    Accepts either a single example (``src``/``tgt`` are strings) or a batch
    as passed by ``Dataset.map(..., batched=True)`` (``src``/``tgt`` are lists
    of strings).

    Args:
        example: Mapping with a ``'src'`` entry (text to correct) and a
            ``'tgt'`` entry (reference text).

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        source text and ``'labels'`` taken from the target text, each padded
        or truncated to 128 tokens. Padded label positions are set to -100.
        For a single example the leading batch dimension is removed; for a
        batch it is kept.
    """
    source = example['src']
    target = example['tgt']
    # A bare string means one example; anything else is treated as a batch.
    is_single = isinstance(source, str)

    encoded_source = tokenizer(
        source, max_length=128, padding='max_length', truncation=True, return_tensors="pt"
    )
    encoded_target = tokenizer(
        target, max_length=128, padding='max_length', truncation=True, return_tensors="pt"
    )

    # Mask padded label positions with -100. The attention mask is used rather
    # than a comparison with pad_token_id because the pad token is the EOS
    # token here, so an id comparison would also mask genuine EOS tokens.
    labels = encoded_target["input_ids"].masked_fill(
        encoded_target["attention_mask"] == 0, -100
    )
    input_ids = encoded_source['input_ids']
    attention_mask = encoded_source['attention_mask']

    if is_single:
        # Drop the batch dimension only for a single example. Squeezing
        # unconditionally would also collapse a real batch that happens to
        # contain exactly one row.
        input_ids = input_ids.squeeze(0)
        attention_mask = attention_mask.squeeze(0)
        labels = labels.squeeze(0)

    # NOTE(review): labels are aligned position-by-position with the source
    # tokens; confirm this is the intended objective for a causal LM.
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }

# Write the tokenizer files to the local directory, then report the location
tokenizer.save_pretrained(save_directory=TOKENIZER_PATH)
print(f"Tokenizer saved at {TOKENIZER_PATH}")

In [None]:
from datasets import load_dataset

# Build the path with os.path.join so it is valid whether or not DATA_DIR ends
# with a path separator (plain string concatenation relied on the trailing "/").
train_csv_path = os.path.join(DATA_DIR, "train.csv")

# Load the CSV as the "train" split
dataset = load_dataset("csv", data_files={"train": train_csv_path})

# Tokenize in batches and return torch tensors when the dataset is indexed
tokenized_dataset = dataset['train'].map(preprocess, batched=True).with_format("torch")

In [None]:
# Bare expression: when run as a notebook cell this displays the dataset's repr
tokenized_dataset

In [None]:
# Look at the first row only and print the Python type of its labels
first_row = next(iter(tokenized_dataset), None)
if first_row is not None:
    print(type(first_row['labels']))

GPT-2 Pretrained Model

In [None]:
from transformers import GPT2Config
# Start from the pretrained GPT-2 weights; hidden states are not requested in the outputs
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# The tokenizer was created with extra special tokens, so it can hold more
# entries than the pretrained embedding matrix. Resize the embeddings so every
# id the tokenizer can emit has a row (a no-op when the sizes already match).
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Use PyTorch's AdamW: the copy that shipped with transformers is deprecated
# and is not available in recent releases.
from torch.optim import AdamW

# eps and weight_decay are passed explicitly with the values the transformers
# implementation used by default, so the optimisation behaviour is unchanged
# (PyTorch's own defaults differ).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of 16 tokenized examples for training
train_loader = DataLoader(
    dataset=tokenized_dataset,
    batch_size=16,
    shuffle=True,
)

In [None]:
# Pull a single batch from the loader and dump its tensors for a sanity check
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    print(first_batch['input_ids'])

    # Print each tensor together with its Python type
    for field in ('input_ids', 'attention_mask', 'labels'):
        tensor = first_batch[field]
        print(tensor,type(tensor))

In [None]:
import torch
# CUDA: first whether a GPU is usable, then how many devices are visible
for cuda_probe in (torch.cuda.is_available, torch.cuda.device_count):
    print(cuda_probe())
# Apple-silicon (MPS) backend availability
print("MPS Available (Mac Only):", torch.backends.mps.is_available())

In [None]:
from tqdm import tqdm
import torch

# Pick the best available device: CUDA first, then Apple MPS, then CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

# Loop-invariant: read the model's vocabulary size once, not on every step
vocab_size = model.config.vocab_size

model.train()
for epoch in range(5):
    epoch_loss = 0.0
    num_batches = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
        # Move tensors to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Ensure ids are int64 before any comparison or lookup. No NaN/Inf
        # check follows: integer tensors cannot hold those values.
        if input_ids.dtype != torch.long:
            input_ids = input_ids.long()

        # Guard against ids outside the model's vocabulary: report them and
        # clamp so the step can proceed instead of failing on the lookup.
        if input_ids.max() >= vocab_size or input_ids.min() < 0:
            print(f"Out-of-range input_ids found! Max: {input_ids.max()}, Min: {input_ids.min()}")
            input_ids = input_ids.clamp(0, vocab_size - 1)

        # Forward pass, backward pass, parameter update
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch; the last batch alone is a noisy
    # signal, and `loss` would not even exist if the loader yielded nothing.
    if num_batches:
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss / num_batches}")
    else:
        print(f"Epoch {epoch + 1}: no batches in train_loader")

In [None]:
# Single output folder shared by the fine-tuned weights and the tokenizer files
save_directory = "./gpt_grammar_correction_model"

# Persist the model first, then the tokenizer, into that folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

In [None]:
def correct_sentence(model, tokenizer, input_sentence, device=None):
    """
    Generate a corrected sentence from an input sentence.

    Args:
        model: Trained transformer model exposing ``generate``.
        tokenizer: Tokenizer corresponding to the model.
        input_sentence (str): Sentence to correct.
        device: Device to run inference on (e.g. "cuda", "cpu", or a
            ``torch.device``). When None, the device the model's parameters
            already live on is used, so inputs and model always match.

    Returns:
        str: Decoded text of the first generated sequence, with special
        tokens removed.
    """
    if device is None:
        # Follow the model rather than assuming CUDA is present
        device = next(model.parameters()).device

    model.eval()  # switch the model to evaluation mode
    inputs = tokenizer(input_sentence, return_tensors="pt", max_length=128, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            num_beams=5,  # Beam search with 5 beams
            early_stopping=True,
            # Pass the padding id explicitly instead of relying on generate()'s fallback
            pad_token_id=tokenizer.pad_token_id,
        )
    # NOTE(review): for a decoder-only model the generated sequence presumably
    # starts with the prompt tokens, so the returned text would include the
    # input sentence; confirm whether the prompt should be stripped.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a sentence
input_sentence = "their going to the store"
corrected_sentence = correct_sentence(model, tokenizer, input_sentence, device=device)
print("Input Sentence:", input_sentence)
print("Corrected Sentence:", corrected_sentence)