In [1]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from torch.cuda.amp import autocast, GradScaler

# Custom Dataset class for grammar correction
class GrammarCorrectionDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_length=128, model_type='t5'):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.model_type = model_type
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        input_text = str(self.inputs[idx])
        target_text = str(self.targets[idx])
        
        # Tokenize inputs
        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        # Tokenize targets
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# Function to train and evaluate T5 model
def train_and_evaluate(model, tokenizer, train_loader, val_loader, device, model_name, num_epochs=1, accum_steps=2):
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scaler = GradScaler()
    
    # Function to calculate cosine similarity
    def calculate_cosine_similarity(predicted, target, model, tokenizer):
        with torch.no_grad():
            pred_emb = model.get_encoder()(input_ids=predicted.unsqueeze(0).to(device)).last_hidden_state.mean(dim=1).cpu().numpy()
            target_emb = model.get_encoder()(input_ids=target.unsqueeze(0).to(device)).last_hidden_state.mean(dim=1).cpu().numpy()
        return cosine_similarity(target_emb,pred_emb )[0][0]

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        total_train_acc = 0
        optimizer.zero_grad()
        for i, batch in enumerate(tqdm(train_loader, desc=f"{model_name} Training Epoch {epoch+1}")):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            with autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                loss = outputs.loss / accum_steps
                train_loss += loss.item() * accum_steps
            
            scaler.scale(loss).backward()
            
            if (i + 1) % accum_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            total_train_acc += (preds == labels).sum().item() / torch.numel(labels)
        
        avg_train_loss = train_loss / len(train_loader)
        avg_train_acc = total_train_acc / len(train_loader)
        print(f"{model_name} Epoch {epoch+1} Training Loss: {avg_train_loss:.4f}")
        print(f"{model_name} Epoch {epoch+1} Training Accuracy: {avg_train_acc:.4f}")
        
        # Validation
        model.eval()
        val_loss = 0
        total_val_acc = 0
        val_similarities = []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"{model_name} Validation Epoch {epoch+1}"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                with autocast():
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                
                val_loss += outputs.loss.item()
                
                preds = torch.argmax(outputs.logits, dim=-1)
                total_val_acc += (preds == labels).sum().item() / torch.numel(labels)
                
                for i in range(input_ids.size(0)):
                    pred_ids = model.generate(
                        input_ids[i].unsqueeze(0),
                        max_length=128,
                        num_beams=5,
                        early_stopping=True,
                        no_repeat_ngram_size=2
                    )
                    similarity = calculate_cosine_similarity(pred_ids[0], labels[i], model, tokenizer)
                    val_similarities.append(similarity)
        
        avg_val_loss = val_loss / len(val_loader)
        avg_val_acc = total_val_acc / len(val_loader)
        avg_val_similarity = np.mean(val_similarities) if val_similarities else 0
        print(f"{model_name} Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")
        print(f"{model_name} Epoch {epoch+1} Validation Accuracy: {avg_val_acc:.4f}")
        print(f"{model_name} Epoch {epoch+1} Validation Cosine Similarity: {avg_val_similarity:.4f}")
    
    # Save fine-tuned model
    model.save_pretrained(f"./fine_tuned_{model_name.lower()}")
    tokenizer.save_pretrained(f"./fine_tuned_{model_name.lower()}")

# Load dataset
n_rows = 10000
df = pd.read_csv('/kaggle/input/c4200m/C4_200M.tsv-00007-of-00010', sep='\t', nrows=n_rows)
df.columns = ["input", "target"]

# Display sample
print("Sample input:", df["input"][150])
print("Sample target:", df["target"][150])

# Initialize T5
t5_tokenizer = T5Tokenizer.from_pretrained("t5-large")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-large")

# Prepare datasets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# T5 datasets
t5_train_dataset = GrammarCorrectionDataset(
    inputs=train_df["input"].tolist(),
    targets=train_df["target"].tolist(),
    tokenizer=t5_tokenizer,
    model_type='t5'
)
t5_val_dataset = GrammarCorrectionDataset(
    inputs=val_df["input"].tolist(),
    targets=val_df["target"].tolist(),
    tokenizer=t5_tokenizer,
    model_type='t5'
)

# Create data loaders
batch_size = 4
t5_train_loader = DataLoader(t5_train_dataset, batch_size=batch_size, shuffle=True)
t5_val_loader = DataLoader(t5_val_dataset, batch_size=batch_size)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t5_model.to(device)

# Train and evaluate T5 model
train_and_evaluate(t5_model, t5_tokenizer, t5_train_loader, t5_val_loader, device, "T5", accum_steps=2)

# Test T5 model
def test_model(model, tokenizer, text, model_type, device):
    model.eval()
    input_text = text
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True, no_repeat_ngram_size=2)
    corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected

# Test example
test_input = df["input"][550]
print("\nTest Example:")
print("Input:", test_input)
print("T5 Corrected Output:", test_model(t5_model, t5_tokenizer, test_input, 't5', device))
print("Target:", df["target"][550])

2025-05-10 10:36:42.825599: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746873403.005076      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746873403.056359      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Sample input: Fiber Optic Cable Cleaning Kits The complete solution for precision end face - Fiber optic cable cleaning.
Sample target: Fiber Optic Cable Cleaning Kits The complete solution for precision end-face fiber optic cable cleaning.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  scaler = GradScaler()
  with autocast():
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
T5 Training Epoch 1: 100%|██████████| 2250/2250 [18:55<00:00,  1.98it/s]


T5 Epoch 1 Training Loss: nan
T5 Epoch 1 Training Accuracy: 0.7303


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autoc

T5 Epoch 1 Validation Loss: nan
T5 Epoch 1 Validation Accuracy: 0.6629
T5 Epoch 1 Validation Cosine Similarity: 0.1801

Test Example:
Input: Zapier, the serbice which helps to sync data between web apps through trigger and actions has announced the launch of developer platform.
T5 Corrected Output: .., the service which helps to sync data between web apps through triggers and actions has announced the launch of developer platform.. Zapier is a web service that helps in syncing data among web applications through. and announcedbice which whichs synchronize data through actions actions actions. announced developer......... of.. Developer platform will help developers to create
Target: Zapier, the service that helps sync data between web apps through trigger and actions has announced the launch of its developer platform.
