# In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import torch


# Load the tokenizer and the seq2seq model from the same pretrained
# checkpoint name so the vocabulary matches the model's embeddings.
_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(_checkpoint)


# Read the training pairs, one example per line in the form
# "<input>|||<output>". Blank lines are skipped; any other line that does not
# contain exactly one "|||" separator raises a ValueError naming the line.
training_data = []
with open("training_data.txt", "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        record = line.strip()
        if not record:
            # Tolerate empty lines instead of failing on the tuple unpacking.
            continue
        fields = record.split("|||")
        if len(fields) != 2:
            raise ValueError(
                f"training_data.txt line {line_number}: expected exactly one "
                f"'|||' separator, found {len(fields) - 1}: {record!r}"
            )
        input_text, output_text = fields
        training_data.append({"input": input_text, "output": output_text})


class LabelCorrectionDataset(Dataset):
    """Map-style dataset over a sequence of input/output text records.

    Each record is a mapping with "input" and "output" keys. Integer indexing
    returns a single example dict; slice indexing returns a list of example
    dicts, so ``dataset[i:i + n]`` produces a ready-made batch.
    """

    def __init__(self, data):
        """Keep a reference to *data*, the sequence of record mappings."""
        self.data = data

    def __len__(self):
        """Return the number of stored records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the example at *idx*, or a list of examples for a slice."""
        if isinstance(idx, slice):
            # self.data[idx] is a sub-sequence here, so each record has to be
            # converted individually rather than subscripted with "input".
            return [self._example(record) for record in self.data[idx]]
        return self._example(self.data[idx])

    @staticmethod
    def _example(record):
        """Build an example dict holding only the "input" and "output" fields."""
        return {"input": record["input"], "output": record["output"]}

# Wrap the parsed records in the Dataset subclass used by the training loop.
dataset = LabelCorrectionDataset(training_data)

# Tokenize data
def tokenize_batch(batch):
    """Tokenize a batch of example dicts into padded tensors for the model.

    Args:
        batch: Sequence of dicts, each with "input" and "output" strings.

    Returns:
        Dict with "input_ids" and "attention_mask" for the source texts and
        "labels" for the target texts. Padding positions in "labels" are set
        to -100 so that they are ignored when the loss is computed.
    """
    sources = tokenizer(
        [example["input"] for example in batch],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    targets = tokenizer(
        [example["output"] for example in batch],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    # Padding only exists to make the batch rectangular; without masking, the
    # model would also be trained to predict pad tokens after every target.
    labels = targets.input_ids.masked_fill(
        targets.input_ids == tokenizer.pad_token_id, -100
    )
    return {
        "input_ids": sources.input_ids,
        # Lets the encoder ignore the padding added to shorter inputs.
        "attention_mask": sources.attention_mask,
        "labels": labels,
    }

# Fine-tune
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Let DataLoader assemble the batches: it fetches examples one integer index
# at a time and hands each list of example dicts to tokenize_batch, instead of
# slicing the dataset by hand. shuffle=False keeps the file order.
loader = DataLoader(dataset, batch_size=4, shuffle=False, collate_fn=tokenize_batch)
for epoch in range(3):
    for batch_idx, tokenized in enumerate(loader):
        outputs = model(
            input_ids=tokenized["input_ids"],
            # Forward the padding mask when the collate function provides one;
            # .get() yields None otherwise, which is the model's default.
            attention_mask=tokenized.get("attention_mask"),
            labels=tokenized["labels"],
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}")


# Write the fine-tuned model and its tokenizer into the same directory.
_output_dir = "label_correction_model"
model.save_pretrained(_output_dir)
tokenizer.save_pretrained(_output_dir)