In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the model-input texts; the CSV has no header row, so columns are numbered.
df_correct = pd.read_csv(
    filepath_or_buffer='/kaggle/input/autocorrect-2-data/partial_ciphertext-v2.csv',
    header=None,
)

In [None]:
from datasets import load_dataset

# Fetch the sentence corpus whose 'train' split supplies the target texts.
ds = load_dataset(path="agentlans/high-quality-english-sentences")

In [None]:
# Wrap the 'text' field of the training split in a single-column DataFrame.
df_plain = pd.DataFrame(data=ds['train']['text'])

In [None]:
# Keep only the leading rows of each frame so both have at most the same size.
# NOTE(review): the two frames are later split jointly, so rows are presumably
# paired by position — confirm that the CSV follows the dataset order.
row_limit = 650_000
df_correct = df_correct.iloc[:row_limit]
df_plain = df_plain.iloc[:row_limit]

In [None]:
# Hold out a quarter of the pairs for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_correct,
    df_plain,
    random_state=42,
    test_size=0.25,
)

In [None]:
from transformers import T5Tokenizer
# Tokenizer matching the 't5-base' checkpoint.
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path='t5-base')

In [None]:
# Flatten the training inputs: the first column of every row, as a plain list.
X_train1 = [row[0] for row in X_train.values.tolist()]

In [None]:
# Flatten the training targets: the first column of every row, as a plain list.
y_train1 = [row[0] for row in y_train.values.tolist()]

In [None]:
# Tokenize training inputs and targets with identical settings: sequences are
# cut at 256 tokens and padded to the longest sequence in each call.
train_encodings = tokenizer(
    X_train1,
    padding=True,
    truncation=True,
    max_length=256,
)
train_labels = tokenizer(
    y_train1,
    padding=True,
    truncation=True,
    max_length=256,
)

In [None]:
# Flatten the held-out inputs and targets the same way as the training split:
# the first column of every row, as plain lists.
X_test1 = [row[0] for row in X_test.values.tolist()]
y_test1 = [row[0] for row in y_test.values.tolist()]

In [None]:
# Tokenize held-out inputs and targets: sequences are cut at 256 tokens and
# padded to the longest sequence in each call.
test_encodings = tokenizer(
    X_test1,
    padding=True,
    truncation=True,
    max_length=256,
)
test_labels = tokenizer(
    y_test1,
    padding=True,
    truncation=True,
    max_length=256,
)

In [None]:
from torch.utils.data import Dataset
import torch

# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class CustomDataset(Dataset):
    """Pairs tokenized inputs with tokenized targets for seq2seq training.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            per-example sequence of values; every field is returned as a tensor.
        labels: Mapping that holds the target token ids under ``input_ids``
            and, optionally, their ``attention_mask``.
        ignore_index: Value written into padded label positions so that the
            loss skips them. Defaults to -100, the value ignored by PyTorch's
            cross-entropy loss.
    """

    def __init__(self, encodings, labels, ignore_index=-100):
        self.encodings = encodings
        self.labels = labels
        self.ignore_index = ignore_index

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``, including a ``labels`` entry."""
        item = {name: torch.tensor(values[idx]) for name, values in self.encodings.items()}
        target_ids = torch.tensor(self.labels['input_ids'][idx])
        # Padded target positions must not contribute to the loss. The label
        # attention mask marks them with 0, so use it instead of guessing the
        # tokenizer's pad id. Without a mask the ids are passed through as-is.
        if 'attention_mask' in self.labels:
            target_mask = torch.tensor(self.labels['attention_mask'][idx])
            target_ids = target_ids.masked_fill(target_mask == 0, self.ignore_index)
        item['labels'] = target_ids
        return item

    def __len__(self):
        """Number of examples, taken from the label ids."""
        return len(self.labels['input_ids'])

# Wrap each split's tokenized inputs and targets in a dataset object.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomDataset(encodings=test_encodings, labels=test_labels)

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
# Load the pretrained 't5-base' weights, then move the model to the chosen device.
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model = model.to(device)

In [None]:
import os

# Set the environment flag that turns off Weights & Biases reporting.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
!pip install evaluate

In [None]:
import evaluate
import numpy as np

# Load the accuracy metric from the `evaluate` library.
accuracy_metric = evaluate.load(path="accuracy")

def compute_metrics(eval_pred, ignore_ids=(-100, 0)):
    """Token-level accuracy over the label positions that are not padding.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be raw
            logits with a trailing vocabulary axis, already-selected token ids
            with the same shape as ``labels``, or a tuple whose first element
            is one of those (seq2seq models return extra outputs after the
            logits).
        ignore_ids: Label values excluded from the score. Defaults to -100
            (the loss ignore index) and 0 (the T5 pad token id), so padded
            positions are skipped whichever of the two marks them.

    Returns:
        ``{"accuracy": float}``; 0.0 when no label position is scored.
    """
    predictions, labels = eval_pred
    # Encoder-decoder models hand back (logits, encoder states, ...); only the
    # logits are needed here.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    # Raw logits have one more axis than the labels; reduce them to token ids.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)
    scored = ~np.isin(labels, list(ignore_ids))
    total = int(scored.sum())
    if total == 0:
        return {"accuracy": 0.0}
    correct = int((predictions[scored] == labels[scored]).sum())
    return {"accuracy": correct / total}

In [None]:
import traceback

# Build the configuration and the trainer OUTSIDE the try block: a mistake
# here (for example an argument name the installed transformers version does
# not accept) must fail loudly instead of uploading an untrained model.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000,
    # save_steps only applies to save_strategy="steps"; checkpoints are
    # written once per epoch with the strategy chosen below.
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch",
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# NOTE(review): evaluation collects full-vocabulary logits for the whole eval
# set before compute_metrics runs; consider `preprocess_logits_for_metrics`
# or a smaller eval set if memory becomes a problem.
try:
    trainer.train()
except (Exception, KeyboardInterrupt):
    # Best effort: if training is interrupted or crashes, report why and
    # upload the weights reached so far so the run is not lost.
    # NOTE(review): nothing is pushed when training finishes normally —
    # confirm that a later cell handles the successful case.
    traceback.print_exc()
    print("Training did not finish; pushing the current weights to the Hub.")
    model.push_to_hub("Cipher-AI/AutoCorrect-EN-v2")
    tokenizer.push_to_hub("Cipher-AI/AutoCorrect-EN-v2")