In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the two single-column CSVs (no header row): the ciphered sentences and
# their plain-English counterparts. The generator is consumed in order by the
# tuple unpacking, so the cipher file is read first, then the plaintext file.
df_cipher, df_normal = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '/kaggle/input/dataset-cipher/cipher_english.csv',
        '/kaggle/input/dataset-cipher/normal_english.csv',
    )
)

In [2]:
# Column 0 is the only column read from each file: ciphertext becomes the model
# input (X) and the matching plaintext becomes the target (Y).
X, Y = df_cipher[0], df_normal[0]

In [3]:
# Preview the ciphertext inputs (pandas abbreviates the display to head and tail rows).
X

0          Twwr gu qmwffuq nrcw d onxnrs zwmutc, glumu bw...
1          Occvog hlhvgoxulc fklnxw koxm (2011 max., TUO ...
2          Pqkd qld ces pi pqe tewezp olzzkzj ir Zeizkwd ...
3          As vwwajaes, jclgl qekedg ildls'j qesraslw je ...
4          O hobnwj bxbkxr ur o ysqqure qxryud boj yeoj c...
                                 ...                        
1534694    Pujh wujej cbjce as wuj obcnh cbj omaxvji, wuj...
1534695    Weww'w rseg aw psxq xs cszyhikx xqi cdiggakrw ...
1534696    JSEARVZCEJRS: Bszcybpejrs jt p ebcgsjqzb hgjcg...
1534697    Srp lsedg pwrpewg wygzrrejgzrm (SLPW) emz snus...
1534698    Cgbf msqcg cgj IQD zboo rccrng cdrnhbqa nsoord...
Name: 0, Length: 1534699, dtype: object

In [4]:
# Preview the plaintext targets (pandas abbreviates the display to head and tail rows).
Y

0          Soon we dropped into a living forest, where co...
1          Annual population growth rate (2011 est., CIA ...
2          This has led to the recent banning of Neonics ...
3          In addition, these colors weren't confined to ...
4          A family member or a support person may stay w...
                                 ...                        
1534694    When these areas of the brain are blocked, the...
1534695    Sass's goal is both to document the dwellings ...
1534696    INTRODUCTION: Enucleation is a technique which...
1534697    Arc fault circuit interrupters (AFCI) use adva...
1534698    This month the DNR will attach tracking collar...
Name: 0, Length: 1534699, dtype: object

In [5]:
# Keep only the first third of the corpus to bound tokenization/training cost.
# Slice from the source frames rather than from X/Y themselves so that
# re-running this cell always yields the same subset instead of shrinking the
# data by another factor of three each time.
X = df_cipher[0].iloc[: len(df_cipher) // 3]
Y = df_normal[0].iloc[: len(df_normal) // 3]
assert len(X) == len(Y), "cipher and plaintext columns must stay aligned row-for-row"

In [6]:
# Hold out 25% of the (cipher, plaintext) pairs for evaluation. The fixed
# random_state makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [None]:
from transformers import T5Tokenizer
# Load the tokenizer that belongs to the 't5-base' checkpoint; it is used for
# both the ciphertext inputs and the plaintext targets below.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [10]:
# Tokenize the training inputs and then the training targets with identical
# settings: sequences longer than 256 tokens are truncated and each call pads
# its batch. Unpacking consumes the generator in order (inputs, then targets).
train_encodings, train_labels = (
    tokenizer(list(texts), truncation=True, max_length=256, padding=True)
    for texts in (X_train, y_train)
)

In [11]:
# Tokenize the held-out inputs and then the held-out targets with the same
# settings used for the training split (truncate at 256 tokens, pad each call).
test_encodings, test_labels = (
    tokenizer(list(texts), truncation=True, max_length=256, padding=True)
    for texts in (X_test, y_test)
)

In [12]:
from torch.utils.data import Dataset
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [13]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenized source texts with tokenized targets.

    Args:
        encodings: Mapping of field name -> per-example sequences for the source
            texts. Every field is converted to a tensor and returned under the
            same key.
        labels: Mapping for the target texts. It must contain ``'input_ids'``.
            When it also contains ``'attention_mask'``, target positions whose
            mask is 0 (padding) are replaced by ``IGNORE_INDEX`` so that the
            training loss is not computed on padding tokens.
    """

    # Label value skipped by PyTorch's cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label_ids = torch.tensor(self.labels['input_ids'][idx])
        # Without this masking, padded target positions keep the pad token id and
        # the model is trained (and scored) on predicting padding.
        if 'attention_mask' in self.labels:
            is_real_token = torch.tensor(self.labels['attention_mask'][idx]).bool()
            label_ids = label_ids.masked_fill(~is_real_token, self.IGNORE_INDEX)
        item['labels'] = label_ids
        return item

    def __len__(self):
        """Number of examples, taken from the target ``'input_ids'``."""
        return len(self.labels['input_ids'])

# Wrap the tokenized inputs/targets of each split in a CustomDataset.
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [14]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
# Load the pretrained 't5-base' sequence-to-sequence model and move it to the
# selected device (GPU when available, otherwise CPU).
model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

In [16]:
import os

# Set before the Trainer is created; presumably this keeps the Trainer from
# starting a Weights & Biases run (which would prompt for an API key) — confirm
# against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
!pip install evaluate

In [18]:
import evaluate
import numpy as np

# Load the "accuracy" metric from the `evaluate` library (the builder script is
# downloaded when it is not already cached).
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Token-level accuracy over the label positions that are not ignored.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be
            raw logits of shape ``labels.shape + (vocab,)``, a tuple whose first
            element is those logits (sequence-to-sequence models return extra
            outputs after the logits), or already-decoded token ids with the
            same shape as ``labels``.

    Returns:
        ``{"accuracy": float}`` — the fraction of non-ignored label positions
        whose predicted token id equals the label. Positions labelled ``-100``
        (the loss ignore index) are excluded; if every position is ignored the
        accuracy is reported as ``0.0``.
    """
    predictions, labels = eval_pred

    # Multi-output models hand back (logits, <other tensors>); only the logits matter.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Reduce logits to token ids; leave inputs that are already ids untouched.
    if predictions.ndim == labels.ndim + 1:
        predictions = np.argmax(predictions, axis=-1)

    # Boolean-mask indexing also flattens the arrays, so accuracy is computed
    # per token rather than per (batch, sequence) row.
    scored = labels != -100
    if not scored.any():
        return {"accuracy": 0.0}

    accuracy = float(np.mean(predictions[scored] == labels[scored]))
    return {"accuracy": accuracy}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule: a single pass over the training split.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    weight_decay=0.01,
    # Logging, evaluation and checkpointing cadence.
    logging_steps=1000,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): save_steps looks unused while save_strategy is "epoch" — confirm.
    save_steps=1000,
    save_total_limit=2,
    # Reload the checkpoint with the best "accuracy" once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# NOTE(review): evaluation appears to collect the model outputs for the whole
# eval split before compute_metrics runs; with full-vocabulary logits that may
# not fit in memory — consider Trainer's `preprocess_logits_for_metrics` hook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Last expression of the cell, so the final evaluation metrics are displayed.
trainer.evaluate()