In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read both CSVs without a header row, so their columns receive integer labels
# and the first line of each file is treated as data.
df_cipher = pd.read_csv(
    '/kaggle/input/cipher-eng-alphabet2/cipher_text.csv',
    header=None,
)
df_normal = pd.read_csv(
    '/kaggle/input/cipher-eng-alphabet2/keys.csv',
    header=None,
)

In [4]:
# Column 0 of each frame: ciphertext strings become the inputs (X),
# the matching key strings become the targets (Y).
X, Y = df_cipher[0], df_normal[0]

In [7]:
# Show the ciphertext Series; pandas abbreviates the repr to the head and tail rows.
X

0          Xyyc rd zoyttdz scny a psgsch iyodxn, rqdod wy...
1          Ikkdiz mhmdziuwhk tyhnub yiua (2011 aeu., RWI ...
2          Dpsr phr kjl dz dpj qjujmd ohmmsmx za Mjzmsur ...
3          Xh jffxnxrh, ndvav yrpria uvivh'n yrhwxhvf nr ...
4          Z hztwgx tstcsf qf z ulooqfe osfuqb tzx uezx a...
                                 ...                        
1534694    Ydrk mdrnr bcrbn jg mdr ecbik bcr eljtzrf, mdr...
1534695    Izii'i ayzd li eypk py oyjfmwrp pkw oswddlrai ...
1534696    HNYUZXFRYHZN: Wnfriwvyhzn ho v ywrcnhgfw pchrc...
1534697    Mgb umewt bqgbeqt qjtoggeptogf (MUBQ) efo mrcm...
1534698    Zsow nbjzs zsm VJL qoii uzzuts zlutpojh tbiiul...
Name: 0, Length: 1534699, dtype: object

In [9]:
# Keep only the first half of each column.
# The slice is taken from the source frames rather than from X / Y themselves,
# so re-running this cell always yields the same half instead of halving the
# already-halved Series again.
X = df_cipher[0].iloc[: len(df_cipher) // 2]
Y = df_normal[0].iloc[: len(df_normal) // 2]

In [11]:
# Show the (halved) key Series; each row appears to be a 26-character key string.
Y

0         a.nekmvgfb.u.trlhwip.ycsod
1         eh.usbdoa.nkpw...c.gt.ifrl
2         f..tmpwa.eldn.bhrsi.cu.gvo
3         sgbh.d.nrau..t.lyov.weficp
4         wnb.trlfvh.uc.p.o.emsdiyga
                     ...            
767344    thriocsblgv.n.ewupakf..d.y
767345    ftce..s.i..ugpnromdalyvwh.
767346    m.aftew..kdlgrhyun..pcsboi
767347    rcm.lvf.endsait...o.pgyu.x
767348    oid....ngy.ulht.rxpesa.m.c
Name: 0, Length: 767349, dtype: object

In [12]:
# Hold out a quarter of the pairs for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [None]:
from transformers import T5Tokenizer
# Load the tokenizer published under the 't5-base' checkpoint name.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [15]:
# Tokenize the training split: ciphertext as model inputs, keys as targets.
# Sequences are truncated to at most 256 tokens and padded within each call.
train_encodings = tokenizer(
    list(X_train),
    padding=True,
    truncation=True,
    max_length=256,
)
train_labels = tokenizer(
    list(y_train),
    padding=True,
    truncation=True,
    max_length=256,
)

In [16]:
# Tokenize the held-out split with the same settings as the training split:
# truncate to at most 256 tokens and pad within each call.
test_encodings = tokenizer(
    list(X_test),
    padding=True,
    truncation=True,
    max_length=256,
)
test_labels = tokenizer(
    list(y_test),
    padding=True,
    truncation=True,
    max_length=256,
)

In [17]:
from torch.utils.data import Dataset
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [18]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping of field name (e.g. ``input_ids``) to a sequence of
            per-example token lists, as produced by calling the tokenizer on
            the input texts.
        labels: Mapping with the same layout for the target texts. Its
            ``input_ids`` become the training labels; its ``attention_mask``,
            when present, identifies which label positions are padding.

    Each item is a dict of tensors: every field of ``encodings`` plus a
    ``labels`` tensor in which padded positions are replaced by
    ``IGNORE_INDEX`` so that they do not contribute to the loss.
    """

    # Label value that the cross-entropy loss used by Hugging Face seq2seq
    # models skips.
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label_ids = torch.tensor(self.labels['input_ids'][idx])
        # Without this masking the model is also trained to predict padding
        # tokens. Using the label attention mask avoids hard-coding a pad id.
        if 'attention_mask' in self.labels:
            label_mask = torch.tensor(self.labels['attention_mask'][idx])
            label_ids = label_ids.masked_fill(label_mask == 0, self.IGNORE_INDEX)
        item['labels'] = label_ids
        return item

    def __len__(self):
        """Number of examples, taken from the label sequences."""
        return len(self.labels['input_ids'])

# Wrap the tokenized splits so they can be handed to the Trainer as datasets.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomDataset(encodings=test_encodings, labels=test_labels)

In [19]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
# Load the pretrained 't5-base' weights, then place the model on the selected device.
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model = model.to(device)

In [21]:
import os

# Set WANDB_DISABLED for this process; intended to keep the training run from
# reporting to Weights & Biases.
os.environ.update(WANDB_DISABLED="true")

In [22]:
%%capture
!pip install evaluate

In [None]:
import evaluate
import numpy as np

# Load the "accuracy" metric through the `evaluate` library.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute token-level accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be
            raw logits with a trailing vocabulary axis, a tuple whose first
            element is those logits (as seq2seq models return), or token ids
            that already have the same shape as ``labels``.

    Returns:
        ``{"accuracy": float}``: the fraction of label positions whose
        predicted token id equals the label. Positions labelled ``-100`` are
        excluded. Returns ``0.0`` when no position is left to score.
    """
    predictions, labels = eval_pred
    # Encoder-decoder models return several outputs; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits have one more axis (vocabulary) than the labels; collapse it to ids.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # -100 marks positions that are ignored by the loss and must not be scored.
    scored = labels != -100
    if not scored.any():
        return {"accuracy": 0.0}
    # Computed directly with numpy: the flat "accuracy" metric does not accept
    # (batch, sequence) shaped arrays.
    matches = predictions[scored] == labels[scored]
    return {"accuracy": float(matches.mean())}

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=1000,
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best "accuracy".
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): save_steps presumably has no effect while save_strategy is "epoch" — confirm.
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Wire the model, arguments, datasets and metric callback into a Trainer.
# The held-out split doubles as the evaluation set.
# NOTE(review): predictions for the whole evaluation set are presumably gathered
# before compute_metrics runs, which may be memory-heavy at this data size — confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Launch fine-tuning with the Trainer built in the preceding cell.
trainer.train()