In [1]:
import pandas as pd

df = pd.read_csv("spa.txt", sep="\t", header=None, names=["English", "Spanish", "Attribution"], encoding="utf-8")

print(df.head())


  English  Spanish                                        Attribution
0     Go.      Ve.  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1     Go.    Vete.  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2     Go.    Vaya.  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3     Go.  Váyase.  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4     Hi.    Hola.  CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [2]:
df.shape

(139013, 3)

In [3]:
l_eng = []
for i in df['English']:
    l_eng.append(len(i.split(" ")))
max(l_eng)

70

In [4]:
l_spa = []
for i in df['Spanish']:
    l_spa.append(len(i.split(" ")))
max(l_spa)

68

In [5]:
import re
en_sent = df['English']
sp_sent = df['Spanish']
en_sent = en_sent.apply(lambda x: x.lower())
sp_sent = sp_sent.apply(lambda x: x.lower())


In [6]:
en_sent = en_sent.apply(lambda x: re.sub(r'[,.!?¿¡]', '', x))
sp_sent = sp_sent.apply(lambda x: re.sub(r'[,.!?¿¡]', '', x))

In [None]:
sp_sent = sp_sent.apply(lambda x: '<sos> ' + x + ' <eos>')

In [8]:
from encoder import Encoder
en_encoder = Encoder(en_sent,max_len = 100)
sp_encoder = Encoder(sp_sent,max_len = 100)

In [9]:
en_encoder.build_vocab()
len(en_encoder.vocab)

10000

In [10]:
sp_encoder.build_vocab()
len(sp_encoder.vocab)

10000

In [11]:
en_encoded = []
for i in en_sent.values:
    en_encoded.append(en_encoder.encode_text(i))

In [12]:
sp_encoded = []
for i in sp_sent.values:
    sp_encoded.append(sp_encoder.encode_text(i))

In [13]:
from model import Transformer
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(10000,10000,128,2,6,2048,100,0.1).to(device)


In [15]:
import torch
import torch.nn as nn
import time
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(),lr = 0.0001, betas = (0.9,0.98),eps = 1e-9)

en_encoded = torch.tensor(en_encoded)
sp_encoded = torch.tensor(sp_encoded)
batch_size = 128

num_samples = en_encoded.size(0)
num_batches = num_samples // batch_size

for epoch in range(50):
    total_loss = 0
    start = time.time()
    for i in range(0, num_samples, batch_size):
        src_batch = en_encoded[i:i+batch_size]
        tgt_batch = sp_encoded[i:i+batch_size]
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        # Teacher forcing: decoder input is tgt[:, :-1]
        tgt_input = tgt_batch[:, :-1]

        # Model output
        output = model(src_batch, tgt_input)   # (batch, seq, vocab_size)

        # Target shifted by 1
        tgt_output = tgt_batch[:, 1:].contiguous().view(-1)

        # Reshape output for loss
        output = output.contiguous().view(-1, 10000)

        loss = criterion(output, tgt_output)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"epoch {epoch+1}, loss: {total_loss/num_batches:.4f}")
    print(f"epoch took:{time.time()-start}")


epoch 1, loss: 6.3287
epoch took:293.3469748497009
epoch 2, loss: 4.8964
epoch took:293.32932710647583
epoch 3, loss: 4.1861
epoch took:293.36329650878906
epoch 4, loss: 3.6976
epoch took:293.1811864376068
epoch 5, loss: 3.3235
epoch took:293.6599051952362
epoch 6, loss: 3.0243
epoch took:293.19161581993103
epoch 7, loss: 2.7803
epoch took:293.5543887615204
epoch 8, loss: 2.5731
epoch took:293.2085003852844
epoch 9, loss: 2.4014
epoch took:293.5266287326813
epoch 10, loss: 2.2512
epoch took:293.56275725364685
epoch 11, loss: 2.1240
epoch took:293.03693985939026
epoch 12, loss: 2.0082
epoch took:293.4919171333313
epoch 13, loss: 1.9098
epoch took:293.06230664253235
epoch 14, loss: 1.8181
epoch took:292.74614810943604
epoch 15, loss: 1.7378
epoch took:292.6371536254883
epoch 16, loss: 1.6623
epoch took:292.76828622817993
epoch 17, loss: 1.5991
epoch took:292.78819847106934
epoch 18, loss: 1.5363
epoch took:292.7403292655945
epoch 19, loss: 1.4799
epoch took:292.693772315979
epoch 20, los

In [17]:
torch.save(model.state_dict(),"model.pth")
import pickle
with open("en_encoder.pkl",'wb') as f:
    pickle.dump(en_encoder,f)
with open("sp_encoder.pkl",'wb') as f:
    pickle.dump(sp_encoder,f)