In [1]:
import pandas as pd

# Read the tab-separated sentence-pair file. header=None means the first line
# is treated as data, so the three column names are supplied explicitly.
df = pd.read_csv(
    "spa-eng/spa.txt",
    sep="\t",
    header=None,
    names=["English", "Spanish", "Attribution"],
    encoding="utf-8",
)

# Plain-text preview of the first rows.
print(df.head())


  English  Spanish                                        Attribution
0     Go.      Ve.  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1     Go.    Vete.  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2     Go.    Vaya.  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3     Go.  Váyase.  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4     Hi.    Hola.  CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [2]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(139013, 3)

In [3]:
# Number of single-space-separated pieces in every English sentence.
l_eng = [len(sentence.split(" ")) for sentence in df['English']]

# Length of the longest English sentence, measured in those pieces.
max(l_eng)

70

In [4]:
# Number of single-space-separated pieces in every Spanish sentence.
l_spa = [len(sentence.split(" ")) for sentence in df['Spanish']]

# Length of the longest Spanish sentence, measured in those pieces.
max(l_spa)

68

In [5]:
import re
# Lower-cased copies of the English and Spanish columns; `df` itself is left
# untouched, and each element is passed through str.lower individually.
en_sent = df['English'].map(str.lower)
sp_sent = df['Spanish'].map(str.lower)


In [6]:
# Delete commas, periods, and both the regular and inverted question and
# exclamation marks from every sentence. The pattern is compiled once and
# shared by both languages.
_punctuation = re.compile(r'[,.!?¿¡]')
en_sent = en_sent.apply(lambda text: _punctuation.sub('', text))
sp_sent = sp_sent.apply(lambda text: _punctuation.sub('', text))

In [7]:
def _with_boundary_tokens(sentence):
    """Return `sentence` wrapped in space-separated <sos> / <eos> markers.

    A sentence that already starts with '<sos>' is returned unchanged, so
    re-running this cell on the same kernel does not stack extra markers.
    """
    if sentence.startswith('<sos>'):
        return sentence
    # The surrounding spaces keep the markers as separate whitespace-delimited
    # pieces; without them the text reads e.g. '<sos>hola<eos>' as one piece.
    # NOTE(review): assumes the project Encoder tokenizes on whitespace —
    # confirm against encoder.py.
    return '<sos> ' + sentence + ' <eos>'


# Mark the start and end of every Spanish (target-side) sentence.
sp_sent = sp_sent.apply(_with_boundary_tokens)

In [8]:
from encoder import Encoder
# Both encoders are constructed with the same max_len argument.
MAX_SEQ_LEN = 100

# One Encoder per language, built from the preprocessed sentence Series.
en_encoder, sp_encoder = (
    Encoder(sentences, max_len=MAX_SEQ_LEN) for sentences in (en_sent, sp_sent)
)

In [9]:
# Run the English encoder's vocabulary-building step, then display the size of
# its `vocab` attribute.
# NOTE(review): build_vocab() lives in the project-local encoder module;
# presumably it populates `vocab` from the sentences given at construction —
# confirm there.
en_encoder.build_vocab()
len(en_encoder.vocab)

10000

In [10]:
# Run the Spanish encoder's vocabulary-building step, then display the size of
# its `vocab` attribute.
# NOTE(review): build_vocab() lives in the project-local encoder module;
# presumably it populates `vocab` from the sentences given at construction —
# confirm there.
sp_encoder.build_vocab()
len(sp_encoder.vocab)

10000

In [11]:
# Encode every English sentence with the English encoder, preserving order.
en_encoded = [en_encoder.encode_text(sentence) for sentence in en_sent.values]

In [12]:
# Encode every Spanish sentence with the Spanish encoder, preserving order.
sp_encoded = [sp_encoder.encode_text(sentence) for sentence in sp_sent.values]

In [13]:
import torch
from model import Transformer

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the project-local Transformer and move it to the chosen device.
# NOTE(review): the positional arguments are unnamed magic numbers. The two
# 10000s match the vocabulary sizes displayed earlier and 100 matches the
# encoders' max_len, but the meaning of each position must be confirmed against
# model.py — consider passing them by keyword.
model = Transformer(10000, 10000, 128, 2, 6, 2048, 100, 0.1).to(device)


In [14]:
import torch
import torch.nn as nn
# Targets equal to 0 are excluded from the loss.
# NOTE(review): presumably 0 is the padding id produced by the Encoder — confirm.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# as_tensor accepts both the original Python lists and tensors from a previous
# run of this cell (no copy or warning on re-run). Class-index targets for
# CrossEntropyLoss must be int64, so the dtype is stated explicitly.
en_encoded = torch.as_tensor(en_encoded, dtype=torch.long)
sp_encoded = torch.as_tensor(sp_encoded, dtype=torch.long)
batch_size = 128

num_samples = en_encoded.size(0)
# Ceiling division: the loop below also processes the final partial batch, so
# the mean loss has to be taken over that many steps, not the floored count.
num_batches = (num_samples + batch_size - 1) // batch_size

# Make sure dropout etc. are in training mode even if the model was switched to
# eval() earlier in the session.
model.train()

for epoch in range(10):
    total_loss = 0.0

    for i in range(0, num_samples, batch_size):
        # Slice one mini-batch and move it to the model's device.
        src_batch = en_encoded[i:i + batch_size].to(device)
        tgt_batch = sp_encoded[i:i + batch_size].to(device)

        # Teacher forcing: the decoder sees the target without its last position...
        tgt_input = tgt_batch[:, :-1]

        # Model output, (batch, seq, vocab_size)
        output = model(src_batch, tgt_input)

        # ...and is trained to predict the target shifted left by one position.
        tgt_output = tgt_batch[:, 1:].contiguous().view(-1)

        # Flatten to (batch * seq, vocab) using the model's own output width
        # instead of a hard-coded vocabulary size.
        output = output.contiguous().view(-1, output.size(-1))

        loss = criterion(output, tgt_output)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        print(f"progress:{i} / {num_samples}")

    print(f"epoch {epoch+1}, loss: {total_loss/num_batches:.4f}")


progress:0 / 139013
progress:128 / 139013
progress:256 / 139013
progress:384 / 139013
progress:512 / 139013
progress:640 / 139013
progress:768 / 139013
progress:896 / 139013
progress:1024 / 139013
progress:1152 / 139013
progress:1280 / 139013
progress:1408 / 139013
progress:1536 / 139013
progress:1664 / 139013
progress:1792 / 139013
progress:1920 / 139013
progress:2048 / 139013
progress:2176 / 139013
progress:2304 / 139013
progress:2432 / 139013
progress:2560 / 139013
progress:2688 / 139013
progress:2816 / 139013
progress:2944 / 139013
progress:3072 / 139013
progress:3200 / 139013
progress:3328 / 139013
progress:3456 / 139013
progress:3584 / 139013
progress:3712 / 139013
progress:3840 / 139013
progress:3968 / 139013
progress:4096 / 139013
progress:4224 / 139013
progress:4352 / 139013
progress:4480 / 139013
progress:4608 / 139013
progress:4736 / 139013
progress:4864 / 139013
progress:4992 / 139013
progress:5120 / 139013
progress:5248 / 139013
progress:5376 / 139013
progress:5504 / 13901

KeyboardInterrupt: 

In [None]:
import pickle

# state_dict must be *called*: passing the bound method would pickle the method
# object (and with it the whole model) instead of the parameter dictionary that
# load_state_dict() expects.
torch.save(model.state_dict(), "model.pth")

# Persist both encoders so the same objects can be restored alongside the weights.
with open("en_encoder.pkl", 'wb') as f:
    pickle.dump(en_encoder, f)
# NOTE(review): unlike "en_encoder.pkl" this file name has no ".pkl" suffix. It
# is kept unchanged here because a loader elsewhere may rely on it — rename
# both sides together if that is a typo.
with open("sp_encoder", 'wb') as f:
    pickle.dump(sp_encoder, f)