In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import random
import re


In [19]:
text = ""
with open("data/data.txt", "r") as file:
    text = file.read()
text = re.sub(r'[^\w\s]', '', text)
text = text.lower()  

sentences = [line.strip() for line in text.split('\n') if line.strip()]

print(f"Number of sentences: {len(sentences)}")
print(f"Example sentence: {sentences[0:2]}")



Number of sentences: 724
Example sentence: ['the quick brown fox jumps over the lazy dog', 'my mum tries to be cool by saying that she likes all the same things that i do']


In [20]:
def reverse_words_in_sentence(sentence):
    return " ".join(word[::-1] for word in sentence.split())
pairs = [(s, reverse_words_in_sentence(s)) for s in sentences]
print("Example pair:")
print(random.choice(pairs))

all_words = set()
for inp, out in pairs:
    all_words.update(inp.split())
    all_words.update(out.split())
word_list = sorted(all_words)

word2idx = {w: i+2 for i, w in enumerate(word_list)}  # reserve 0 and 1 for special tokens
word2idx['<pad>'] = 0
word2idx['<unk>'] = 1
word2idx['<sos>'] = len(word2idx)
word2idx['<eos>'] = len(word2idx)
idx2word = {i: w for w, i in word2idx.items()}
print(f"Vocabulary size: {len(word2idx)}")
print("Sample words with indices:", list(word2idx.items())[:10])

Example pair:
('he decided to fake his disappearance to avoid jail', 'eh dediced ot ekaf sih ecnaraeppasid ot diova liaj')
Vocabulary size: 5208
Sample words with indices: [('001', 2), ('005', 3), ('01', 4), ('1', 5), ('10', 6), ('100', 7), ('1111', 8), ('1234', 9), ('17thcentury', 10), ('18', 11)]


In [21]:
def sentence_to_indices(sentence, word2idx):
    return [word2idx.get(word, word2idx['<unk>']) for word in sentence.split()]

def indices_to_sentence(indices, idx2word):
    return ' '.join(idx2word.get(idx, '<unk>') for idx in indices)

sample_sentence = pairs[0][0]
indices = sentence_to_indices(sample_sentence, word2idx)
print("Indices:", indices)

recovered_sentence = indices_to_sentence(indices, idx2word)
print("Recovered:", recovered_sentence)


Indices: [4428, 3262, 335, 1698, 2319, 3044, 4428, 2453, 1028]
Recovered: the quick brown fox jumps over the lazy dog


In [22]:
class ReverseWordsDataset(Dataset):
    def __init__(self, pairs, word2idx):
        self.pairs = pairs
        self.word2idx = word2idx

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        inp_sentence, out_sentence = self.pairs[idx]
        inp_indices = [self.word2idx.get(w, self.word2idx['<unk>']) for w in inp_sentence.split()]
        
        # Add <sos> and <eos> to target
        out_indices = [self.word2idx['<sos>']] + [self.word2idx.get(w, self.word2idx['<unk>']) for w in out_sentence.split()] + [self.word2idx['<eos>']]
        
        return torch.tensor(inp_indices, dtype=torch.long), torch.tensor(out_indices, dtype=torch.long)


In [23]:
def collate_fn(batch):
    # batch is list of tuples (input_tensor, output_tensor)
    inputs, outputs = zip(*batch)

    # Pad sequences to the max length in the batch (padding value = 0 for <pad>)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    outputs_padded = pad_sequence(outputs, batch_first=True, padding_value=0)

    return inputs_padded, outputs_padded

dataset = ReverseWordsDataset(pairs, word2idx)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)





In [24]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)  # [batch, src_len, emb_dim]
        outputs, hidden = self.rnn(embedded)
        return hidden  # [n_layers, batch, hid_dim]
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden):
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.embedding(input)  # [batch_size, 1, emb_dim]
        output, hidden = self.rnn(embedded, hidden)  # output: [batch_size, 1, hid_dim]
        prediction = self.fc_out(output.squeeze(1))  # [batch_size, output_dim]
        return prediction, hidden
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        hidden = self.encoder(src)
        input = trg[:, 0]  # First token (<sos>)

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            input = trg[:, t] if teacher_force else output.argmax(1)

        return outputs
    def translate(self, src_tensor, max_len, sos_idx, eos_idx):
        self.eval()
        with torch.no_grad():
            hidden = self.encoder(src_tensor)  # [n_layers, batch, hid_dim]
            input_token = torch.LongTensor([sos_idx]).to(self.device)  # start token

            translated_tokens = [sos_idx]

            for _ in range(max_len):
                output, hidden = self.decoder(input_token, hidden)  # output shape: [batch_size=1, output_dim]
                pred_token = output.argmax(1).item()
                translated_tokens.append(pred_token)

                if pred_token == eos_idx:
                    break

                input_token = torch.LongTensor([pred_token]).to(self.device)

            return translated_tokens


In [25]:
INPUT_DIM = len(word2idx)
OUTPUT_DIM = len(word2idx)

ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS)

model = Seq2Seq(encoder, decoder, device).to(device)

PAD_IDX = word2idx['<pad>']
print(PAD_IDX)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

0


In [32]:
import time
import torch
import torch.nn.functional as F

def train(model, dataloader, optimizer, criterion, device, clip=1.0):
    model.train()
    epoch_loss = 0

    for src, trg in dataloader:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio=0.5)  # output shape: (batch_size, trg_len, vocab_size)

        # Skip the first token (<sos>) for loss calculation
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)  # (batch_size * (trg_len-1), vocab_size)
        trg = trg[:, 1:].reshape(-1)  # (batch_size * (trg_len-1))

        loss = criterion(output, trg)
        loss.backward()

        # Clip gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in dataloader:
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg, teacher_forcing_ratio=0)  # no teacher forcing during evaluation

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


N_EPOCHS = 100

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, dataloader, optimizer, criterion, device)
    valid_loss = evaluate(model, dataloader, criterion, device)  # using same data as validation

    end_time = time.time()

    epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f}")
    print(f"\t Val. Loss: {valid_loss:.3f}")


Epoch: 01 | Time: 0m 3s
	Train Loss: 4.242
	 Val. Loss: 4.194
Epoch: 02 | Time: 0m 2s
	Train Loss: 3.968
	 Val. Loss: 3.995
Epoch: 03 | Time: 0m 3s
	Train Loss: 3.661
	 Val. Loss: 3.684
Epoch: 04 | Time: 0m 3s
	Train Loss: 3.426
	 Val. Loss: 3.414
Epoch: 05 | Time: 0m 4s
	Train Loss: 3.130
	 Val. Loss: 3.076
Epoch: 06 | Time: 0m 4s
	Train Loss: 2.779
	 Val. Loss: 2.808
Epoch: 07 | Time: 0m 3s
	Train Loss: 2.424
	 Val. Loss: 2.541
Epoch: 08 | Time: 0m 3s
	Train Loss: 2.103
	 Val. Loss: 2.215
Epoch: 09 | Time: 0m 4s
	Train Loss: 1.771
	 Val. Loss: 1.848
Epoch: 10 | Time: 0m 3s
	Train Loss: 1.442
	 Val. Loss: 1.537
Epoch: 11 | Time: 0m 3s
	Train Loss: 1.176
	 Val. Loss: 1.155
Epoch: 12 | Time: 0m 4s
	Train Loss: 0.932
	 Val. Loss: 0.843
Epoch: 13 | Time: 0m 4s
	Train Loss: 0.691
	 Val. Loss: 0.584
Epoch: 14 | Time: 0m 3s
	Train Loss: 0.480
	 Val. Loss: 0.401
Epoch: 15 | Time: 0m 3s
	Train Loss: 0.348
	 Val. Loss: 0.278
Epoch: 16 | Time: 0m 3s
	Train Loss: 0.246
	 Val. Loss: 0.201
Epoch: 1

In [29]:
def translate_sentence(model, sentence, word2idx, idx2word, device, max_len=50):
    model.eval()

    tokens = sentence.lower().split()
    src_indices = [word2idx.get(token, word2idx['<unk>']) for token in tokens]
    src_tensor = torch.LongTensor(src_indices).unsqueeze(1).to(device)

    trg_indexes = model.translate(src_tensor, max_len, word2idx['<sos>'], word2idx['<eos>'])
    translated_tokens = [idx2word[i] for i in trg_indexes[1:-1]]  # skip <sos> and <eos>

    return ' '.join(translated_tokens)


In [33]:
# Suppose you have a source sentence as a list of tokens (words):
src_sentence = ["hello", "how", "are", "you"]

# Convert it to indices using your word2idx dictionary
src_indexes = [word2idx.get(tok, word2idx['<unk>']) for tok in src_sentence]

# Add batch dimension (1, seq_len) and convert to tensor
src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)  # shape: [1, seq_len]

max_len = 50  # maximum length of translation output
sos_idx = word2idx['<sos>']
eos_idx = word2idx['<eos>']

# Call translate
translated_indices = model.translate(src_tensor, max_len, sos_idx, eos_idx)

# Convert indices back to words (ignoring sos and eos tokens)
translated_words = [idx2word[idx] for idx in translated_indices if idx not in (sos_idx, eos_idx)]

print("Translated sentence:", ' '.join(translated_words))


Translated sentence: yffulf yffulf sesu segdew sesu sesu sesu sââãnosrep yrneh egaruoc ti
