## Loading data

In [17]:
# Load the training pairs. Each CSV row holds a source word followed by its
# target spelling (presumably romanized -> Tamil script; confirm against the
# dataset).
# The encoding is passed explicitly because open() otherwise falls back to a
# platform-dependent default, which can fail or garble non-ASCII text.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('data/aksharantar_sampled/tam/tam_train.csv', encoding='utf-8') as f:
    data_pairs = f.readlines()
# Split every row once and reuse the fields for both columns.
pair_fields = [pair.split(',') for pair in data_pairs]
# Source words are lower-cased; target words are only stripped of whitespace.
data_given = [fields[0].strip().lower() for fields in pair_fields]
data_target = [fields[1].strip() for fields in pair_fields]
len(data_given), len(data_target)

(51200, 51200)

## Building the alphabet

In [27]:
class Alphabet:
    """Two-way mapping between letters and integer indices.

    Indices 0, 1 and 2 are reserved for the 'SOW', 'EOW' and 'UNK' entries of
    ``index_to_letter``; those reserved entries are not keys of
    ``letter_to_index``. Letters added later receive consecutive indices
    starting at 3.
    """

    def __init__(self) -> None:
        # Reserved entries occupy the first three slots of the index table.
        self.index_to_letter = ['SOW', 'EOW', 'UNK']
        self.letter_to_index = {}
        self.letter_count = 3

    def addLetter(self, letter: str) -> None:
        """Register ``letter`` under the next free index; known letters are ignored."""
        if letter in self.letter_to_index:
            return
        self.letter_to_index[letter] = self.letter_count
        self.index_to_letter.append(letter)
        self.letter_count += 1

In [33]:
# Build one alphabet per side of the data: source words and target words.
eng_alphabet = Alphabet()
tam_alphabet = Alphabet()
for alphabet, words in ((eng_alphabet, data_given), (tam_alphabet, data_target)):
    for word in words:
        # Iterating a str yields one code point at a time, so each distinct
        # code point becomes its own alphabet entry.
        for letter in word:
            alphabet.addLetter(letter)
# Sizes include the three reserved entries every Alphabet starts with.
print(eng_alphabet.letter_count, tam_alphabet.letter_count)

29 49


## Seq2Seq model

In [54]:
import torch
from torch import nn
from torch.functional import F

class Encoder(nn.Module):
    """Embed one input symbol and advance a recurrent layer by a single step.

    Args:
        input_size: Number of rows in the embedding table (distinct input
            symbols).
        hidden_size: Width of both the embedding vectors and the recurrent
            state.
        cell_type: Recurrent layer class to instantiate, e.g. ``nn.RNN``,
            ``nn.GRU`` or ``nn.LSTM``. It is called with ``input_size``,
            ``hidden_size`` and ``num_layers`` keyword arguments.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        cell_type: type = nn.RNN,
        num_layers: int = 1
    ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.encoder = cell_type(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)

    def forward(self, x, hidden=None):
        """Run one encoding step.

        Args:
            x: Index tensor for a single symbol (its embedding is reshaped to
                one time step of one sequence).
            hidden: Previous recurrent state in whatever form ``cell_type``
                expects. ``None`` lets the recurrent layer start from its own
                default (all-zero) state, so the first step needs no
                hand-built tensor.

        Returns:
            ``(output, hidden)`` exactly as returned by the recurrent layer.
        """
        # Shape the embedding as (seq_len=1, batch=1, features), the layout the
        # recurrent layer expects when batch_first is not set.
        embedded = self.embedding(x).reshape(1, 1, -1)
        output, hidden = self.encoder(embedded, hidden)
        return output, hidden
 

In [55]:
class Decoder(nn.Module):
    """Single-step decoder: embedding -> ReLU -> recurrent layer -> log-probabilities.

    Args:
        hidden_size: Width of the embedding vectors and of the recurrent state.
        output_size: Number of output symbols; used both as the embedding
            table size and as the width of the final projection.
        cell_type: Recurrent layer class (not an instance), called with
            ``input_size``, ``hidden_size`` and ``num_layers`` keyword
            arguments.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(
        self,
        hidden_size: int,
        output_size: int,
        cell_type: nn.Module = nn.RNN,
        num_layers: int = 1
    ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=output_size,
            embedding_dim=hidden_size,
        )
        self.decoder = cell_type(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
        self.out = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Decode one step.

        Args:
            x: Index tensor for a single symbol.
            hidden: Recurrent state passed straight to the recurrent layer.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` is the log-softmax of
            the projected first time step and ``hidden`` is the updated state.
        """
        # One time step of one sequence: (seq_len=1, batch=1, features).
        embedded = self.embedding(x).reshape(1, 1, -1)
        activated = F.relu(embedded)
        step_output, hidden = self.decoder(activated, hidden)
        # Drop the time axis before projecting so LogSoftmax(dim=1) normalises
        # over the output symbols.
        scores = self.out(step_output[0])
        return self.softmax(scores), hidden
