In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

from tqdm import tqdm, trange

from indicnlp.tokenize import indic_tokenize

In [54]:
MAX_LENGTH = 40 # maximum length of sentences
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden
    
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(0)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights



class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, index):
        return self.source_sentences[index], self.target_sentences[index]


def pad_sequence(sequence, pad_value):
  # Padding function to add pad_value to sequences until they reach max_len
  for i in range(MAX_LENGTH - len(sequence)):
      sequence.append(pad_value)
  return sequence  


In [2]:

# get tokens from pre-processed files
with open('data/eng_tokens.txt', 'r') as f:
    tokens = f.readlines()
eng_tokens = []
for x in trange(len(tokens), desc='get english tokens...'):
    eng_tokens.append(tokens[x].strip('\n').split(' '))
print(eng_tokens[0])

with open('data/kan_tokens.txt', 'r', encoding='utf-8') as f:
    tokens = f.readlines()
kan_tokens = []
for x in trange(len(tokens), desc='get kannada tokens...'):
    kan_tokens.append(tokens[x].strip('\n').split(' '))
print(kan_tokens[0])

# get vocabulary
eng_vocab = set()
kan_vocab = set()
for i in eng_tokens:
  for j in i:
    eng_vocab.add(j)
eng_vocab = list(eng_vocab)

for i in kan_tokens:
  for j in i:
    kan_vocab.add(j)
kan_vocab = list(kan_vocab)

print(eng_vocab[:10])
print(kan_vocab[:10])

get english tokens...: 100%|██████████| 4093524/4093524 [00:44<00:00, 91227.85it/s] 


['<s>', 'Hes', 'a', 'scientist', '.', '</s>']


get kannada tokens...: 100%|██████████| 4093524/4093524 [04:59<00:00, 13662.89it/s] 


['<s>', 'ಇವರು', 'ಸಂಶೋಧಕ', 'ಸ್ವಭಾವದವರು', '.', '</s>']
['ACHARYA', 'Roulette', 'Bamleshwari', 'exclusivist', 'staging', 'Fairtrade', 'Teat', 'Jugaad', 'Ananantnag', '32.352']
['ಶೇಖರಿಸಿ', 'ಕಾರ್ಟಿಂಗ್', 'ಎಬಿಸಿಡಿಯನ್ನು', 'ನವಲ್\u200b', 'ಹೇಳಿದ್ದರೇ', 'ಬಿಟ್ಟುಕೊಡಿರಿ', 'ದೇಶದ್ರೋಹಕ್ಕೆ', 'ನನಗೆನೂ', 'ಡಿಬಗ್', 'ಕೇಂದ್ರದಿAದ']


In [4]:
# get index lists
eng_word2index = {word: index for index, word in enumerate(eng_vocab)}
kan_word2index = {word: index for index, word in enumerate(kan_vocab)}



In [14]:
print(kan_word2index['</s>'])

915327


In [16]:

eng_indices = [[eng_word2index[word] for word in sent] for sent in eng_tokens] 
kan_indices = [[kan_word2index[word] for word in sent] for sent in kan_tokens]


In [21]:
print(eng_indices[0])
print(eng_word2index.keys())


[39465, 5492, 162182, 227654, 142844, 149941]


TypeError: 'dict_keys' object is not subscriptable

In [22]:
print(eng_indices[1])

[39465, 284404, 181139, 5473, 72987, 75228, 111596, 3203, 217934, 283968, 170018, 148156, 43673, 119737, 74278, 105664, 283968, 81580, 256019, 178387, 176908, 197455, 14698, 161885, 85055, 149941]


In [26]:

eng_indices1 = [pad_sequence(sent, eng_word2index['</s>']) for sent in eng_indices[:10]]
kan_indices1 = [pad_sequence(sent, kan_word2index['</s>']) for sent in kan_indices[:10]]


In [27]:

batch_size = 32
print("kan",kan_indices1[:10])
print("eng",eng_indices1[:10])

dataset = TranslationDataset(kan_indices1[:10], eng_indices1[:10])
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for batch in dataloader:
    print(batch)
    break



kan [[786989, 457540, 492223, 207483, 537984, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327], [786989, 315690, 990778, 1421846, 350639, 223991, 179123, 1371837, 173821, 1082666, 428419, 95888, 315690, 315690, 977259, 1249782, 970673, 697608, 677901, 1238968, 315690, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327], [786989, 342898, 87511, 567903, 1049205, 537984, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327], [786989, 68855, 754679, 6

In [56]:


lr = 0.01
epochs = 5
hidden_size = 100
encoder = EncoderRNN(input_size=len(kan_vocab), hidden_size=hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=len(eng_vocab)).to(device)
optimizer = Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=lr)
criterion = nn.CrossEntropyLoss()


In [88]:

print("training begin.")

# Training loop
MODEL_SAVE_INTERVAL = 100 # save the model every so often
losses = [] # average loss per epoch
bar = trange(epochs, desc=f'')
for epoch in bar:
    epoch_loss = 0
    for i, (kan_batch,eng_batch) in enumerate(dataloader): # TO-DO - Need to pad the data
        eng_batch = torch.stack(eng_batch, dim=1)
        kan_batch = torch.stack(kan_batch, dim=1)
        print(kan_batch[0])

        eng_batch = eng_batch.to(device)
        kan_batch = kan_batch.to(device)

        optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(kan_batch)
        decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden, target_tensor=eng_batch)

        loss = criterion(decoder_outputs.view(-1, len(eng_vocab)), eng_batch.view(-1))
        epoch_loss += (loss.item()/len(eng_batch))
        loss.backward()
        optimizer.step()
    losses.append(epoch_loss)
    bar.set_description(f'loss: {epoch_loss}')

    if epoch % MODEL_SAVE_INTERVAL == 0:
        torch.save(encoder.state_dict(), f"encoder.pt")
        torch.save(decoder.state_dict(), f"decoder.pt")

torch.save(encoder.state_dict(), f"encoder_final.pt")
torch.save(decoder.state_dict(), f"decoder_final.pt")


training begin.


  0%|          | 0/5 [00:00<?, ?it/s]

tensor([ 786989, 1045684,  997823, 1462632,  716675,  307981,  888138,  357443,
         611688,  537984,  915327,  915327,  915327,  915327,  915327,  915327,
         915327,  915327,  915327,  915327,  915327,  915327,  915327,  915327,
         915327,  915327,  915327,  915327,  915327,  915327,  915327,  915327,
         915327,  915327,  915327,  915327,  915327,  915327,  915327,  915327])


loss: 0.1237074613571167:  20%|██        | 1/5 [00:06<00:25,  6.44s/it]

tensor([ 786989,  247647, 1067324, 1160534,   64319, 1120361,  157890,  827993,
        1197435, 1192319,  311388,  537984,  915327,  915327,  915327,  915327,
         915327,  915327,  915327,  915327,  915327,  915327,  915327,  915327,
         915327,  915327,  915327,  915327,  915327,  915327,  915327,  915327,
         915327,  915327,  915327,  915327,  915327,  915327,  915327,  915327])


loss: 0.1237074613571167:  20%|██        | 1/5 [00:09<00:39,  9.91s/it]


KeyboardInterrupt: 

In [97]:
def translate_sentence(input_sentence):
    # Preprocess input sentence
    input_sequence = indic_tokenize.trivial_tokenize(input_sentence)
    input_sequence.insert(0,"<s>")
    input_sequence.append("</s>")
    vector =  [kan_word2index[word] for word in input_sequence]
    input_sequence = pad_sequence(vector, kan_word2index['</s>'])
 
    print(torch.tensor(input_sequence))
    encoder_outputs, encoder_hidden = encoder(torch.tensor(input_sequence, device=device).view(-1, 1))
    print(encoder_outputs)
    decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden)
 
    _, topi = decoder_outputs.topk(1)
    decoded_ids = topi.squeeze()
    print(decoded_ids)

    decoded_words = []
    for idx in decoded_ids:
        if idx.item() == '</s>':
            decoded_words.append('</s>')
            break
        decoded_words.append(idx.item())

    return decoded_words

In [98]:
out = translate_sentence('ಇವರು ಸಂಶೋಧಕ ಸ್ವಭಾವದವರು .')
out

tensor([786989, 457540, 492223, 207483, 537984, 915327, 915327, 915327, 915327,
        915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327,
        915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327,
        915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327, 915327,
        915327, 915327, 915327, 915327])
tensor([[[-0.5865,  0.1517, -0.9291,  ..., -0.0988,  0.2387,  0.8958]],

        [[-0.8166, -0.2477, -0.3299,  ...,  0.0995,  0.1259,  0.2477]],

        [[ 0.0845, -0.5271, -0.1066,  ..., -0.0349,  0.1959,  0.3143]],

        ...,

        [[ 0.3932, -0.7750, -0.9279,  ..., -0.0022, -0.2628,  0.9407]],

        [[ 0.4274, -0.8297, -0.9421,  ..., -0.0028, -0.3001,  0.9565]],

        [[ 0.4405, -0.8124, -0.9086,  ..., -0.0042, -0.3950,  0.9217]]],
       grad_fn=<TransposeBackward1>)
tensor([[ 39465, 149941, 149941,  ..., 149941, 149941, 149941],
        [ 39465, 149941, 149941,  ..., 149941, 149941, 149941],
        [ 39465, 14994

RuntimeError: a Tensor with 40 elements cannot be converted to Scalar