In [1]:
!pip install torch==2.0.1

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install nltk
!pip install indic_nlp_library==0.92

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [15]:
import nltk

In [16]:
print(nltk)

<module 'nltk' from '/shared/centos7/anaconda3/2021.05/lib/python3.8/site-packages/nltk/__init__.py'>


In [17]:
print('hello world')

hello world


In [68]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

from tqdm import tqdm, trange

from indicnlp.tokenize import indic_tokenize

In [69]:
torch.cuda.empty_cache()
torch.cuda.memory_summary()



In [70]:
MAX_LENGTH = 17 # maximum length of sentences
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden
    
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(0)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights



class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, index):
        return self.source_sentences[index], self.target_sentences[index]


def pad_sequence(sequence, pad_value):
    # Padding function to add pad_value to sequences until they reach max_len
    for i in range(MAX_LENGTH - len(sequence)):
        sequence.append(pad_value)
    return sequence  

In [71]:
print(device)

cuda


In [72]:

# get tokens from pre-processed files
with open('eng_tokens.txt', 'r') as f:
    tokens = f.readlines()
eng_tokens = []
for x in trange(len(tokens), desc='get english tokens...'):
    eng_tokens.append(tokens[x].strip('\n').split(' '))
print(eng_tokens[0])

with open('kan_tokens.txt', 'r', encoding='utf-8') as f:
    tokens = f.readlines()
kan_tokens = []
for x in trange(len(tokens), desc='get kannada tokens...'):
    kan_tokens.append(tokens[x].strip('\n').split(' '))
print(kan_tokens[0])

# get vocabulary
eng_vocab = set()
kan_vocab = set()
for i in eng_tokens:
    for j in i:
        eng_vocab.add(j)
eng_vocab = list(eng_vocab)

for i in kan_tokens:
    for j in i:
        kan_vocab.add(j)
kan_vocab = list(kan_vocab)

print(eng_vocab[:10])
print(kan_vocab[:10])

get english tokens...: 100%|██████████| 4093524/4093524 [00:21<00:00, 191258.27it/s]


['<s>', 'Hes', 'a', 'scientist', '.', '</s>']


get kannada tokens...: 100%|██████████| 4093524/4093524 [00:06<00:00, 639107.52it/s]


['<s>', 'ಇವರು', 'ಸಂಶೋಧಕ', 'ಸ್ವಭಾವದವರು', '.', '</s>']
['Fakira', 'Newwz', 'Lank', 'Soldevanahalli', 'dualSIM', 'Yafan', 'Rushikonda', 'Chittade', 'dewed', 'motif']
['ಒಬ್ಬಂಟಿಯಾಗಿರಲಿಲ್ಲ’', 'ಡ್ರೈವಿನಲ್ಲಿನ', 'ವಿವಿಧೋದ್ದೇಶ', 'ಡ್ಯಾಮಿಯೆನ್', 'ಕನೆಕ್ಟಿವಿಟಿಗಳನ್ನು', 'ನಾಯ್ಕೋಡಿ', '2018’\xa0', 'ಸಾರಿಕೊಳ್ಳುತ್ತಿದ್ದರು', 'ಲಗತ್ತಿಸುವುದು', 'ಆತ್ಮಗೌರವವುಳ್ಳವರಾಗಿದ್ದಾರೆ']


In [73]:
# get index lists
eng_word2index = {word: index for index, word in enumerate(eng_vocab)}
kan_word2index = {word: index for index, word in enumerate(kan_vocab)}

In [74]:
print(kan_word2index['</s>'])

1265648


In [75]:
eng_indices = [[eng_word2index[word] for word in sent] for sent in eng_tokens] 
kan_indices = [[kan_word2index[word] for word in sent] for sent in kan_tokens]

In [76]:
print(len(eng_indices))

4093524


In [77]:
# truncate all sentences to length 40. or else the dataloader will not work
# eng_indices_padded = [pad_sequence(sent, eng_word2index['</s>']) if len(sent) <= MAX_LENGHT for sent in eng_indices]
eng_indices_padded = [pad_sequence(sent, eng_word2index['</s>']) if len(sent) <= MAX_LENGTH else None for sent in eng_indices]
eng_indices_padded = [sent for sent in eng_indices_padded if sent is not None]
kan_indices_padded = [pad_sequence(sent, kan_word2index['</s>']) if len(sent) <= MAX_LENGTH else None for sent in kan_indices]
kan_indices_padded = [sent for sent in kan_indices_padded if sent is not None]

In [78]:
eng_indices_padded = []
kan_indices_padded = []

for eng_sent, kan_sent in zip(eng_tokens, kan_tokens):
    if len(eng_sent) <= MAX_LENGTH and len(kan_sent) <= MAX_LENGTH:
        eng_indices_padded.append(pad_sequence([eng_word2index[word] for word in eng_sent], eng_word2index['</s>']))
        kan_indices_padded.append(pad_sequence([kan_word2index[word] for word in kan_sent], kan_word2index['</s>']))
    else:
        continue


# Ensure that both lists have the same length
 

In [79]:

print(len(eng_indices_padded) == len(kan_indices_padded), f"Number of English sentences: {len(eng_indices_padded)}, Number of Kannada sentences: {len(kan_indices_padded)}")


True Number of English sentences: 3260760, Number of Kannada sentences: 3260760


In [80]:
print(len(eng_indices_padded))
print(eng_indices_padded[0])

3260760
[2241, 239009, 91612, 199834, 314685, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848]


In [81]:
eng_indices1 = eng_indices_padded[:5000]
kan_indices1 = kan_indices_padded[:5000]

In [82]:
batch_size = 24

dataset = TranslationDataset(kan_indices1, eng_indices1)
print(dataset[0])
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)



([743458, 1008859, 345974, 565182, 1478997, 1265648, 1265648, 1265648, 1265648, 1265648, 1265648, 1265648, 1265648, 1265648, 1265648, 1265648, 1265648], [2241, 239009, 91612, 199834, 314685, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848, 131848])


In [83]:
lr = 0.01
epochs = 200
hidden_size = 128
encoder = EncoderRNN(input_size=len(kan_vocab), hidden_size=hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=len(eng_vocab)).to(device)
optimizer = Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=lr)
criterion = nn.CrossEntropyLoss()

In [None]:


if os.path.exists("encoder_final.pt") and os.path.exists("decoder_final.pt"):
    encoder =  EncoderRNN(input_size=len(kan_vocab), hidden_size=hidden_size).to(device)  # Replace YourEncoderModelClass with the actual class of your encoder model
    decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=len(eng_vocab)).to(device)

    encoder.load_state_dict(torch.load("encoder_final.pt"))
    decoder.load_state_dict(torch.load("decoder_final.pt"))
    print("pre-trained models loaded.")

   
    
print("training begin.")
import time

# Training loop
MODEL_SAVE_INTERVAL = 10 # save the model every so oftens
losses = [] # average loss per epoch
bar = trange(epochs, desc=f'')
for epoch in bar:
    epoch_loss = 0
    for i, (kan_batch,eng_batch) in enumerate(dataloader): # TO-DO - Need to pad the data
        time_start = time.time()
        eng_batch = torch.stack(eng_batch, dim=1)
        kan_batch = torch.stack(kan_batch, dim=1)

        eng_batch = eng_batch.to(device)
        kan_batch = kan_batch.to(device)

        optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(kan_batch)
        decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden, target_tensor=eng_batch)

        loss = criterion(decoder_outputs.view(-1, len(eng_vocab)), eng_batch.view(-1))
        epoch_loss += (loss.item()/len(eng_batch))
        loss.backward()
        optimizer.step()
        
        if i % MODEL_SAVE_INTERVAL == 0:
            torch.save(encoder.state_dict(), f"encoder.pt")
            torch.save(decoder.state_dict(), f"decoder.pt")
        
#         print(f"batch took {time.time()-time_start} sec")
        
    losses.append(epoch_loss)
    bar.set_description(f'loss: {epoch_loss}')

    if epoch % MODEL_SAVE_INTERVAL == 0:
        torch.save(encoder.state_dict(), f"encoder.pt")
        torch.save(decoder.state_dict(), f"decoder.pt")

torch.save(encoder.state_dict(), f"encoder_final.pt")
torch.save(decoder.state_dict(), f"decoder_final.pt")


  0%|          | 0/200 [00:00<?, ?it/s]

pre-trained models loaded.
training begin.


loss: 6.134352413316565:   0%|          | 1/200 [01:48<6:01:12, 108.91s/it]

In [65]:
if os.path.exists("encoder_final.pt") and os.path.exists("decoder_final.pt"):
    encoder =  EncoderRNN(input_size=len(kan_vocab), hidden_size=hidden_size).to(device)  # Replace YourEncoderModelClass with the actual class of your encoder model
    decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=len(eng_vocab)).to(device)

    encoder.load_state_dict(torch.load("encoder_final.pt"))
    decoder.load_state_dict(torch.load("decoder_final.pt"))
    print("pre-trained models loaded.")


pre-trained models loaded.


In [None]:
def translate_sentence(input_sentence):
    # Preprocess input sentence
    input_sequence = indic_tokenize.trivial_tokenize(input_sentence)
    input_sequence.insert(0,"<s>")
    input_sequence.append("</s>")
    vector =  [kan_word2index[word] for word in input_sequence]
    input_sequence = pad_sequence(vector, kan_word2index['</s>'])
 
    print(torch.tensor(input_sequence))
    encoder_outputs, encoder_hidden = encoder(torch.tensor(input_sequence, device=device).view(-1, 1))
    print(encoder_outputs)
    decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden)
 
    _, topi = decoder_outputs.topk(1) # return largest output of the tensor
    decoded_ids = topi.squeeze()
    print(decoded_ids)

    decoded_words = []
    for idx in decoded_ids[0]:
        if idx.item() == eng_word2index['</s>']:
            decoded_words.append(eng_word2index['</s>'])
            break
        decoded_words.append(idx.item())

    return decoded_words

In [None]:
out = translate_sentence('ಇವರು ಸಂಶೋಧಕ ಸ್ವಭಾವದವರು .')
output = ""
for idx in out:
    #print([key for key, val in eng_word2index.items() if val == idx])
    output +=  [key for key, val in eng_word2index.items() if val == idx][0] + " "
output