In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

from tqdm import tqdm, trange

!pip install indic_nlp_library==0.92
from indicnlp.tokenize import indic_tokenize

Collecting indic_nlp_library==0.92
  Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sphinx-argparse (from indic_nlp_library==0.92)
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl (12 kB)
Collecting sphinx-rtd-theme (from indic_nlp_library==0.92)
  Downloading sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting morfessor (from indic_nlp_library==0.92)
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic_nlp_library==0.92)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.1/121.1 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Installing collected 

In [3]:
MAX_LENGTH = 17 # maximum length of sentences
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.lstm(embedded)
        return output, (hidden, cell)

class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.lstm = nn.LSTM(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(0)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden[0].permute(1, 0, 2)  # For LSTM, hidden is a tuple (hidden_state, cell_state)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_lstm = torch.cat((embedded, context), dim=2)

        output, hidden = self.lstm(input_lstm, hidden)
        output = self.out(output)

        return output, hidden, attn_weights


class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, index):
        return self.source_sentences[index], self.target_sentences[index]


def pad_sequence(sequence, pad_value):
    # Padding function to add pad_value to sequences until they reach max_len
    for i in range(MAX_LENGTH - len(sequence)):
        sequence.append(pad_value)
    return sequence

In [5]:

# get tokens from pre-processed files
with open('eng_tokens_baby.txt', 'r') as f:
    tokens = f.readlines()
eng_tokens = []
for x in trange(len(tokens), desc='get english tokens...'):
    eng_tokens.append(tokens[x].strip('\n').split(' '))
print(eng_tokens[0])

with open('kan_tokens_baby.txt', 'r', encoding='utf-8') as f:
    tokens = f.readlines()
kan_tokens = []
for x in trange(len(tokens), desc='get kannada tokens...'):
    kan_tokens.append(tokens[x].strip('\n').split(' '))
print(kan_tokens[0])

# get vocabulary
eng_vocab = set()
kan_vocab = set()
for i in eng_tokens:
    for j in i:
        eng_vocab.add(j)
eng_vocab = list(eng_vocab)

for i in kan_tokens:
    for j in i:
        kan_vocab.add(j)
kan_vocab = list(kan_vocab)

print(eng_vocab[:10])
print(kan_vocab[:10])

get english tokens...: 100%|██████████| 10000/10000 [00:00<00:00, 451612.29it/s]


['<s>', 'Hes', 'a', 'scientist', '.', '</s>']


get kannada tokens...: 100%|██████████| 10000/10000 [00:00<00:00, 227190.71it/s]


['<s>', 'ಇವರು', 'ಸಂಶೋಧಕ', 'ಸ್ವಭಾವದವರು', '.', '</s>']
['gracious', 'earliest', 'cope', 'Sirisena', 'Diabetes', 'pans', 'considering', 'guts', 'produces', 'uncertainties']
['ಏಕದಿನ', 'ಚಿಂತೆಗಳು', 'ರೀತಿಯ', 'ಮಿಡಿಯಿತು………', 'ಆಟದ', 'ಇದು”', 'ಬಯಸುವವರು', 'ಏಕತೆಯೇ', 'ಪೂರ್ವಸಿದ್ಧತೆಗಳನ್ನು', 'ಅಂತಿಮಗೊಂಡಿಲ್ಲ']


In [6]:
# get index lists
eng_word2index = {word: index for index, word in enumerate(eng_vocab)}
kan_word2index = {word: index for index, word in enumerate(kan_vocab)}

In [7]:
eng_indices = [[eng_word2index[word] for word in sent] for sent in eng_tokens]
kan_indices = [[kan_word2index[word] for word in sent] for sent in kan_tokens]

In [8]:
eng_indices_padded = []
kan_indices_padded = []

for eng_sent, kan_sent in zip(eng_tokens, kan_tokens):
    if len(eng_sent) <= MAX_LENGTH and len(kan_sent) <= MAX_LENGTH:
        eng_indices_padded.append(pad_sequence([eng_word2index[word] for word in eng_sent], eng_word2index['</s>']))
        kan_indices_padded.append(pad_sequence([kan_word2index[word] for word in kan_sent], kan_word2index['</s>']))
    else:
        continue


# Ensure that both lists have the same length


In [9]:


print(len(eng_indices_padded) == len(kan_indices_padded), f"Number of English sentences: {len(eng_indices_padded)}, Number of Kannada sentences: {len(kan_indices_padded)}")


True Number of English sentences: 7957, Number of Kannada sentences: 7957


In [12]:
kan_train, kan_test, eng_train, eng_test = train_test_split(kan_indices_padded, eng_indices_padded, test_size=0.3)

# Create datasets and dataloaders for train and test sets
train_dataset = TranslationDataset(kan_train, eng_train)
test_dataset = TranslationDataset(kan_test, eng_test)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [13]:
batch_size = 24
eng_indices1 = eng_indices_padded[:10000]
kan_indices1 = kan_indices_padded[:10000]
dataset = TranslationDataset(kan_indices1, eng_indices1)
print(dataset[0])
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)



([8521, 24184, 24768, 13704, 22693, 19719, 19719, 19719, 19719, 19719, 19719, 19719, 19719, 19719, 19719, 19719, 19719], [3584, 13263, 14098, 9398, 4549, 13123, 13123, 13123, 13123, 13123, 13123, 13123, 13123, 13123, 13123, 13123, 13123])


In [37]:
lr = 0.01
epochs = 500
hidden_size = 128
encoder = EncoderRNN(input_size=len(kan_vocab), hidden_size=hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=len(eng_vocab)).to(device)
optimizer = Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=lr)
criterion = nn.CrossEntropyLoss()

In [None]:

if os.path.exists("encoder1.pth") and os.path.exists("decoder1.pth"):
    encoder =  EncoderRNN(input_size=len(kan_vocab), hidden_size=hidden_size).to(device)  # Replace YourEncoderModelClass with the actual class of your encoder model
    decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=len(eng_vocab)).to(device)

    encoder.load_state_dict(torch.load("encoder_final1.pth"))
    decoder.load_state_dict(torch.load("decoder_final1.pth"))
    print("pre-trained models loaded.")


print("training begin.")
import time

# Training loop
MODEL_SAVE_INTERVAL = 10 # save the model every so oftens
losses = [] # average loss per epoch
bar = trange(epochs, desc=f'')
for epoch in bar:
    epoch_loss = 0
    for i, (kan_batch,eng_batch) in enumerate(dataloader): # TO-DO - Need to pad the data
        time_start = time.time()
        eng_batch = torch.stack(eng_batch, dim=1)
        kan_batch = torch.stack(kan_batch, dim=1)

        eng_batch = eng_batch.to(device)
        kan_batch = kan_batch.to(device)

        optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(kan_batch)
        decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden, target_tensor=eng_batch)

        loss = criterion(decoder_outputs.view(-1, len(eng_vocab)), eng_batch.view(-1))
        epoch_loss += (loss.item())
        loss.backward()
        optimizer.step()

        if i % MODEL_SAVE_INTERVAL == 0:
            torch.save(encoder.state_dict(), f"encoder1.pth")
            torch.save(decoder.state_dict(), f"decoder1.pth")

#         print(f"batch took {time.time()-time_start} sec")
    epoch_loss /= len(eng_batch)
    losses.append(epoch_loss)
    bar.set_description(f'loss: {epoch_loss}')

    if epoch % MODEL_SAVE_INTERVAL == 0:
        torch.save(encoder.state_dict(), f"encoder1.pth")
        torch.save(decoder.state_dict(), f"decoder1.pth")

torch.save(encoder.state_dict(), f"encoder_final1.pth")
torch.save(decoder.state_dict(), f"decoder_final1.pth")


pre-trained models loaded.
training begin.


loss: 9.202878100367693:   8%|▊         | 41/500 [10:15<1:54:55, 15.02s/it]

In [None]:
if os.path.exists("encoder1.pth") and os.path.exists("decoder1.pth"):
    encoder =  EncoderRNN(input_size=len(kan_vocab), hidden_size=hidden_size).to(device)  # Replace YourEncoderModelClass with the actual class of your encoder model
    decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=len(eng_vocab)).to(device)

    encoder.load_state_dict(torch.load("encoder_final1.pth"))
    decoder.load_state_dict(torch.load("decoder_final1.pth"))
    print("pre-trained models loaded.")


In [None]:
def translate_sentence(input_sentence):
    # Preprocess input sentence
    input_sequence = indic_tokenize.trivial_tokenize(input_sentence)
    input_sequence.insert(0,"<s>")
    input_sequence.append("</s>")
    vector =  [kan_word2index[word] for word in input_sequence]
    input_sequence = pad_sequence(vector, kan_word2index['</s>'])

    print(torch.tensor(input_sequence))
    encoder_outputs, encoder_hidden = encoder(torch.tensor(input_sequence, device=device).view(-1, 1))
#     print(encoder_outputs)
    decoder_outputs, decoder_hidden, attentions = decoder(encoder_outputs, encoder_hidden)

    _, topi = decoder_outputs.topk(1) # return largest output of the tensor
    decoded_ids = topi.squeeze()
#     print(decoded_ids)

    decoded_words = []
    for idx in decoded_ids[0]:
        if idx.item() == eng_word2index['</s>']:
            decoded_words.append(eng_word2index['</s>'])
            break
        decoded_words.append(idx.item())

    return decoded_words

In [None]:
out = translate_sentence( 'ಇವರು ಸಂಶೋಧಕ ಸ್ವಭಾವದವರು')
output = ""
for idx in out:
    #print([key for key, val in eng_word2index.items() if val == idx])
    output +=  [key for key, val in eng_word2index.items() if val == idx][0] + " "
output

In [None]:
def evaluate(encoder, decoder, dataloader, reference_translations):
    references = []
    translations = []
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for kan_batch, eng_batch in tqdm(dataloader, desc='Evaluating'):
            kan_batch = torch.stack(kan_batch, dim=1).to(device)
            eng_batch = torch.stack(eng_batch, dim=1).to(device)

            encoder_outputs, encoder_hidden = encoder(kan_batch)
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor=eng_batch)

            _, preds = decoder_outputs.max(2)

            references.extend(eng_batch.cpu().numpy().tolist())
            translations.extend(preds.cpu().numpy().tolist())

            # Calculate accuracy and precision
            for pred_sent, ref_sent in zip(preds.cpu().numpy().tolist(), eng_batch.cpu().numpy().tolist()):
                total_predictions += len(ref_sent)
                for pred_token, ref_token in zip(pred_sent, ref_sent):
                    if pred_token == ref_token:
                        correct_predictions += 1

    # Calculate BLEU score
    # bleu_score = corpus_bleu(reference_translations, translations)

    # Calculate accuracy and precision
    accuracy = correct_predictions / total_predictions
    precision = accuracy  # Assuming precision and accuracy are the same in this context

    return 0, accuracy, precision

# Evaluate the model
bleu_score, accuracy, precision = evaluate(encoder, decoder, test_dataloader, eng_test)

print("BLEU Score:", bleu_score)
print("Accuracy:", accuracy)
print("Precision:", precision)


In [None]:

def evaluate(encoder, decoder, dataloader, reference_translations):
    references = []
    translations = []
    with torch.no_grad():
        for kan_batch, eng_batch in tqdm(dataloader, desc='Evaluating'):
            kan_batch = torch.stack(kan_batch, dim=1).to(device)
            eng_batch = torch.stack(eng_batch, dim=1).to(device)

            encoder_outputs, encoder_hidden = encoder(kan_batch)
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor=eng_batch)

            _, preds = decoder_outputs.max(2)

            for pred_sent, ref_sent in zip(preds.cpu().numpy().tolist(), eng_batch.cpu().numpy().tolist()):
                # Convert indices back to tokens
                pred_tokens = [eng_vocab[token_idx] for token_idx in pred_sent if token_idx != eng_word2index['</s>']]
                ref_tokens = [eng_vocab[token_idx] for token_idx in ref_sent if token_idx != eng_word2index['</s>']]
                references.append([ref_tokens])
                translations.append(pred_tokens)

    # Calculate BLEU score
    bleu_score = corpus_bleu(references, translations)

    return bleu_score

# Evaluate the model
bleu_score = evaluate(encoder, decoder, test_dataloader, eng_test)

print("BLEU Score:", bleu_score)
