In [349]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math
import numpy as np
import matplotlib.pyplot as plt
import heapq
plt.switch_backend('agg')
import matplotlib.ticker as ticker

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [350]:
# Vocabulary indices reserved for the start-of-sequence and end-of-sequence markers.
SOS_token, EOS_token = 0, 1
# Upper bound on decoding steps; also the row count of the encoder output buffer.
MAX_LENGTH = 100

In [351]:
class Lang:
    """Character-level vocabulary for one language.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers; every new
    character is assigned the next free index in order of first appearance.
    """

    def __init__(self, name):
        self.name = name
        self.char2index = {}
        self.char2count = {}
        self.index2char = dict(enumerate(("SOS", "EOS")))
        # Next free index; starts after the two reserved markers.
        self.n_chars = len(self.index2char)

    def addWord(self, word):
        """Register every character of ``word`` in the vocabulary."""
        for symbol in word:
            self.addChar(symbol)

    def addChar(self, char):
        """Add ``char`` to the vocabulary, or bump its count if already known."""
        if char in self.char2index:
            self.char2count[char] += 1
            return

        new_index = self.n_chars
        self.char2index[char] = new_index
        self.char2count[char] = 1
        self.index2char[new_index] = char
        self.n_chars = new_index + 1

In [352]:
def readLangs(lang, reverse=False):
    """Read the training CSV for ``lang`` and create the two vocabularies.

    Args:
        lang: Language code; the file read is
            ``aksharantar_sampled/<lang>/<lang>_train.csv``.
        reverse: When True, swap the columns of every pair and swap which
            vocabulary is the input and which is the output.

    Returns:
        ``(input_lang, output_lang, pairs)`` where ``pairs`` is a list of
        lists obtained by splitting each non-empty line on commas. The
        vocabularies are returned empty; this function does not populate them.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager closes the
    # handle even if reading raises.
    path = 'aksharantar_sampled/%s/%s_train.csv' % (lang, lang)
    with open(path, encoding='utf-8') as csv_file:
        lines = csv_file.read().strip().split('\n')

    # Split every line into pairs, skipping blank lines so that an empty file
    # or stray empty row does not yield a one-element "pair".
    pairs = [l.split(',') for l in lines if l]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang)
        output_lang = Lang('eng')
    else:
        input_lang = Lang('eng')
        output_lang = Lang(lang)

    return input_lang, output_lang, pairs

In [353]:
def prepareData(lang, reverse=False):
    """Load the word pairs for ``lang`` and fill both character vocabularies.

    Returns the ``(input_lang, output_lang, pairs)`` triple produced by
    ``readLangs`` after every pair has been counted into the vocabularies.
    """
    input_lang, output_lang, pairs = readLangs(lang, reverse)
    pair_count = len(pairs)
    print("Read %s word pairs" % pair_count)
    # NOTE(review): no filtering happens here, so this count equals the one above.
    print("Trimmed to %s word pairs" % pair_count)
    print("Counting chars...")
    for entry in pairs:
        input_lang.addWord(entry[0])
        output_lang.addWord(entry[1])
    print("Counted chars:")
    for vocab in (input_lang, output_lang):
        print(vocab.name, vocab.n_chars)
    return input_lang, output_lang, pairs


# Build the English -> Hindi vocabularies and show one random sample pair.
input_lang, output_lang, pairs = prepareData('hin', reverse=False)
print(random.choice(pairs))

Reading lines...
Read 51200 word pairs
Trimmed to 51200 word pairs
Counting chars...
Counted chars:
eng 28
hin 66
['chaheiti', 'चहेती']


In [354]:
# Maps a cell-type name to the recurrent layer class used by the encoder/decoder.
cells = dict(rnn=nn.RNN, gru=nn.GRU, lstm=nn.LSTM)

In [355]:
class EncoderRNN(nn.Module):
    """
    Class for the encoder RNN.

    Embeds one input character index at a time and feeds it through a
    recurrent cell (RNN, GRU or LSTM, chosen by ``cell_type``).

    Args:
        input_size: Number of entries in the input embedding table.
        hidden_size: Size of both the embedding and the recurrent state.
        num_hidden_layers: Number of stacked recurrent layers.
        dropout: Dropout passed to the recurrent layer.
        cell_type: Key into the module-level ``cells`` registry.
    """
    def __init__(self, input_size, hidden_size, num_hidden_layers, dropout=0, cell_type='gru'):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        # Kept so initHidden can build a state with one slice per layer.
        self.num_hidden_layers = num_hidden_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.cell_type = cell_type
        self.cell = cells[cell_type](hidden_size, hidden_size, num_hidden_layers, dropout=dropout)

    def forward(self, input, hidden, cell=None):
        """Run a single time step.

        Returns ``(output, hidden)`` for RNN/GRU cells and
        ``(output, hidden, cell)`` for LSTM cells.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as the recurrent layer expects.
        embedded = self.embedding(input).view(1, 1, -1)
        if self.cell_type == 'rnn' or self.cell_type == 'gru':
            output, hidden = self.cell(embedded, hidden)
            return output, hidden
        else:
            output, (hidden, cell) = self.cell(embedded, (hidden, cell))
            return output, hidden, cell

    def initHidden(self):
        """Return a zero initial state of shape (num_hidden_layers, 1, hidden_size).

        The state is created on the same device as the module's parameters.
        The recurrent layers require one state slice per layer, so a fixed
        leading dimension of 1 only works for single-layer models.
        """
        return torch.zeros(self.num_hidden_layers, 1, self.hidden_size,
                           device=self.embedding.weight.device)

In [356]:
class BeamSearchNode:
    """One hypothesis step in beam search, linked to its predecessor.

    Nodes order themselves so that a *higher* cumulative log-probability
    compares as "smaller"; a min-heap therefore pops the best node first.
    """

    def __init__(self, decoder_output, hidden, prev_node, char_idx, log_prob, length):
        self.decoder_output, self.hidden = decoder_output, hidden
        self.prev_node, self.char_idx = prev_node, char_idx
        self.log_prob, self.length = log_prob, length

    def __lt__(self, other):
        # Inverted on purpose: larger log-probability sorts first.
        return other.log_prob < self.log_prob

    def eval(self):
        """Return the length-normalised score (log-probability per step)."""
        score = self.log_prob / self.length
        return score

In [357]:
class DecoderRNN(nn.Module):
    """
    Class for the vanilla decoder RNN.

    Embeds the previous output character, runs one step of the recurrent
    cell (RNN, GRU or LSTM) and projects the result to log-probabilities
    over the output vocabulary.

    Args:
        hidden_size: Size of both the embedding and the recurrent state.
        output_size: Number of entries in the output vocabulary.
        num_hidden_layers: Number of stacked recurrent layers.
        dropout: Dropout passed to the recurrent layer.
        cell_type: Key into the module-level ``cells`` registry.
    """
    def __init__(self, hidden_size, output_size, num_hidden_layers, dropout=0, cell_type='gru'):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        # Kept so initHidden can build a state with one slice per layer.
        self.num_hidden_layers = num_hidden_layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.cell_type = cell_type
        self.cell = cells[cell_type](hidden_size, hidden_size, num_hidden_layers, dropout=dropout)
        self.out = nn.Linear(hidden_size, output_size)
        self.output_size = output_size
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, cell=None):
        """Run a single decoding step.

        Returns ``(log_probs, hidden)`` for RNN/GRU cells and
        ``(log_probs, hidden, cell)`` for LSTM cells, where ``log_probs`` has
        shape (1, output_size).
        """
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        if self.cell_type == 'rnn' or self.cell_type == 'gru':
            output, hidden = self.cell(output, hidden)
        else:
            output, (hidden, cell) = self.cell(output, (hidden, cell))

        output = self.softmax(self.out(output[0]))

        if self.cell_type == 'rnn' or self.cell_type == 'gru':
            return output, hidden
        else:
            return output, hidden, cell

    def initHidden(self):
        """Return a zero initial state of shape (num_hidden_layers, 1, hidden_size).

        The state is created on the same device as the module's parameters.
        """
        return torch.zeros(self.num_hidden_layers, 1, self.hidden_size,
                           device=self.embedding.weight.device)

    def beam_search(self, encoder_outputs, decoder_hidden, beam_width, start_token, end_token, decoder_cell=None):
        """Decode a sequence with beam search.

        Args:
            encoder_outputs: Accepted for interface compatibility; this
                vanilla (attention-free) decoder does not read it.
            decoder_hidden: Initial hidden state for the decoder.
            beam_width: Number of hypotheses kept after every step (>= 1).
            start_token: Index fed to the decoder at the first step.
            end_token: Index that terminates a hypothesis.
            decoder_cell: Initial LSTM cell state. Only used when
                ``cell_type == 'lstm'``; defaults to zeros shaped like
                ``decoder_hidden`` when omitted.

        Returns:
            A list of character indices for the hypothesis with the best
            length-normalised log-probability. The start token is excluded;
            the end token is included when the hypothesis reached it.

        Raises:
            ValueError: If ``beam_width`` is smaller than 1.
        """
        if beam_width < 1:
            raise ValueError("beam_width must be at least 1, got %r" % (beam_width,))

        is_lstm = self.cell_type == 'lstm'
        if is_lstm:
            # LSTM hypotheses must carry both states, stored as a tuple on the node.
            if decoder_cell is None:
                decoder_cell = torch.zeros_like(decoder_hidden)
            initial_state = (decoder_hidden, decoder_cell)
        else:
            initial_state = decoder_hidden

        # Build step inputs on the module's own device to avoid CPU/GPU mismatches.
        token_device = self.embedding.weight.device

        # Initialize the beam search
        beam_nodes = [BeamSearchNode(None, initial_state, None, start_token, 0, 1)]
        done_nodes = []

        # Keep expanding the beam until we reach the maximum length or all candidates are done
        for _ in range(MAX_LENGTH):
            candidates = []
            for node in beam_nodes:
                if node.char_idx == end_token:
                    done_nodes.append(node)
                    continue

                # Feed the previous char and recurrent state into the decoder
                step_input = torch.tensor([[node.char_idx]], device=token_device)
                if is_lstm:
                    output, hidden, cell = self.forward(step_input, node.hidden[0], node.hidden[1])
                    next_state = (hidden, cell)
                else:
                    output, next_state = self.forward(step_input, node.hidden)

                # Generate new candidate nodes and add them to the heap.
                # tolist() converts the whole row once instead of calling
                # .item() for every vocabulary entry.
                for char_idx, step_log_prob in enumerate(output[0].tolist()):
                    new_node = BeamSearchNode(output, next_state, node, char_idx,
                                              node.log_prob + step_log_prob, node.length + 1)
                    heapq.heappush(candidates, new_node)

            # Select the top k candidates to continue expanding the beam
            # (BeamSearchNode orders higher log-probability first).
            beam_nodes = [heapq.heappop(candidates)
                          for _ in range(min(beam_width, len(candidates)))]

            if not beam_nodes:
                break

        # Pick the best hypothesis among finished and still-open ones.
        done_nodes.extend(beam_nodes)
        best_node = max(done_nodes, key=lambda node: node.eval())

        # Walk the back-pointers to recover the index sequence; the root node
        # (start token) has no predecessor and is therefore left out.
        predicted_seq = []
        while best_node.prev_node is not None:
            predicted_seq.append(best_node.char_idx)
            best_node = best_node.prev_node
        return predicted_seq[::-1]

In [358]:
def indexesFromWord(lang, word):
    """Translate each character of ``word`` into its index in ``lang.char2index``."""
    lookup = lang.char2index
    indexes = []
    for char in word:
        indexes.append(lookup[char])
    return indexes


def tensorFromWord(lang, word):
    """Encode ``word`` as a column tensor of character indices ending in EOS.

    The result has dtype ``torch.long``, shape (len(word) + 1, 1), and lives
    on the module-level ``device``.
    """
    token_ids = indexesFromWord(lang, word) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Encode a (source word, target word) pair with the global vocabularies.

    Returns ``(input_tensor, target_tensor)``; the first element of ``pair``
    uses ``input_lang`` and the second uses ``output_lang``.
    """
    return (
        tensorFromWord(input_lang, pair[0]),
        tensorFromWord(output_lang, pair[1]),
    )

In [359]:
# Probability that a greedy training step feeds the ground-truth target
# character (rather than the decoder's own prediction) as the next input.
teacher_forcing_ratio = 0.5


def _decoder_step(decoder, cell_type, decoder_input, decoder_hidden, decoder_cell):
    """Run one decoder step and return ``(output, hidden, cell)``; ``cell`` is None unless LSTM."""
    if cell_type == 'lstm':
        return decoder(decoder_input, decoder_hidden, decoder_cell)
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    return decoder_output, decoder_hidden, None


def train(input_tensor, target_tensor, encoder, decoder, cell_type, encoder_optimizer, decoder_optimizer, criterion, beam_width, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    Args:
        input_tensor: Column tensor of input character indices.
        target_tensor: Column tensor of target character indices.
        encoder, decoder: The two recurrent modules being trained.
        cell_type: 'rnn', 'gru' or 'lstm'; selects how states are threaded.
        encoder_optimizer, decoder_optimizer: Optimisers stepped at the end.
        criterion: Loss applied to each decoder output / target index.
        beam_width: 1 selects greedy decoding with optional teacher forcing;
            any other value uses a beam-searched sequence as decoder inputs.
        max_length: Row count of the encoder output buffer.

    Returns:
        The summed loss divided by the target length (greedy branch) or by
        the number of decoder steps actually scored (beam branch).
    """
    is_lstm = cell_type == 'lstm'

    encoder_hidden = encoder.initHidden()
    encoder_cell = encoder.initHidden() if is_lstm else None

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    for ei in range(input_length):
        if is_lstm:
            encoder_output, encoder_hidden, encoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)
        else:
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # The decoder starts from the encoder's final state in both branches, so
    # the LSTM cell state is always defined before it is used.
    decoder_hidden = encoder_hidden
    decoder_cell = encoder_cell

    loss = 0

    if beam_width == 1:
        decoder_input = torch.tensor([[SOS_token]], device=device)
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_cell = _decoder_step(
                decoder, cell_type, decoder_input, decoder_hidden, decoder_cell)
            loss += criterion(decoder_output, target_tensor[di])

            if use_teacher_forcing:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[di]
            else:
                # Without teacher forcing: use its own prediction as the next input
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()  # detach from history as input
                if decoder_input.item() == EOS_token:
                    break

        normalizer = target_length

    else:
        # The beam search only supplies the token sequence to feed back in;
        # gradients come from the re-run below, so skip graph construction here.
        with torch.no_grad():
            predicted_seq = decoder.beam_search(encoder_outputs, decoder_hidden, beam_width, SOS_token, EOS_token)

        # The output at step di predicts target[di], so its input must be the
        # token that precedes it: SOS first, then predicted[di - 1].
        step_inputs = [SOS_token] + list(predicted_seq)
        normalizer = 0
        for di, char_idx in zip(range(target_length), step_inputs):
            decoder_input = torch.tensor([[char_idx]], device=device)
            decoder_output, decoder_hidden, decoder_cell = _decoder_step(
                decoder, cell_type, decoder_input, decoder_hidden, decoder_cell)
            loss += criterion(decoder_output, target_tensor[di])
            normalizer += 1

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / normalizer

In [360]:
def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the completed fraction; the remaining time is extrapolated
    as ``elapsed / percent - elapsed``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [361]:
def trainIters(encoder, decoder, n_iters, optimizer=optim.SGD, cell_type='gru', beam_width=1, print_every=1000, learning_rate=5e-3):
    """Train on ``n_iters`` randomly drawn pairs and log the running loss.

    One optimiser of class ``optimizer`` is created per module. Every
    ``print_every`` iterations the average loss since the last report is
    printed together with elapsed and estimated remaining time.
    """
    start = time.time()

    encoder_optimizer = optimizer(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optimizer(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    running_loss = 0  # Reset every print_every
    for step, training_pair in enumerate(training_pairs, start=1):
        input_tensor, target_tensor = training_pair[0], training_pair[1]

        running_loss += train(input_tensor, target_tensor, encoder, decoder, cell_type,
                              encoder_optimizer, decoder_optimizer, criterion, beam_width)

        if step % print_every != 0:
            continue

        average_loss = running_loss / print_every
        running_loss = 0
        print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters), step, step / n_iters * 100, average_loss))

In [362]:
def evaluate(encoder, decoder, word, beam_width=1, max_length=MAX_LENGTH):
    """Transliterate ``word`` with the trained encoder/decoder.

    Args:
        encoder, decoder: Trained modules; ``encoder.cell_type`` decides how
            recurrent states are threaded.
        word: Input string; every character must be in ``input_lang``.
        beam_width: 1 selects greedy decoding, any other value selects
            ``decoder.beam_search`` with that width.
        max_length: Maximum number of greedy decoding steps and row count of
            the encoder output buffer.

    Returns:
        The decoded string. In both decoding modes the end-of-sequence token
        is rendered as ``'<EOS>'`` when it was produced.
    """
    with torch.no_grad():
        input_tensor = tensorFromWord(input_lang, word)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        is_lstm = encoder.cell_type == 'lstm'
        if is_lstm:
            encoder_cell = encoder.initHidden()

        for ei in range(input_length):
            if is_lstm:
                encoder_output, encoder_hidden, encoder_cell = encoder(input_tensor[ei], encoder_hidden, encoder_cell)
            else:
                encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_hidden = encoder_hidden

        if beam_width == 1:
            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            if is_lstm:
                decoder_cell = encoder_cell

            decoded_chars = ""

            for di in range(max_length):
                if is_lstm:
                    decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
                else:
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_chars += '<EOS>'
                    break
                else:
                    decoded_chars += output_lang.index2char[topi.item()]

                decoder_input = topi.squeeze().detach()

            return decoded_chars

        else:
            predicted_seq = decoder.beam_search(encoder_outputs, decoder_hidden, beam_width, SOS_token, EOS_token)
            # Render the end token the same way the greedy branch does instead
            # of leaking the raw vocabulary label for index EOS_token.
            return ''.join(
                '<EOS>' if char_idx == EOS_token else output_lang.index2char[char_idx]
                for char_idx in predicted_seq
            )

In [363]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` random pairs with the model's prediction for each.

    For every sample the input is printed after '>', the reference after '='
    and the output of ``evaluate`` (default decoding settings) after '<'.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        print('<', evaluate(encoder, decoder, sample[0]))
        print('')

In [364]:
# Single-layer encoder/decoder pair (default GRU cells) sharing one state size.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_chars, hidden_size, num_hidden_layers=1)
encoder1 = encoder1.to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_chars, num_hidden_layers=1)
decoder1 = decoder1.to(device)

In [365]:
# Train for 1000 iterations, reporting the average loss every 100.
# beam_width=5 is not 1, so train() takes its beam-search branch rather than
# the greedy / teacher-forcing branch.
trainIters(encoder1, decoder1, n_iters=1000, beam_width=5, print_every=100)

0m 27s (- 4m 11s) (100 10%) 3.0889
0m 50s (- 3m 20s) (200 20%) 2.8824
1m 10s (- 2m 44s) (300 30%) 3.2302
1m 31s (- 2m 17s) (400 40%) 3.0131
1m 50s (- 1m 50s) (500 50%) 3.3526
2m 10s (- 1m 26s) (600 60%) 3.3453
2m 28s (- 1m 3s) (700 70%) 3.3008
2m 45s (- 0m 41s) (800 80%) 3.3673
3m 3s (- 0m 20s) (900 90%) 3.2582
3m 21s (- 0m 0s) (1000 100%) 3.2475


In [366]:
# Spot-check 10 random pairs. evaluateRandomly calls evaluate() without a
# beam_width, so these predictions use the default greedy decoding.
evaluateRandomly(encoder1, decoder1)

> nirgunmargi
= निर्गुणमार्गी
< पवररिरा<EOS>

> sairaaz
= सैराज़
< पवररिर<EOS>

> kalhaili
= कलहैली
< पवररिर<EOS>

> sambhave
= सम्भवे
< पवरर्<EOS>

> sattachyut
= सत्ताच्यूत
< सवरर्<EOS>

> vilima
= विलिमा
< सवरर्<EOS>

> nilakantan
= नीलाकांतन
< ववरिररा<EOS>

> baalo
= बालों
< सवरर्<EOS>

> trogir
= ट्रोगिर
< पवररिर<EOS>

> coric
= कोरिच
< पवररिर<EOS>



In [367]:
# Decode one random training pair with beam search (width 5). The encoding
# steps below repeat what evaluate() does, written out inline.
pair = random.choice(pairs)

print(pair[0])
print(pair[1])

with torch.no_grad():
    input_tensor = tensorFromWord(input_lang, pair[0])
    input_length = input_tensor.size(0)
    encoder1_hidden = encoder1.initHidden()

    encoder1_outputs = torch.zeros(MAX_LENGTH, encoder1.hidden_size, device=device)

    if encoder1.cell_type == 'lstm':
        encoder1_cell = encoder1.initHidden()

    # Feed the input one character at a time, keeping every step's output.
    for ei in range(input_length):
        if encoder1.cell_type != 'lstm':
            encoder1_output, encoder1_hidden = encoder1(input_tensor[ei], encoder1_hidden)
        else:
            encoder1_output, encoder1_hidden, encoder1_cell = encoder1(input_tensor[ei], encoder1_hidden, encoder1_cell)

        encoder1_outputs[ei] += encoder1_output[0, 0]

    # The decoder starts from the encoder's final state.
    decoder1_hidden = encoder1_hidden

    if encoder1.cell_type == 'lstm':
        decoder1_cell = encoder1_cell

    decoded_chars = ""

    # Predict the output using beam search
    predicted_seq = decoder1.beam_search(encoder1_outputs, decoder1_hidden, 5, SOS_token, EOS_token)

    # Convert the predicted sequence to a word
    for char_idx in predicted_seq:
        decoded_chars = decoded_chars + output_lang.index2char[char_idx]

    print(decoded_chars)

    



deligation
डेलिगेशन
सरिरर्ाEOS
