In [37]:
import torch
import torch.nn as nn
import numpy as np
import random

import os
import sys
import json
import csv

from tqdm import tqdm

In [38]:
# Melihat GPU yang tersedia dan penggunaannya.
!nvidia-smi --query-gpu=index,memory.used,memory.total,memory.free,utilization.gpu --format=csv,noheader | awk -F "," 'BEGIN{printf "%-3s \t%7s\t%5s\t%12s\n", "GPU", "Memory", "Free", "RAM"}{printf "%s:\t%6.2f%%\t%7s\t%s\n", $1, ($2/$3)*100, $4, $5}'

GPU 	 Memory	 Free	         RAM
0:	 14.03%	 34851 MiB	 0 %
1:	  0.01%	 40534 MiB	 0 %
2:	  2.73%	 39429 MiB	 0 %
3:	  0.01%	 40534 MiB	 0 %
4:	  0.01%	 40534 MiB	 0 %
5:	  0.01%	 40534 MiB	 0 %
6:	  0.01%	 40534 MiB	 0 %
7:	 91.09%	 3611 MiB	 57 %


In [39]:
# Memilih GPU yang akan digunakan (contohnya: GPU #7)
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

#Networks

###Encoder

In [40]:
class Encoder(nn.Module):
    '''
    Simple RNN based encoder network
    '''
    def __init__(self, input_dim, embed_dim, hidden_dim ,
                       rnn_type = 'gru', layers = 1,
                       bidirectional =False,
                       dropout = 0, device = "cpu"):
        super(Encoder, self).__init__()

        self.input_dim = input_dim #src_vocab_sz
        self.enc_embed_dim = embed_dim
        self.enc_hidden_dim = hidden_dim
        self.enc_rnn_type = rnn_type
        self.enc_layers = layers
        self.enc_directions = 2 if bidirectional else 1
        self.device = device

        self.embedding = nn.Embedding(self.input_dim, self.enc_embed_dim)

        if self.enc_rnn_type == "gru":
            self.enc_rnn = nn.GRU(input_size= self.enc_embed_dim,
                          hidden_size= self.enc_hidden_dim,
                          num_layers= self.enc_layers,
                          bidirectional= bidirectional)
        elif self.enc_rnn_type == "lstm":
            self.enc_rnn = nn.LSTM(input_size= self.enc_embed_dim,
                          hidden_size= self.enc_hidden_dim,
                          num_layers= self.enc_layers,
                          bidirectional= bidirectional)
        else:
            raise Exception("unknown RNN type mentioned")

    def forward(self, x, x_sz, hidden = None):
        '''
        x_sz: (batch_size, 1) -  Unpadded sequence lengths used for pack_pad

        Return:
            output: (batch_size, max_length, hidden_dim)
            hidden: (n_layer*num_directions, batch_size, hidden_dim) | if LSTM tuple -(h_n, c_n)

        '''
        batch_sz = x.shape[0]
        # x: batch_size, max_length, enc_embed_dim
        x = self.embedding(x)

        ## pack the padded data
        # x: max_length, batch_size, enc_embed_dim -> for pack_pad
        x = x.permute(1,0,2)
        x = nn.utils.rnn.pack_padded_sequence(x, x_sz, enforce_sorted=False) # unpad

        # output: packed_size, batch_size, enc_embed_dim --> hidden from all timesteps
        # hidden: n_layer**num_directions, batch_size, hidden_dim | if LSTM (h_n, c_n)
        output, hidden = self.enc_rnn(x)

        ## pad the sequence to the max length in the batch
        # output: max_length, batch_size, enc_emb_dim*directions)
        output, _ = nn.utils.rnn.pad_packed_sequence(output)

        # output: batch_size, max_length, hidden_dim
        output = output.permute(1,0,2)

        return output, hidden

###Decoder

In [41]:
class Decoder(nn.Module):
    '''
    Used as decoder stage
    '''
    def __init__(self, output_dim, embed_dim, hidden_dim,
                       rnn_type = 'gru', layers = 1,
                       use_attention = True,
                       enc_outstate_dim = None, # enc_directions * enc_hidden_dim
                       dropout = 0, device = "cpu"):
        super(Decoder, self).__init__()

        self.output_dim = output_dim #tgt_vocab_sz
        self.dec_hidden_dim = hidden_dim
        self.dec_embed_dim = embed_dim
        self.dec_rnn_type = rnn_type
        self.dec_layers = layers
        self.use_attention = use_attention
        self.device = device
        if self.use_attention:
            self.enc_outstate_dim = enc_outstate_dim if enc_outstate_dim else hidden_dim
        else:
            self.enc_outstate_dim = 0


        self.embedding = nn.Embedding(self.output_dim, self.dec_embed_dim)

        if self.dec_rnn_type == 'gru':
            self.dec_rnn = nn.GRU(input_size= self.dec_embed_dim + self.enc_outstate_dim, # to concat attention_output
                          hidden_size= self.dec_hidden_dim, # previous Hidden
                          num_layers= self.dec_layers,
                          batch_first = True )
        elif self.dec_rnn_type == "lstm":
            self.dec_rnn = nn.LSTM(input_size= self.dec_embed_dim + self.enc_outstate_dim, # to concat attention_output
                          hidden_size= self.dec_hidden_dim, # previous Hidden
                          num_layers= self.dec_layers,
                          batch_first = True )
        else:
            raise Exception("unknown RNN type mentioned")

        self.fc = nn.Sequential(
            nn.Linear(self.dec_hidden_dim, self.dec_embed_dim), nn.LeakyReLU(),
            # nn.Linear(self.dec_embed_dim, self.dec_embed_dim), nn.LeakyReLU(), # removing to reduce size
            nn.Linear(self.dec_embed_dim, self.output_dim),
            )

        ##----- Attention ----------
        if self.use_attention:
            self.W1 = nn.Linear( self.enc_outstate_dim, self.dec_hidden_dim)
            self.W2 = nn.Linear( self.dec_hidden_dim, self.dec_hidden_dim)
            self.V = nn.Linear( self.dec_hidden_dim, 1)

    def attention(self, x, hidden, enc_output):
        '''
        x: (batch_size, 1, dec_embed_dim) -> after Embedding
        enc_output: batch_size, max_length, enc_hidden_dim *num_directions
        hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n)
        '''

        ## perform addition to calculate the score

        # hidden_with_time_axis: batch_size, 1, hidden_dim
        ## hidden_with_time_axis = hidden.permute(1, 0, 2) ## replaced with below 2lines
        hidden_with_time_axis = torch.sum(hidden, axis=0)

        hidden_with_time_axis = hidden_with_time_axis.unsqueeze(1)

        # score: batch_size, max_length, hidden_dim
        score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))

        # attention_weights: batch_size, max_length, 1
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = torch.softmax(self.V(score), dim=1)

        # context_vector shape after sum == (batch_size, hidden_dim)
        context_vector = attention_weights * enc_output
        context_vector = torch.sum(context_vector, dim=1)
        # context_vector: batch_size, 1, hidden_dim
        context_vector = context_vector.unsqueeze(1)

        # attend_out (batch_size, 1, dec_embed_dim + hidden_size)
        attend_out = torch.cat((context_vector, x), -1)

        return attend_out, attention_weights

    def forward(self, x, hidden, enc_output):
        '''
        x: (batch_size, 1)
        enc_output: batch_size, max_length, dec_embed_dim
        hidden: n_layer, batch_size, hidden_size | lstm: (h_n, c_n)
        '''
        if (hidden is None) and (self.use_attention is False):
            raise Exception( "No use of a decoder with No attention and No Hidden")

        batch_sz = x.shape[0]

        if hidden is None:
            # hidden: n_layers, batch_size, hidden_dim
            hid_for_att = torch.zeros((self.dec_layers, batch_sz,
                                    self.dec_hidden_dim )).to(self.device)
        elif self.dec_rnn_type == 'lstm':
            hid_for_att = hidden[0] # h_n
        else:
            hid_for_att = hidden

        # x (batch_size, 1, dec_embed_dim) -> after embedding
        x = self.embedding(x)

        if self.use_attention:
            # x (batch_size, 1, dec_embed_dim + hidden_size) -> after attention
            # aw: (batch_size, max_length, 1)
            x, aw = self.attention( x, hid_for_att, enc_output)
        else:
            x, aw = x, 0

        # passing the concatenated vector to the GRU
        # output: (batch_size, n_layers, hidden_size)
        # hidden: n_layers, batch_size, hidden_size | if LSTM (h_n, c_n)
        output, hidden = self.dec_rnn(x, hidden) if hidden is not None else self.dec_rnn(x)

        # output :shp: (batch_size * 1, hidden_size)
        output =  output.view(-1, output.size(2))

        # output :shp: (batch_size * 1, output_dim)
        output = self.fc(output)

        return output, hidden, aw

### Seq2Seq Connection

In [42]:
class Seq2Seq(nn.Module):
    '''
    Used to construct seq2seq architecture with encoder decoder objects
    '''
    def __init__(self, encoder, decoder, pass_enc2dec_hid=False, dropout = 0, device = "cpu"):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.pass_enc2dec_hid = pass_enc2dec_hid

        if self.pass_enc2dec_hid:
            assert decoder.dec_hidden_dim == encoder.enc_hidden_dim, "Hidden Dimension of encoder and decoder must be same, or unset `pass_enc2dec_hid`"
        if decoder.use_attention:
            assert decoder.enc_outstate_dim == encoder.enc_directions*encoder.enc_hidden_dim,"Set `enc_out_dim` correctly in decoder"
        assert self.pass_enc2dec_hid or decoder.use_attention, "No use of a decoder with No attention and No Hidden from Encoder"


    def forward(self, src, tgt, src_sz, teacher_forcing_ratio = 0):
        '''
        src: (batch_size, sequence_len.padded)
        tgt: (batch_size, sequence_len.padded)
        src_sz: [batch_size, 1] -  Unpadded sequence lengths
        '''
        batch_size = tgt.shape[0]

        # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction)
        # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim)
        enc_output, enc_hidden = self.encoder(src, src_sz)

        if self.pass_enc2dec_hid:
           # dec_hidden: dec_layers, batch_size , dec_hidden_dim
            dec_hidden = enc_hidden
        else:
            # dec_hidden -> Will be initialized to zeros internally
            dec_hidden = None

        # pred_vecs: (batch_size, output_dim, sequence_sz) -> shape required for CELoss
        pred_vecs = torch.zeros(batch_size, self.decoder.output_dim, tgt.size(1)).to(self.device)

        # dec_input: (batch_size, 1)
        dec_input = tgt[:,0].unsqueeze(1) # initialize to start token
        pred_vecs[:,1,0] = 1 # Initialize to start tokens all batches
        for t in range(1, tgt.size(1)):
            # dec_hidden: dec_layers, batch_size , dec_hidden_dim
            # dec_output: batch_size, output_dim
            # dec_input: (batch_size, 1)
            dec_output, dec_hidden, _ = self.decoder( dec_input,
                                               dec_hidden,
                                               enc_output,  )
            pred_vecs[:,:,t] = dec_output

            # # prediction: batch_size
            prediction = torch.argmax(dec_output, dim=1)

            # Teacher Forcing
            if random.random() < teacher_forcing_ratio:
                dec_input = tgt[:, t].unsqueeze(1)
            else:
                dec_input = prediction.unsqueeze(1)

        return pred_vecs #(batch_size, output_dim, sequence_sz)

    def inference(self, src, max_tgt_sz=50, debug = 0):
        '''
        single input only, No batch Inferencing
        src: (sequence_len)
        debug: if True will return attention weights also
        '''
        batch_size = 1
        start_tok = src[0]
        end_tok = src[-1]
        src_sz = torch.tensor([len(src)])
        src_ = src.unsqueeze(0)

        # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction)
        # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim)
        enc_output, enc_hidden = self.encoder(src_, src_sz)

        if self.pass_enc2dec_hid:
            # dec_hidden: dec_layers, batch_size , dec_hidden_dim
            dec_hidden = enc_hidden
        else:
            # dec_hidden -> Will be initialized to zeros internally
            dec_hidden = None

        # pred_arr: (sequence_sz, 1) -> shape required for CELoss
        pred_arr = torch.zeros(max_tgt_sz, 1).to(self.device)
        if debug: attend_weight_arr = torch.zeros(max_tgt_sz, len(src)).to(self.device)

        # dec_input: (batch_size, 1)
        dec_input = start_tok.view(1,1) # initialize to start token
        pred_arr[0] = start_tok.view(1,1) # initialize to start token
        for t in range(max_tgt_sz):
            # dec_hidden: dec_layers, batch_size , dec_hidden_dim
            # dec_output: batch_size, output_dim
            # dec_input: (batch_size, 1)
            dec_output, dec_hidden, aw = self.decoder( dec_input,
                                               dec_hidden,
                                               enc_output,  )
            # prediction :shp: (1,1)
            prediction = torch.argmax(dec_output, dim=1)
            dec_input = prediction.unsqueeze(1)
            pred_arr[t] = prediction
            if debug: attend_weight_arr[t] = aw.squeeze(-1)

            if torch.eq(prediction, end_tok):
                break

        if debug: return pred_arr.squeeze(), attend_weight_arr
        # pred_arr :shp: (sequence_len)
        return pred_arr.squeeze().to(dtype=torch.long)


    def active_beam_inference(self, src, beam_width=3, max_tgt_sz=50):
        ''' Active beam Search based decoding
        src: (sequence_len)
        '''
        def _avg_score(p_tup):
            ''' Used for Sorting
            TODO: Dividing by length of sequence power alpha as hyperparam
            '''
            return p_tup[0]

        batch_size = 1
        start_tok = src[0]
        end_tok = src[-1]
        src_sz = torch.tensor([len(src)])
        src_ = src.unsqueeze(0)

        # enc_output: (batch_size, padded_seq_length, enc_hidden_dim*num_direction)
        # enc_hidden: (enc_layers*num_direction, batch_size, hidden_dim)
        enc_output, enc_hidden = self.encoder(src_, src_sz)

        if self.pass_enc2dec_hid:
            # dec_hidden: dec_layers, batch_size , dec_hidden_dim
            init_dec_hidden = enc_hidden
        else:
            # dec_hidden -> Will be initialized to zeros internally
            init_dec_hidden = None

        # top_pred[][0] = Σ-log_softmax
        # top_pred[][1] = sequence torch.tensor shape: (1)
        # top_pred[][2] = dec_hidden
        top_pred_list = [ (0, start_tok.unsqueeze(0) , init_dec_hidden) ]

        for t in range(max_tgt_sz):
            cur_pred_list = []

            for p_tup in top_pred_list:
                if p_tup[1][-1] == end_tok:
                    cur_pred_list.append(p_tup)
                    continue

                # dec_hidden: dec_layers, 1, hidden_dim
                # dec_output: 1, output_dim
                dec_output, dec_hidden, _ = self.decoder( x = p_tup[1][-1].view(1,1), #dec_input: (1,1)
                                                    hidden = p_tup[2],
                                                    enc_output = enc_output, )

                ## π{prob} = Σ{log(prob)} -> to prevent diminishing
                # dec_output: (1, output_dim)
                dec_output = nn.functional.log_softmax(dec_output, dim=1)
                # pred_topk.values & pred_topk.indices: (1, beam_width)
                pred_topk = torch.topk(dec_output, k=beam_width, dim=1)

                for i in range(beam_width):
                    sig_logsmx_ = p_tup[0] + pred_topk.values[0][i]
                    # seq_tensor_ : (seq_len)
                    seq_tensor_ = torch.cat( (p_tup[1], pred_topk.indices[0][i].view(1)) )

                    cur_pred_list.append( (sig_logsmx_, seq_tensor_, dec_hidden) )

            cur_pred_list.sort(key = _avg_score, reverse =True) # Maximized order
            top_pred_list = cur_pred_list[:beam_width]

            # check if end_tok of all topk
            end_flags_ = [1 if t[1][-1] == end_tok else 0 for t in top_pred_list]
            if beam_width == sum( end_flags_ ): break

        pred_tnsr_list = [t[1] for t in top_pred_list ]

        return pred_tnsr_list

    def passive_beam_inference(self, src, beam_width = 7, max_tgt_sz=50):
        '''
        Passive Beam search based inference
        src: (sequence_len)
        '''
        def _avg_score(p_tup):
            ''' Used for Sorting
            TODO: Dividing by length of sequence power alpha as hyperparam
            '''
            return  p_tup[0]

        def _beam_search_topk(topk_obj, start_tok, beam_width):
            ''' search for sequence with maxim prob
            topk_obj[x]: .values & .indices shape:(1, beam_width)
            '''
            # top_pred_list[x]: tuple(prob, seq_tensor)
            top_pred_list = [ (0, start_tok.unsqueeze(0) ), ]

            for obj in topk_obj:
                new_lst_ = list()
                for itm in top_pred_list:
                    for i in range(beam_width):
                        sig_logsmx_ = itm[0] + obj.values[0][i]
                        seq_tensor_ = torch.cat( (itm[1] , obj.indices[0][i].view(1) ) )
                        new_lst_.append( (sig_logsmx_, seq_tensor_) )

                new_lst_.sort(key = _avg_score, reverse =True)
                top_pred_list = new_lst_[:beam_width]
            return top_pred_list

        batch_size = 1
        start_tok = src[0]
        end_tok = src[-1]
        src_sz = torch.tensor([len(src)])
        src_ = src.unsqueeze(0)

        enc_output, enc_hidden = self.encoder(src_, src_sz)

        if self.pass_enc2dec_hid:
            # dec_hidden: dec_layers, batch_size , dec_hidden_dim
            dec_hidden = enc_hidden
        else:
            # dec_hidden -> Will be initialized to zeros internally
            dec_hidden = None

        # dec_input: (1, 1)
        dec_input = start_tok.view(1,1) # initialize to start token


        topk_obj = []
        for t in range(max_tgt_sz):
            dec_output, dec_hidden, aw = self.decoder( dec_input,
                                               dec_hidden,
                                               enc_output,  )

            ## π{prob} = Σ{log(prob)} -> to prevent diminishing
            # dec_output: (1, output_dim)
            dec_output = nn.functional.log_softmax(dec_output, dim=1)
            # pred_topk.values & pred_topk.indices: (1, beam_width)
            pred_topk = torch.topk(dec_output, k=beam_width, dim=1)

            topk_obj.append(pred_topk)

            # dec_input: (1, 1)
            dec_input = pred_topk.indices[0][0].view(1,1)
            if torch.eq(dec_input, end_tok):
                break

        top_pred_list = _beam_search_topk(topk_obj, start_tok, beam_width)
        pred_tnsr_list = [t[1] for t in top_pred_list ]

        return pred_tnsr_list


#Data Handling

### Unicodes

Add necessary Unicodes for specific script(langauge) below as a list.

### Glyph handler

In [139]:
class GlyphStrawboss():
    def __init__(self, lang_script):
        """ list of letters in a language in unicode
        lang: List with unicodes
        """
        self.glyphs = lang_script

        self.char2idx = {}
        self.idx2char = {}
        self._create_index()

    def _create_index(self):

        self.char2idx['_'] = 0  #pad
        self.char2idx['$'] = 1  #start
        self.char2idx['#'] = 2  #end
        self.char2idx['*'] = 3  #Mask
        self.char2idx["'"] = 4  #apostrophe U+0027
        self.char2idx['%'] = 5  #unused
        self.char2idx['!'] = 6  #unused
        
        self.glyphs = [char for char in self.glyphs if char not in self.char2idx]

        # letter to index mapping
        for idx, char in enumerate(self.glyphs):
            self.char2idx[char] = idx + 7 # +7 token initially

        # index to letter mapping
        for char, idx in self.char2idx.items():
            self.idx2char[idx] = char

    def size(self):
        return len(self.char2idx)


    def word2xlitvec(self, word):
        """ Converts given string of gyphs(word) to vector(numpy)
        Also adds tokens for start and end
        """
        try:
            vec = [self.char2idx['$']] #start token
            for i in list(word):
                vec.append(self.char2idx[i])
            vec.append(self.char2idx['#']) #end token

            vec = np.asarray(vec, dtype=np.int64)
            return vec

        except Exception as error:
            print("Error In word:", word, "Error Char not in Token:", error)
            sys.exit()

    def xlitvec2word(self, vector):
        """ Converts vector(numpy) to string of glyphs(word)
        """
        char_list = []
        for i in vector:
            char_list.append(self.idx2char[i])

        word = "".join(char_list).replace('$','').replace('#','') # remove tokens
        word = word.replace("_", "").replace('*','') # remove tokens
        return word

Datasets & Dataloaders

In [122]:
TRAIN_FILE = "dataset.json"

with open(TRAIN_FILE, 'r') as f:
    old_json = json.load(f)
new_json = {}

for elem in old_json:
    new_json[elem[0].lower()] = [elem[1]]

TRAIN_FILE = "train_dataset.json"
TEST_FILE = "test_dataset.json"

with open(TRAIN_FILE, "w") as outfile:
    json.dump(new_json, outfile)

with open(TEST_FILE, "w") as outfile:
    json.dump(dict(list(new_json.items())[:int(len(new_json)/1000)]), outfile)

In [155]:
indoarab_num = [chr(alpha) for alpha in range(48, 58)]

english_lower_script = set()

jawi_script = jawi_script = {
    chr(0x200c), # ZeroWidth-NonJoiner U+200c
    chr(0x200d), # ZeroWidthJoiner U+200d
}

for elem in old_json:
  for char in elem[0]:
    english_lower_script.add(char.lower())

  for char in elem[1]:
    jawi_script.add(char)

english_lower_script, jawi_script = list(english_lower_script), list(jawi_script)

In [156]:
# from `Datahandling` section
# with open('jawi_glyph.pkl', "rb") as file:
#     src_glyph = pickle.load(file)
# with open('rumi_glyph.pkl', "rb") as file:
#     tgt_glyph = pickle.load(file)

src_glyph = GlyphStrawboss(jawi_script)
tgt_glyph = GlyphStrawboss(english_lower_script)

### Dataset Class

In [157]:
from torch.utils.data import Dataset

class XlitData(Dataset):
    """ Backtransliteration from English to Native Language
    JSON format only
    depends on: Numpy
    """
    def __init__(self, src_glyph_obj, tgt_glyph_obj,
                    json_file, file_map = "LangEn",
                    padding = True, max_seq_size = None,
                 ):
        """
        padding: Set True if Padding with zeros is required for Batching
        max_seq_size: Size for Padding both input and output, Longer words will be truncated
                      If unset computes maximum of source, target seperate
        """
        #Load data
        if file_map == "LangEn": # output-input
            tgt_str, src_str = self._json2_k_v(json_file)
        elif file_map == "EnLang": # input-output
            src_str, tgt_str = self._json2_k_v(json_file)
        else:
            raise Exception('Unknown JSON structure')

        self.src_glyph = src_glyph_obj
        self.tgt_glyph = tgt_glyph_obj

        __svec = self.src_glyph.word2xlitvec
        __tvec = self.tgt_glyph.word2xlitvec
        self.src = [ __svec(s)  for s in src_str]
        self.tgt = [ __tvec(s)  for s in tgt_str]

        self.tgt_class_weights = self._char_class_weights(self.tgt)

        self.padding = padding
        if max_seq_size:
            self.max_tgt_size = max_seq_size
            self.max_src_size = max_seq_size
        else:
            self.max_src_size = max(len(t) for t in self.src)
            self.max_tgt_size = max(len(t) for t in self.tgt)

    def __getitem__(self, index):
        x_sz = len(self.src[index])
        y_sz = len(self.tgt[index])
        if self.padding:
            x = self._pad_sequence(self.src[index], self.max_src_size)
            y = self._pad_sequence(self.tgt[index], self.max_tgt_size)
        else:
            x = self.src[index]
            y = self.tgt[index]
        return x,y, x_sz

    def __len__(self):
        return len(self.src)


    def _json2_k_v(self, json_file):
        ''' Convert JSON lang pairs to Key-Value lists with indexwise one2one correspondance
        '''
        with open(json_file, 'r', encoding = "utf-8") as f:
            data = json.load(f)

        x = []; y = []
        for k in data:
            for v in data[k]:
                x.append(k); y.append(v)

        return x, y


    def _pad_sequence(self, x, max_len):
        """ Pad sequence to maximum length;
        Pads zero if word < max
        Clip word if word > max
        """
        padded = np.zeros((max_len), dtype=np.int64)
        if len(x) > max_len: padded[:] = x[:max_len]
        else: padded[:len(x)] = x
        return padded

    def _char_class_weights(self, x_list, scale = 10):
        """For handling class imbalance in the characters
        Return: 1D-tensor will be fed to CEloss weights for error calculation
        """
        from collections import Counter
        full_list = []
        for x in x_list:
            full_list += list(x)
        count_dict = dict(Counter(full_list))

        class_weights = np.ones(self.tgt_glyph.size(), dtype = np.float32)
        
        for k in count_dict:
            class_weights[k] = (1/count_dict[k]) * scale

        return class_weights

### Merge JSON

In [158]:
def merge_xlit_jsons(filepath_list, save_prefix = ""):
    """
    Merge JSON files into single file wrt keys
    """
    data_list = []
    for fpath in filepath_list:
        with open(fpath, 'r', encoding = "utf-8") as f:
            data_list.append(json.load(f))

    whole_dict = dict()
    for dat in data_list:
        for dk in dat:
            whole_dict[dk] = set()

    for dat in data_list:
        for dk in dat:
            whole_dict[dk].update(dat[dk])

    for k in whole_dict:
        whole_dict[k] = list(whole_dict[k])

    print("Total Key count:", len(whole_dict))
    save_path = save_prefix+"merged_file.json"
    with open(save_path,"w", encoding = "utf-8") as f:
        json.dump(whole_dict, f, ensure_ascii=False, indent=4, sort_keys=True,)

    return save_path

#Utilities

In [159]:
def LOG2CSV(data, csv_file, flag = 'a'):
    '''
    data: List of elements to be written
    '''
    with open(csv_file, flag) as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(data)
    csvFile.close()

### Weights related utils

In [160]:
def load_pretrained(model, weight_path, device, flexible = False):
    if not weight_path:
        return model

    pretrain_dict = torch.load(weight_path) if device == 'cuda' else torch.load(weight_path, map_location=torch.device('cpu'))
    model_dict = model.state_dict()
    if flexible:
        pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in model_dict}
    print("Pretrained layers:", pretrain_dict.keys())
    model_dict.update(pretrain_dict)
    model.load_state_dict(model_dict)

    return model

def count_train_param(model):
    train_params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('The model has {} trainable parameters'.format(train_params_count))
    return train_params_count

def freeze_params(model, exclusion_list = []):
    ## TODO: Exclusion lists
    for param in model.parameters():
        param.requires_grad = False
    return model


### Accuracy Estimation

In [161]:
def accuracy_score(pred_tnsr, tgt_tnsr, glyph_obj):
    '''Simple accuracy calculation for char2char seq TRAINING phase
    pred_tnsr: torch tensor :shp: (batch, voc_size, seq_len)
    tgt_tnsr: torch tensor :shp: (batch, seq_len)
    '''
    pred_seq = torch.argmax(pred_tnsr, dim=1)
    batch_sz = pred_seq.shape[0]
    crt_cnt = 0
    for i in range(batch_sz):
        pred = glyph_obj.xlitvec2word(pred_seq[i,:].cpu().numpy())
        tgt = glyph_obj.xlitvec2word(tgt_tnsr[i,:].cpu().numpy())
        if pred == tgt:
            crt_cnt += 1
    return torch.tensor(crt_cnt/batch_sz)

#Training Stage

##Configuration

In [224]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

INST_NAME = "Training_2"
LOG_PATH = INST_NAME + "/"
WGT_PREFIX = LOG_PATH+"weights/"+INST_NAME
if not os.path.exists(LOG_PATH+"weights"): os.makedirs(LOG_PATH+"weights")

Hyperparameters

In [241]:
num_epochs = 5
batch_size = 512  # Remember to run data objects creation on changing this
acc_grad = 1
learning_rate = 1e-3
teacher_forcing, teach_force_till, teach_decay_pereph = 1, 20, 0
pretrain_wgt_path = 'Training_2/weights/Training_2_model.pth'

Network Architecture

In [242]:
input_dim = src_glyph.size() + 1
output_dim = tgt_glyph.size() + 1
enc_emb_dim = 300
dec_emb_dim = 300
enc_hidden_dim = 512
dec_hidden_dim = 512
rnn_type = "lstm"
enc2dec_hid = True
attention = True
enc_layers = 1
dec_layers = 2
m_dropout = 0
enc_bidirect = True
enc_outstate_dim = enc_hidden_dim * (2 if enc_bidirect else 1)

### Instantiation

Dataset objects creation

In [226]:
from torch.utils.data import DataLoader

# train_file = merge_xlit_jsons(["data/hindi/HiEn_train1.json",
#                                 "data/hindi/HiEn_train2.json" ],
#                                 save_prefix= LOG_PATH)

train_dataset = XlitData(src_glyph_obj = src_glyph, tgt_glyph_obj = tgt_glyph,
                        json_file=TRAIN_FILE, file_map = "LangEn",
                        padding=True)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                shuffle=True, num_workers=0)

# val_dataset = XlitData( src_glyph_obj = src_glyph, tgt_glyph_obj = tgt_glyph,
#                         json_file= VALID_FILE, file_map = "LangEn",
#                         padding=True)


# val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
#                                 shuffle=True, num_workers=0)

# for i in range(len(train_dataset)):
#     print(train_dataset.__getitem__(i))


Network Models creation

In [243]:
enc = Encoder(  input_dim= input_dim, embed_dim = enc_emb_dim,
                hidden_dim= enc_hidden_dim,
                rnn_type = rnn_type, layers= enc_layers,
                dropout= m_dropout, device = device,
                bidirectional= enc_bidirect)
dec = Decoder(  output_dim= output_dim, embed_dim = dec_emb_dim,
                hidden_dim= dec_hidden_dim,
                rnn_type = rnn_type, layers= dec_layers,
                dropout= m_dropout,
                use_attention = attention,
                enc_outstate_dim= enc_outstate_dim,
                device = device,)

model = Seq2Seq(enc, dec, pass_enc2dec_hid=enc2dec_hid,
                device=device)
model = model.to(device)

model = load_pretrained(model, pretrain_wgt_path, device) #if path empty returns unmodified

## ----- Load Embeds -----
### For Loading charecter embedding from pretrained fasttext model

# hi_emb_vecs = np.load("hi_char_fasttext.npy")
# model.decoder.embedding.weight.data.copy_(torch.from_numpy(hi_emb_vecs))

# en_emb_vecs = np.load("en_char_fasttext.npy")
# model.encoder.embedding.weight.data.copy_(torch.from_numpy(en_emb_vecs))

Pretrained layers: odict_keys(['encoder.embedding.weight', 'encoder.enc_rnn.weight_ih_l0', 'encoder.enc_rnn.weight_hh_l0', 'encoder.enc_rnn.bias_ih_l0', 'encoder.enc_rnn.bias_hh_l0', 'encoder.enc_rnn.weight_ih_l0_reverse', 'encoder.enc_rnn.weight_hh_l0_reverse', 'encoder.enc_rnn.bias_ih_l0_reverse', 'encoder.enc_rnn.bias_hh_l0_reverse', 'decoder.embedding.weight', 'decoder.dec_rnn.weight_ih_l0', 'decoder.dec_rnn.weight_hh_l0', 'decoder.dec_rnn.bias_ih_l0', 'decoder.dec_rnn.bias_hh_l0', 'decoder.dec_rnn.weight_ih_l1', 'decoder.dec_rnn.weight_hh_l1', 'decoder.dec_rnn.bias_ih_l1', 'decoder.dec_rnn.bias_hh_l1', 'decoder.fc.0.weight', 'decoder.fc.0.bias', 'decoder.fc.2.weight', 'decoder.fc.2.bias', 'decoder.W1.weight', 'decoder.W1.bias', 'decoder.W2.weight', 'decoder.W2.bias', 'decoder.V.weight', 'decoder.V.bias'])


In [233]:
count_train_param(model)
print(model)

The model has 10180521 trainable parameters
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(58, 300)
    (enc_rnn): LSTM(300, 512, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(36, 300)
    (dec_rnn): LSTM(1324, 512, num_layers=2, batch_first=True)
    (fc): Sequential(
      (0): Linear(in_features=512, out_features=300, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
      (2): Linear(in_features=300, out_features=36, bias=True)
    )
    (W1): Linear(in_features=1024, out_features=512, bias=True)
    (W2): Linear(in_features=512, out_features=512, bias=True)
    (V): Linear(in_features=512, out_features=1, bias=True)
  )
)


## Optimization Setup

In [234]:

criterion = torch.nn.CrossEntropyLoss()
    # weight = torch.from_numpy(train_dataset.tgt_class_weights).to(device)  )  ## For class balancing during training

def loss_estimator(pred, truth):
    """ Only consider non-zero inputs in the loss; mask needed
    pred: batch
    """
    mask = truth.ge(1).type(torch.FloatTensor).to(device)
    loss_ = criterion(pred, truth) * mask
    return torch.mean(loss_)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate,
                             weight_decay=0)

# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)


## Run Training

In [235]:
best_loss = float("inf")
best_accuracy = 0
for epoch in range(num_epochs):

    #-------- Training -------------------
    model.train()
    acc_loss = 0
    running_loss = []
    if epoch >= teach_force_till: teacher_forcing = 0
    else: teacher_forcing = max(0, teacher_forcing - teach_decay_pereph)

    for ith, (src, tgt, src_sz) in enumerate(train_dataloader):

        src = src.to(device)
        tgt = tgt.to(device)

        #--- forward ------
        output = model(src = src, tgt = tgt, src_sz =src_sz,
                       teacher_forcing_ratio = teacher_forcing)
        loss = loss_estimator(output, tgt) / acc_grad
        acc_loss += loss

        #--- backward ------
        loss.backward()
        if ( (ith+1) % acc_grad == 0):
            optimizer.step()
            optimizer.zero_grad()

            print('epoch[{}/{}], MiniBatch-{} loss:{:.4f}'
                .format(epoch+1, num_epochs, (ith+1)//acc_grad, acc_loss.data))
            running_loss.append(acc_loss.item())
            acc_loss=0
            # break

    LOG2CSV(running_loss, LOG_PATH+"trainLoss.csv")

    #--------- Validate ---------------------
    model.eval()
    val_loss = 0
    val_accuracy = 0
    for jth, (v_src, v_tgt, v_src_sz) in enumerate(tqdm(train_dataloader)):
        v_src = v_src.to(device)
        v_tgt = v_tgt.to(device)
        with torch.no_grad():
            v_output = model(src = v_src, tgt = v_tgt, src_sz = v_src_sz)
            val_loss += loss_estimator(v_output, v_tgt)

            val_accuracy += accuracy_score(v_output, v_tgt, tgt_glyph) # in Utils section
        # break
    val_loss = val_loss / len(train_dataloader)
    val_accuracy = val_accuracy / len(train_dataloader)

    print('epoch[{}/{}], [-----TEST------] loss:{:.4f}  Accur:{:.4f}'
            .format(epoch+1, num_epochs, val_loss.data, val_accuracy.data))
    LOG2CSV([val_loss.item(), val_accuracy.item()],
                LOG_PATH+"valLoss.csv")

    #-------- save Checkpoint -------------------
    if val_accuracy > best_accuracy:
    # if val_loss < best_loss:
        print("***saving best optimal state [Loss:{} Accur:{}] ***".format(val_loss.data,val_accuracy.data) )
        best_loss = val_loss
        best_accuracy = val_accuracy
        torch.save(model.state_dict(), WGT_PREFIX+"_model.pth")
        LOG2CSV([epoch+1, val_loss.item(), val_accuracy.item()],
                LOG_PATH+"bestCheckpoint.csv")

    # LR step
    # scheduler.step()

epoch[1/5], MiniBatch-1 loss:0.0152
epoch[1/5], MiniBatch-2 loss:0.0152
epoch[1/5], MiniBatch-3 loss:0.0155
epoch[1/5], MiniBatch-4 loss:0.0151
epoch[1/5], MiniBatch-5 loss:0.0156
epoch[1/5], MiniBatch-6 loss:0.0156
epoch[1/5], MiniBatch-7 loss:0.0154
epoch[1/5], MiniBatch-8 loss:0.0154
epoch[1/5], MiniBatch-9 loss:0.0156
epoch[1/5], MiniBatch-10 loss:0.0158
epoch[1/5], MiniBatch-11 loss:0.0151
epoch[1/5], MiniBatch-12 loss:0.0154
epoch[1/5], MiniBatch-13 loss:0.0154
epoch[1/5], MiniBatch-14 loss:0.0159
epoch[1/5], MiniBatch-15 loss:0.0158
epoch[1/5], MiniBatch-16 loss:0.0158
epoch[1/5], MiniBatch-17 loss:0.0156
epoch[1/5], MiniBatch-18 loss:0.0159
epoch[1/5], MiniBatch-19 loss:0.0154
epoch[1/5], MiniBatch-20 loss:0.0158
epoch[1/5], MiniBatch-21 loss:0.0153
epoch[1/5], MiniBatch-22 loss:0.0157
epoch[1/5], MiniBatch-23 loss:0.0161
epoch[1/5], MiniBatch-24 loss:0.0154
epoch[1/5], MiniBatch-25 loss:0.0156
epoch[1/5], MiniBatch-26 loss:0.0151
epoch[1/5], MiniBatch-27 loss:0.0157
epoch[1/5]

epoch[1/5], MiniBatch-221 loss:0.0153
epoch[1/5], MiniBatch-222 loss:0.0155
epoch[1/5], MiniBatch-223 loss:0.0157
epoch[1/5], MiniBatch-224 loss:0.0151
epoch[1/5], MiniBatch-225 loss:0.0156
epoch[1/5], MiniBatch-226 loss:0.0158
epoch[1/5], MiniBatch-227 loss:0.0159
epoch[1/5], MiniBatch-228 loss:0.0162
epoch[1/5], MiniBatch-229 loss:0.0155
epoch[1/5], MiniBatch-230 loss:0.0160
epoch[1/5], MiniBatch-231 loss:0.0157
epoch[1/5], MiniBatch-232 loss:0.0163
epoch[1/5], MiniBatch-233 loss:0.0156
epoch[1/5], MiniBatch-234 loss:0.0155
epoch[1/5], MiniBatch-235 loss:0.0155
epoch[1/5], MiniBatch-236 loss:0.0161
epoch[1/5], MiniBatch-237 loss:0.0155
epoch[1/5], MiniBatch-238 loss:0.0162
epoch[1/5], MiniBatch-239 loss:0.0158
epoch[1/5], MiniBatch-240 loss:0.0159
epoch[1/5], MiniBatch-241 loss:0.0159
epoch[1/5], MiniBatch-242 loss:0.0157
epoch[1/5], MiniBatch-243 loss:0.0158
epoch[1/5], MiniBatch-244 loss:0.0161
epoch[1/5], MiniBatch-245 loss:0.0159
epoch[1/5], MiniBatch-246 loss:0.0161
epoch[1/5], 

100%|██████████| 381/381 [00:42<00:00,  8.89it/s]


epoch[1/5], [-----TEST------] loss:0.0339  Accur:0.9260
***saving best optimal state [Loss:0.033886540681123734 Accur:0.92596435546875] ***
epoch[2/5], MiniBatch-1 loss:0.0155
epoch[2/5], MiniBatch-2 loss:0.0155
epoch[2/5], MiniBatch-3 loss:0.0156
epoch[2/5], MiniBatch-4 loss:0.0155
epoch[2/5], MiniBatch-5 loss:0.0156
epoch[2/5], MiniBatch-6 loss:0.0154
epoch[2/5], MiniBatch-7 loss:0.0156
epoch[2/5], MiniBatch-8 loss:0.0155
epoch[2/5], MiniBatch-9 loss:0.0157
epoch[2/5], MiniBatch-10 loss:0.0153
epoch[2/5], MiniBatch-11 loss:0.0157
epoch[2/5], MiniBatch-12 loss:0.0153
epoch[2/5], MiniBatch-13 loss:0.0150
epoch[2/5], MiniBatch-14 loss:0.0153
epoch[2/5], MiniBatch-15 loss:0.0158
epoch[2/5], MiniBatch-16 loss:0.0156
epoch[2/5], MiniBatch-17 loss:0.0155
epoch[2/5], MiniBatch-18 loss:0.0149
epoch[2/5], MiniBatch-19 loss:0.0153
epoch[2/5], MiniBatch-20 loss:0.0151
epoch[2/5], MiniBatch-21 loss:0.0153
epoch[2/5], MiniBatch-22 loss:0.0157
epoch[2/5], MiniBatch-23 loss:0.0156
epoch[2/5], MiniBa

epoch[2/5], MiniBatch-217 loss:0.0159
epoch[2/5], MiniBatch-218 loss:0.0156
epoch[2/5], MiniBatch-219 loss:0.0155
epoch[2/5], MiniBatch-220 loss:0.0159
epoch[2/5], MiniBatch-221 loss:0.0157
epoch[2/5], MiniBatch-222 loss:0.0160
epoch[2/5], MiniBatch-223 loss:0.0153
epoch[2/5], MiniBatch-224 loss:0.0157
epoch[2/5], MiniBatch-225 loss:0.0158
epoch[2/5], MiniBatch-226 loss:0.0158
epoch[2/5], MiniBatch-227 loss:0.0155
epoch[2/5], MiniBatch-228 loss:0.0157
epoch[2/5], MiniBatch-229 loss:0.0161
epoch[2/5], MiniBatch-230 loss:0.0157
epoch[2/5], MiniBatch-231 loss:0.0155
epoch[2/5], MiniBatch-232 loss:0.0155
epoch[2/5], MiniBatch-233 loss:0.0163
epoch[2/5], MiniBatch-234 loss:0.0160
epoch[2/5], MiniBatch-235 loss:0.0161
epoch[2/5], MiniBatch-236 loss:0.0160
epoch[2/5], MiniBatch-237 loss:0.0158
epoch[2/5], MiniBatch-238 loss:0.0156
epoch[2/5], MiniBatch-239 loss:0.0157
epoch[2/5], MiniBatch-240 loss:0.0157
epoch[2/5], MiniBatch-241 loss:0.0156
epoch[2/5], MiniBatch-242 loss:0.0155
epoch[2/5], 

100%|██████████| 381/381 [00:43<00:00,  8.79it/s]


epoch[2/5], [-----TEST------] loss:0.0322  Accur:0.9331
***saving best optimal state [Loss:0.032230038195848465 Accur:0.9331367611885071] ***
epoch[3/5], MiniBatch-1 loss:0.0152
epoch[3/5], MiniBatch-2 loss:0.0153
epoch[3/5], MiniBatch-3 loss:0.0150
epoch[3/5], MiniBatch-4 loss:0.0154
epoch[3/5], MiniBatch-5 loss:0.0151
epoch[3/5], MiniBatch-6 loss:0.0151
epoch[3/5], MiniBatch-7 loss:0.0152
epoch[3/5], MiniBatch-8 loss:0.0157
epoch[3/5], MiniBatch-9 loss:0.0151
epoch[3/5], MiniBatch-10 loss:0.0154
epoch[3/5], MiniBatch-11 loss:0.0157
epoch[3/5], MiniBatch-12 loss:0.0153
epoch[3/5], MiniBatch-13 loss:0.0154
epoch[3/5], MiniBatch-14 loss:0.0156
epoch[3/5], MiniBatch-15 loss:0.0154
epoch[3/5], MiniBatch-16 loss:0.0151
epoch[3/5], MiniBatch-17 loss:0.0153
epoch[3/5], MiniBatch-18 loss:0.0153
epoch[3/5], MiniBatch-19 loss:0.0155
epoch[3/5], MiniBatch-20 loss:0.0157
epoch[3/5], MiniBatch-21 loss:0.0152
epoch[3/5], MiniBatch-22 loss:0.0155
epoch[3/5], MiniBatch-23 loss:0.0153
epoch[3/5], Mini

epoch[3/5], MiniBatch-217 loss:0.0155
epoch[3/5], MiniBatch-218 loss:0.0153
epoch[3/5], MiniBatch-219 loss:0.0150
epoch[3/5], MiniBatch-220 loss:0.0155
epoch[3/5], MiniBatch-221 loss:0.0153
epoch[3/5], MiniBatch-222 loss:0.0152
epoch[3/5], MiniBatch-223 loss:0.0159
epoch[3/5], MiniBatch-224 loss:0.0163
epoch[3/5], MiniBatch-225 loss:0.0159
epoch[3/5], MiniBatch-226 loss:0.0155
epoch[3/5], MiniBatch-227 loss:0.0161
epoch[3/5], MiniBatch-228 loss:0.0154
epoch[3/5], MiniBatch-229 loss:0.0157
epoch[3/5], MiniBatch-230 loss:0.0157
epoch[3/5], MiniBatch-231 loss:0.0156
epoch[3/5], MiniBatch-232 loss:0.0155
epoch[3/5], MiniBatch-233 loss:0.0154
epoch[3/5], MiniBatch-234 loss:0.0158
epoch[3/5], MiniBatch-235 loss:0.0159
epoch[3/5], MiniBatch-236 loss:0.0158
epoch[3/5], MiniBatch-237 loss:0.0160
epoch[3/5], MiniBatch-238 loss:0.0159
epoch[3/5], MiniBatch-239 loss:0.0158
epoch[3/5], MiniBatch-240 loss:0.0157
epoch[3/5], MiniBatch-241 loss:0.0160
epoch[3/5], MiniBatch-242 loss:0.0158
epoch[3/5], 

100%|██████████| 381/381 [00:42<00:00,  8.98it/s]


epoch[3/5], [-----TEST------] loss:0.0330  Accur:0.9289
epoch[4/5], MiniBatch-1 loss:0.0152
epoch[4/5], MiniBatch-2 loss:0.0149
epoch[4/5], MiniBatch-3 loss:0.0152
epoch[4/5], MiniBatch-4 loss:0.0155
epoch[4/5], MiniBatch-5 loss:0.0152
epoch[4/5], MiniBatch-6 loss:0.0154
epoch[4/5], MiniBatch-7 loss:0.0151
epoch[4/5], MiniBatch-8 loss:0.0152
epoch[4/5], MiniBatch-9 loss:0.0152
epoch[4/5], MiniBatch-10 loss:0.0152
epoch[4/5], MiniBatch-11 loss:0.0153
epoch[4/5], MiniBatch-12 loss:0.0152
epoch[4/5], MiniBatch-13 loss:0.0155
epoch[4/5], MiniBatch-14 loss:0.0152
epoch[4/5], MiniBatch-15 loss:0.0157
epoch[4/5], MiniBatch-16 loss:0.0156
epoch[4/5], MiniBatch-17 loss:0.0157
epoch[4/5], MiniBatch-18 loss:0.0151
epoch[4/5], MiniBatch-19 loss:0.0153
epoch[4/5], MiniBatch-20 loss:0.0149
epoch[4/5], MiniBatch-21 loss:0.0153
epoch[4/5], MiniBatch-22 loss:0.0156
epoch[4/5], MiniBatch-23 loss:0.0156
epoch[4/5], MiniBatch-24 loss:0.0150
epoch[4/5], MiniBatch-25 loss:0.0151
epoch[4/5], MiniBatch-26 los

epoch[4/5], MiniBatch-219 loss:0.0158
epoch[4/5], MiniBatch-220 loss:0.0154
epoch[4/5], MiniBatch-221 loss:0.0161
epoch[4/5], MiniBatch-222 loss:0.0155
epoch[4/5], MiniBatch-223 loss:0.0158
epoch[4/5], MiniBatch-224 loss:0.0156
epoch[4/5], MiniBatch-225 loss:0.0157
epoch[4/5], MiniBatch-226 loss:0.0155
epoch[4/5], MiniBatch-227 loss:0.0153
epoch[4/5], MiniBatch-228 loss:0.0157
epoch[4/5], MiniBatch-229 loss:0.0156
epoch[4/5], MiniBatch-230 loss:0.0157
epoch[4/5], MiniBatch-231 loss:0.0157
epoch[4/5], MiniBatch-232 loss:0.0157
epoch[4/5], MiniBatch-233 loss:0.0160
epoch[4/5], MiniBatch-234 loss:0.0156
epoch[4/5], MiniBatch-235 loss:0.0156
epoch[4/5], MiniBatch-236 loss:0.0160
epoch[4/5], MiniBatch-237 loss:0.0156
epoch[4/5], MiniBatch-238 loss:0.0159
epoch[4/5], MiniBatch-239 loss:0.0160
epoch[4/5], MiniBatch-240 loss:0.0153
epoch[4/5], MiniBatch-241 loss:0.0157
epoch[4/5], MiniBatch-242 loss:0.0154
epoch[4/5], MiniBatch-243 loss:0.0160
epoch[4/5], MiniBatch-244 loss:0.0158
epoch[4/5], 

100%|██████████| 381/381 [00:41<00:00,  9.10it/s]


epoch[4/5], [-----TEST------] loss:0.0331  Accur:0.9327
epoch[5/5], MiniBatch-1 loss:0.0156
epoch[5/5], MiniBatch-2 loss:0.0151
epoch[5/5], MiniBatch-3 loss:0.0155
epoch[5/5], MiniBatch-4 loss:0.0158
epoch[5/5], MiniBatch-5 loss:0.0154
epoch[5/5], MiniBatch-6 loss:0.0156
epoch[5/5], MiniBatch-7 loss:0.0155
epoch[5/5], MiniBatch-8 loss:0.0154
epoch[5/5], MiniBatch-9 loss:0.0156
epoch[5/5], MiniBatch-10 loss:0.0152
epoch[5/5], MiniBatch-11 loss:0.0157
epoch[5/5], MiniBatch-12 loss:0.0152
epoch[5/5], MiniBatch-13 loss:0.0155
epoch[5/5], MiniBatch-14 loss:0.0152
epoch[5/5], MiniBatch-15 loss:0.0151
epoch[5/5], MiniBatch-16 loss:0.0149
epoch[5/5], MiniBatch-17 loss:0.0155
epoch[5/5], MiniBatch-18 loss:0.0155
epoch[5/5], MiniBatch-19 loss:0.0152
epoch[5/5], MiniBatch-20 loss:0.0152
epoch[5/5], MiniBatch-21 loss:0.0154
epoch[5/5], MiniBatch-22 loss:0.0159
epoch[5/5], MiniBatch-23 loss:0.0155
epoch[5/5], MiniBatch-24 loss:0.0155
epoch[5/5], MiniBatch-25 loss:0.0157
epoch[5/5], MiniBatch-26 los

epoch[5/5], MiniBatch-219 loss:0.0153
epoch[5/5], MiniBatch-220 loss:0.0158
epoch[5/5], MiniBatch-221 loss:0.0158
epoch[5/5], MiniBatch-222 loss:0.0157
epoch[5/5], MiniBatch-223 loss:0.0155
epoch[5/5], MiniBatch-224 loss:0.0152
epoch[5/5], MiniBatch-225 loss:0.0156
epoch[5/5], MiniBatch-226 loss:0.0158
epoch[5/5], MiniBatch-227 loss:0.0161
epoch[5/5], MiniBatch-228 loss:0.0155
epoch[5/5], MiniBatch-229 loss:0.0156
epoch[5/5], MiniBatch-230 loss:0.0156
epoch[5/5], MiniBatch-231 loss:0.0158
epoch[5/5], MiniBatch-232 loss:0.0156
epoch[5/5], MiniBatch-233 loss:0.0154
epoch[5/5], MiniBatch-234 loss:0.0160
epoch[5/5], MiniBatch-235 loss:0.0159
epoch[5/5], MiniBatch-236 loss:0.0157
epoch[5/5], MiniBatch-237 loss:0.0156
epoch[5/5], MiniBatch-238 loss:0.0157
epoch[5/5], MiniBatch-239 loss:0.0155
epoch[5/5], MiniBatch-240 loss:0.0156
epoch[5/5], MiniBatch-241 loss:0.0154
epoch[5/5], MiniBatch-242 loss:0.0157
epoch[5/5], MiniBatch-243 loss:0.0159
epoch[5/5], MiniBatch-244 loss:0.0157
epoch[5/5], 

100%|██████████| 381/381 [00:42<00:00,  8.89it/s]

epoch[5/5], [-----TEST------] loss:0.0349  Accur:0.9280





# Inference & Evaluation



## Functions

###JSON handling

In [210]:
def save_to_json(path, data_dict):
    with open(path ,"w", encoding = "utf-8") as f:
        json.dump(data_dict, f, ensure_ascii=False, indent=4, sort_keys=True,)


def toggle_json(read_path, save_prefix=""):
    with open(read_path, 'r', encoding = "utf-8") as f:
        data = json.load(f)

    tog_dict = dict()
    for d in data.keys():
        for v in data[d]:
            tog_dict[v] = set()

    for d in data.keys():
        for v in data[d]:
            tog_dict[v].add(d)

    for t in tog_dict.keys():
        tog_dict[t] = list(tog_dict[t])

    save_file = save_prefix+"/Toggled-"+ os.path.basename(read_path)
    with open(save_file,"w", encoding = "utf-8") as f:
        json.dump(tog_dict, f, ensure_ascii=False, indent=4, sort_keys=True,)

    return save_file


def get_from_json(path, ret_data = "key"):
    with open(path, 'r', encoding = "utf-8") as f:
        data = json.load(f)

    if ret_data == "key":
        out = list(data.keys())
    elif ret_data == "value":
        temp = data.values()
        temp = { i for t in temp for i in t }
        out = list(temp)
    elif ret_data == "both":
        out = []
        for k in data.keys():
            for v in data[k]:
                out.append([k,v])
    return sorted(out)

###Inference Functions

Reranking routine based on monolingual vocabulary

In [211]:
class VocabSanitizer():
    '''
    Sanitize topK vocab prediction using ancillary vocab list
    by reranking or removing etc
    '''
    def __init__(self, data_file):
        '''
        data_file: path to file conatining vocabulary list
        '''
        extension = os.path.splitext(data_file)[-1]
        if extension == ".json":
            self.vocab_set  = set( json.load(open(data_file)) )
        elif extension == ".csv":
            self.vocab_df = pd.read_csv(data_file).set_index('WORD')
            self.vocab_set = set( self.vocab_df.index )
        else:
            print("Only Json/CSV file extension supported")


    def remove_astray(self, word_list):
        '''Remove words that are not present in vocabulary
        '''
        new_list = []
        for v in word_list:
            if v in self.vocab_set:
                new_list.append(v)
        if new_list == []:
            return word_list.copy()
            # return [" "]
        return new_list

    def reposition(self, word_list):
        '''Reorder Words in list
        '''
        new_list = []
        temp_ = word_list.copy()
        for v in word_list:
            if v in self.vocab_set:
                new_list.append(v)
                temp_.remove(v)
        new_list.extend(temp_)

        return new_list

Inference runner

In [212]:
def inferencer(word, topk = 10):
    in_vec = torch.from_numpy(src_glyph.word2xlitvec(word)).to(device)
    ## change to active or passive beam
    p_out_list = model.active_beam_inference(in_vec, beam_width = topk)
    p_result = [ tgt_glyph.xlitvec2word(out.cpu().numpy()) for out in p_out_list]

    result = p_result
    # result = voc_sanitize.reposition(p_result) ## Uncomment for repositioning

    return result

def inference_looper(in_words, topk = 3):
    out_dict = {}
    for i in tqdm(in_words):
        out_dict[i] = inferencer(i, topk=topk)
    return out_dict

##Inferencing

In [244]:
device = 'cpu'
WGT_PATH = INST_NAME+"/weights/"+INST_NAME+"_model.pth"

SAVE_DIR = LOG_PATH + "/acc_log/"
if not os.path.exists(SAVE_DIR): os.makedirs(SAVE_DIR)

# voc_sanitize = VocabSanitizer("checkup_words_sorted.json") #Monolingual based topK sorting

In [None]:
#Loading Accuracy Computing script
!wget https://raw.githubusercontent.com/AI4Bharat/IndianNLP-Transliteration/jgeob-dev/tools/accuracy_reporter/accuracy_news.py

In [248]:
tfi =  toggle_json(TEST_FILE, save_prefix=SAVE_DIR)
words = get_from_json(tfi, "key")
model.to(device)
out_dict = inference_looper(words, topk = 1)

pred_path = os.path.join(SAVE_DIR, "pred_"+os.path.basename(TEST_FILE) )
save_to_json(pred_path, out_dict)

gt_json = tfi
pred_json = pred_path
save_prefix = os.path.join(SAVE_DIR, os.path.basename(TEST_FILE).replace(".json", ""))

for topk in [1]:
    ## GT json file passed to below script must be in { En(input): [NativeLang (predict)] } format
    run_accuracy_news = "( echo {} && python accuracy_news.py --gt-json {} --pred-json {} --topk {} --save-output-csv {}_top{}-scores.csv ) | tee -a {}/Summary.txt".format(
                    os.path.basename(TRAIN_FILE),
                    gt_json, pred_json, topk,
                    save_prefix, topk, SAVE_DIR )

    os.system(run_accuracy_news)

100%|██████████| 194/194 [19:26<00:00,  6.01s/it]

train_dataset.json

TOP 1 SCORES FOR 194 SAMPLES:
ACC:          0.953608
Mean F-score: 0.991101
MRR:          0.953608
MAP_ref:      0.953608






In [261]:
model.to(device)
tfi =  toggle_json(TEST_FILE, save_prefix='')
words = get_from_json(tfi, "key")[:10]
out_dict = inference_looper(['بويكوت'], topk = 1)
out_dict

100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


{'بويكوت': ['boykot']}

#Zip

In [None]:
# Compress Logs anad Model for Download
!zip -r j2r_10_epoch.zip {INST_NAME}

In [264]:
import pickle

with open("jawi_j2r_glyph.pkl", "wb") as file:
    pickle.dump(src, file)
    
with open("rumi_j2r_glyph.pkl", "wb") as file:
    pickle.dump(tgt_glyph, file)