In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import librosa
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
from data_processing import *

In [3]:
# Build the (mfcc, text) training pairs from the transcript file and the MFCC feature directory,
# then report the dataset size and whether a CUDA device is visible.
pairs = prepare_pairs_mfcc(
    "../data/hin_cleaned_25k.txt",
    "../data/mfcc_features",
)
print(len(pairs), torch.cuda.is_available(), sep="\n")

24676
True


In [4]:
# Fraction of `pairs` (taken from the front, in file order) that goes to training;
# everything after the cut-off index is held out for testing.
train_test_split = 0.8

split_index = int(len(pairs) * train_test_split)
train_pairs, test_pairs = pairs[:split_index], pairs[split_index:]

In [5]:
# Scan every (mfcc, text) pair once and keep the longest feature sequence (rows of the
# MFCC array) and the longest transcript (whitespace-separated words).
max_source_length = 0
max_target_length = 0
for mfcc, text in pairs:
    max_target_length = max(max_target_length, len(text.split()))
    max_source_length = max(max_source_length, mfcc.shape[0])


In [6]:
# Length of the random vectors created for the <SOS>/<EOS>/<UNK> tokens
# (presumably matches the width of the pretrained word vectors -- TODO confirm).
EMBEDDING_DIM = 300
# Hidden-state size passed to both Encoder and AttnDecoder when the models are built.
HIDDEN_DIM = 30

In [7]:
# Token -> embedding table loaded from the word-vector file.
hindi_vocab = load_embeddings("../data/cleaned_hin_word_vec.txt")

# Store every vector as a NumPy array (values are replaced in place; keys are untouched,
# so mutating while iterating is safe).
for token in hindi_vocab:
    hindi_vocab[token] = np.array(hindi_vocab[token])

# The special tokens have no pretrained vector, so they get random ones. These vectors are
# fed to the decoder but are NOT stored inside the saved models, so they must be identical
# in every kernel session: draw them from a dedicated, fixed-seed generator instead of the
# unseeded global NumPy state.
special_token_rng = np.random.RandomState(0)
for special_token in ("<SOS>", "<EOS>", "<UNK>"):
    hindi_vocab[special_token] = special_token_rng.rand(EMBEDDING_DIM)

In [8]:
# Default `max_length` of AttnDecoder and train(): the number of attention slots and the
# number of rows in the encoder-output buffer. It is the longest MFCC sequence plus 20
# rows of headroom (the reason for the exact margin is not stated in this notebook).
MAX_LENGTH = max_source_length + 20

In [9]:
# Two-way lookup between vocabulary tokens and consecutive integer ids, assigned in the
# iteration order of `hindi_vocab`.
token_to_ix = {}
ix_to_token = {}

for position, token in enumerate(hindi_vocab):
    token_to_ix[token] = position
    ix_to_token[position] = token
    
def index2token(index):
    """Return the token stored under ``index`` in ``ix_to_token``, or ``"<UNK>"``.

    ``index`` may be a plain integer or any scalar wrapper exposing ``.item()``
    (for example a 0-d tensor or a NumPy scalar). Such wrappers are unwrapped
    first: a tensor hashes by identity, so looking it up directly would never
    match an integer key and every call would silently yield ``"<UNK>"``.
    """
    if hasattr(index, "item"):
        try:
            index = index.item()
        except (ValueError, RuntimeError):
            # Not a single-element value, so it cannot name exactly one token.
            return "<UNK>"
    if index in ix_to_token:
        return ix_to_token[index]
    return "<UNK>"
    
def token2index(token):
    """Return the integer id of ``token``; unknown tokens map to the id of ``"<UNK>"``."""
    known = token in token_to_ix
    return token_to_ix[token if known else "<UNK>"]
        
def token2embed(token):
    """Return the embedding stored for ``token`` in ``hindi_vocab``.

    Tokens missing from the vocabulary fall back to the ``"<UNK>"`` embedding.
    """
    lookup_key = token if token in hindi_vocab else "<UNK>"
    return hindi_vocab[lookup_key]
        

In [11]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [12]:
class Encoder(nn.Module):
    """Single-layer LSTM that consumes one input frame per call."""

    def __init__(self, input_size, hidden_size):
        """Create an LSTM mapping ``input_size`` features to ``hidden_size`` units."""
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)

    def forward(self, inputs, hidden):
        """Advance the LSTM by one step.

        ``inputs`` is flattened into a (1, 1, features) tensor, i.e. a sequence
        of length one with batch size one. Returns ``(output, hidden)`` exactly
        as produced by the LSTM, where ``hidden`` is the ``(h, c)`` state pair.
        """
        single_step = inputs.view(1, 1, -1)
        return self.lstm(single_step, hidden)

    def init_hidden(self):
        """Return an all-zero ``(h, c)`` state pair, each of shape (1, 1, hidden_size), on ``device``."""
        state_shape = (1, 1, self.hidden_size)
        h0 = torch.zeros(*state_shape, dtype=torch.float, device=device)
        c0 = torch.zeros(*state_shape, dtype=torch.float, device=device)
        return (h0, c0)

In [13]:
class AttnDecoder(nn.Module):
    """One-step LSTM decoder with attention over a fixed-size encoder-output buffer.

    The decoder does not own an embedding layer: callers pass the embedding
    vector of the previous token directly. That vector is ``embedding_size``
    wide, which generally differs from ``hidden_size``, so the attention
    layers are sized ``embedding_size + hidden_size``.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH,
                 embedding_size=EMBEDDING_DIM):
        """Build the decoder layers.

        Args:
            hidden_size: width of the LSTM state (must match the encoder's).
            output_size: number of output classes (vocabulary size).
            dropout_p: dropout probability applied to the input embedding.
            max_length: number of attention slots, i.e. rows of ``encoder_outputs``.
            embedding_size: width of the token embedding passed to ``forward``.
        """
        super(AttnDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.embedding_size = embedding_size
        # Both layers consume the embedding concatenated with a hidden_size-wide vector.
        self.attn = nn.Linear(self.embedding_size + self.hidden_size, self.max_length)
        self.attn_combine = nn.Linear(self.embedding_size + self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, embedded, hidden, encoder_outputs):
        """Decode a single step.

        Args:
            embedded: embedding of the previous token (any shape with
                ``embedding_size`` elements; it is reshaped to (1, 1, -1)).
            hidden: LSTM state pair ``(h, c)``, each of shape (1, 1, hidden_size).
            encoder_outputs: tensor of shape (max_length, hidden_size).

        Returns:
            ``(output, hidden, attn_weights)``: log-probabilities of shape
            (1, output_size), the updated ``(h, c)`` pair, and the attention
            weights of shape (1, max_length).
        """
        embedded = embedded.view(1, 1, -1)
        embedded = self.dropout(embedded)

        # `hidden` is an (h, c) tuple, so hidden[0] is h with shape (1, 1, H);
        # index once more to get the 2-D (1, H) slice that can be concatenated
        # with the 2-D embedding.
        h_state = hidden[0][0]
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], h_state), 1)), dim=1)
        # (1, 1, max_length) x (1, max_length, H) -> (1, 1, H) context vector.
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def init_hidden(self):
        """Return an all-zero ``(h, c)`` state pair, each of shape (1, 1, hidden_size), on ``device``."""
        return (torch.zeros(1, 1, self.hidden_size, dtype=torch.float, device=device),
                torch.zeros(1, 1, self.hidden_size, dtype=torch.float, device=device))

In [14]:
import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the completed fraction of the job. The total duration is
    extrapolated as elapsed / percent; a small epsilon keeps the division
    defined when ``percent`` is 0.
    """
    eps = 1e-6
    elapsed = time.time() - since
    estimated_total = elapsed / (percent + eps)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))



In [15]:
def indexesFromSentence(sentence):
    """Map every whitespace-separated word of ``sentence`` to its vocabulary id.

    Splitting on arbitrary whitespace (``str.split()`` with no argument) keeps
    this consistent with how transcript lengths are counted elsewhere in the
    notebook, and avoids the empty-string "words" that ``split(' ')`` produces
    for repeated, leading or trailing spaces (each of which would otherwise be
    encoded as an ``<UNK>`` target).
    """
    return [token2index(word) for word in sentence.split()]


def tensorsFromSentence(sentence):
    """Encode ``sentence`` as a column tensor of token ids terminated by ``<EOS>``.

    Returns a ``torch.long`` tensor of shape (number_of_tokens + 1, 1) on ``device``.
    """
    token_ids = indexesFromSentence(sentence) + [token2index("<EOS>")]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device, requires_grad=False)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert one ``(features, sentence)`` pair into ``(input_tensor, target_tensor)``.

    ``pair[0]`` is a NumPy array that is cast to float32 and moved to ``device``;
    ``pair[1]`` is the transcript, encoded with ``tensorsFromSentence``.
    """
    features, sentence = pair[0], pair[1]
    target_tensor = tensorsFromSentence(sentence)
    as_float32 = features.astype('float32')
    input_tensor = torch.from_numpy(as_float32).to(device)
    return (input_tensor, target_tensor)

In [16]:
def train(input_tensor, target_tensor, encoder, attn_decoder, encoder_optimizer, attn_decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (features, transcript) example.

    The encoder reads ``input_tensor`` one row at a time; its outputs fill the
    first rows of a zero-initialised (max_length, hidden_size) buffer that the
    decoder attends over. The decoder is then unrolled without teacher
    forcing: each step is fed the embedding of its own previous prediction,
    starting from ``<SOS>``, and decoding stops early once ``<EOS>`` is predicted.

    Args:
        input_tensor: feature tensor whose first dimension is the number of frames.
        target_tensor: column tensor of target token ids (first dimension = length).
        encoder, attn_decoder: the two networks being trained.
        encoder_optimizer, attn_decoder_optimizer: their optimisers.
        criterion: loss applied to each step's log-probabilities and target id.
        max_length: number of rows in the encoder-output buffer.

    Returns:
        The summed step losses divided by the target length (a Python float).
    """
    encoder_hidden = encoder.init_hidden()

    encoder_optimizer.zero_grad()
    attn_decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, dtype=torch.float, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    eos_index = token2index("<EOS>")
    # Kept as a plain Python int: a tensor would never match the integer keys
    # of the index -> token table, so every step would be fed <UNK>.
    attn_decoder_input = token2index("<SOS>")

    # The decoder starts from the encoder's final (h, c) state.
    attn_decoder_hidden = encoder_hidden

    for di in range(target_length):
        previous_embedding = torch.tensor(
            token2embed(index2token(attn_decoder_input)), dtype=torch.float, device=device)
        # The decoder returns (log-probs, hidden state, attention weights).
        attn_decoder_output, attn_decoder_hidden, _attn_weights = attn_decoder(
            previous_embedding, attn_decoder_hidden, encoder_outputs)
        topv, topi = attn_decoder_output.topk(1)
        # .item() yields a detached Python int used as the next step's input.
        attn_decoder_input = topi.item()

        loss += criterion(attn_decoder_output, target_tensor[di])
        if attn_decoder_input == eos_index:
            break

    loss.backward()

    encoder_optimizer.step()
    attn_decoder_optimizer.step()

    return loss.item() / target_length


In [17]:
def trainIters(pairs, encoder, attn_decoder, n_iters, print_every=1, plot_every=100, learning_rate=0.01):
    """Train both networks for ``n_iters`` full passes over ``pairs``.

    Args:
        pairs: iterable of (features, sentence) pairs accepted by ``tensorsFromPair``.
        encoder, attn_decoder: the networks to optimise (each gets its own Adam optimiser).
        n_iters: number of passes (epochs) over all pairs.
        print_every: print timing and mean loss every this many epochs.
        plot_every: record a mean loss every this many epochs.
        learning_rate: learning rate shared by both optimisers.

    Returns:
        List with one mean per-sample loss for every ``plot_every`` completed epochs.
    """
    start = time.time()
    plot_losses = []
    # Running loss sums and sample counts, reset at every print / plot point.
    print_loss_total = 0
    print_sample_count = 0
    plot_loss_total = 0
    plot_sample_count = 0

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    attn_decoder_optimizer = optim.Adam(attn_decoder.parameters(), lr=learning_rate)
    # Convert every pair to tensors once, up front.
    training_pairs = [tensorsFromPair(pair) for pair in pairs]
    criterion = nn.NLLLoss()

    for epoch in range(n_iters):
        for j, training_pair in enumerate(training_pairs):
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]

            loss = train(input_tensor, target_tensor, encoder,
                         attn_decoder, encoder_optimizer, attn_decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss
            print_sample_count += 1
            plot_sample_count += 1
            if j % 1000 == 0:
                print(j, "samples trained over")

        # Progress is measured in *completed* epochs, so the first report is
        # 1 / n_iters rather than 0 (which would make the ETA meaningless).
        epochs_done = epoch + 1
        progress = epochs_done / n_iters

        if epochs_done % print_every == 0:
            # Average over the samples actually accumulated, not over the epoch count.
            print_loss_avg = print_loss_total / max(print_sample_count, 1)
            print_loss_total = 0
            print_sample_count = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         epochs_done, progress * 100, print_loss_avg))

        if epochs_done % plot_every == 0:
            plot_losses.append(plot_loss_total / max(plot_sample_count, 1))
            plot_loss_total = 0
            plot_sample_count = 0

    return plot_losses
"""
        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
"""

'\n        if iter % plot_every == 0:\n            plot_loss_avg = plot_loss_total / plot_every\n            plot_losses.append(plot_loss_avg)\n            plot_loss_total = 0\n\n    showPlot(plot_losses)\n'

In [18]:
# Optimiser learning rate and number of full passes over the training pairs.
lr = 0.003
n_iters = 3

# 13 is the per-frame input width of the encoder LSTM
# (presumably the number of MFCC coefficients per frame -- TODO confirm).
encoder = Encoder(13, HIDDEN_DIM)
encoder = encoder.to(device)

# The decoder emits one score per vocabulary entry.
attn_decoder = AttnDecoder(HIDDEN_DIM, len(hindi_vocab))
attn_decoder = attn_decoder.to(device)

In [None]:
# Fit both networks on the training split, then write each whole module object to disk.
trainIters(
    train_pairs,
    encoder,
    attn_decoder,
    n_iters,
    print_every=1,
    plot_every=1,
    learning_rate=lr,
)
torch.save(encoder, "../data/encoder.pt")
torch.save(attn_decoder, "../data/attn_decoder.pt")