In [1]:
import librosa
import pandas as pd
import torch.optim as optim
import numpy as np
import csv
from data_load import TextTransform
import os
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
#from decoder import GreedyDecoder
from torch.functional import F
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import torch
import torch.nn as nn
import librosa
from librosa.core import stft, magphase
from glob import glob
from torch import autograd
import csv
from data_load import CodeSwitchDataset
import zipfile

  for doc in docs:


In [2]:
def pad(wav, trans, lang):
    if lang == "Gujarati":
        max_len = 0
    elif lang == "Telugu":
        max_len = 529862
    elif lang == 'Tamil':
        max_len = 0
    else:
        raise Exception("Check Language")

    while len(wav) < max_len:
        diff = max_len - len(wav)
        ext = wav[:diff]
        wav = np.append(wav, wav[:diff])
        ratio = int(len(trans)*diff/len(wav))
        trans +=trans[:ratio]
    return wav, trans

def preprocess(data):
    #print(data)
    inputs = []
    labels = []
    input_lengths = []
    label_lengths = []
    
    for (wav, sr, trans, lang) in data:
        wav, trans  = pad(wav, trans, lang)
        out = stft(wav, win_length=int(sr*0.02), hop_length=int(sr*0.01))
        out = np.transpose(out, axes=(1, 0))

        text_transform = TextTransform()
        trans = torch.Tensor(text_transform.text_to_int(trans.lower()))

        out = magphase(out)[0]
        out = torch.from_numpy(np.array([np.log(1 + x) for x in out]))
        #print(out.shape)
        inputs.append(out)
        labels.append(trans)
        input_lengths.append(out.shape[0])
        label_lengths.append(len(trans))
    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)
    #spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    return inputs, labels, input_lengths, label_lengths

In [3]:
def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=False):
    arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    text_transform = TextTransform()
    targets = []
    for i, args in enumerate(arg_maxes):
        decode = []
        targets.append(text_transform.int_to_text(labels[i][:label_lengths[i]].tolist()))
        for j, index in enumerate(args):
            if index != blank_label:
                if collapse_repeated and j != 0 and index == args[j -1]:
                    continue
                decode.append(index.item())
        decodes.append(text_transform.int_to_text(decode))
    return decodes, targets

In [4]:
train_dataset = CodeSwitchDataset(lang = 'Telugu', mode = "train")
validation_split = .2
shuffle_dataset = True
random_seed= 42
dataset_size = len(train_dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)


In [5]:
train_loader = DataLoader(train_dataset,
                          batch_size=8,
                          drop_last=True,
                          num_workers = 6,
                          sampler = train_sampler,
                         collate_fn = lambda x: preprocess(x))
test_loader = DataLoader(train_dataset,
                          batch_size=8,
                          drop_last=True,
                          num_workers = 6,
                          sampler = valid_sampler,
                         collate_fn = lambda x: preprocess(x))

In [6]:
device = torch.device('cuda')

In [7]:
class Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, batch_size, output_dim = 29, num_layers = 4):
        super(Model, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.output_dim = output_dim
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
        #self.drop = nn.Dropout(0.25)
        self.linear = nn.Linear(self.hidden_dim, self.output_dim)
        
    def forward(self, x):
        #print(type(self.lstm))
        #print(x.shape)
        lstm_out, hidden = self.lstm(x)
        #lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.linear(lstm_out)
        return out, hidden

In [8]:
torch.cuda.empty_cache()

data_len = len(train_loader.dataset)
pbar = tqdm(enumerate(train_loader), total=len(train_loader))
for batch_idx, (_data) in pbar:
    #bi, wav, label = batch_idx, wav, label
    wav, labels, input_lengths, label_lengths = _data
    print(wav.shape)

labels

label_lengths

In [9]:
def train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iter_meter, writer):
    model.train()
    data_len = len(train_loader.dataset)
    total_loss=0
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for batch_idx, (_data) in pbar:
        #bi, wav, label = batch_idx, wav, label
        wav, labels, input_lengths, label_lengths = _data
        wav = wav.to(device)
        wav = wav.float()
        labels = labels.to(device)
        optimizer.zero_grad()
        #print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

        output, _ = model(wav)
        output = F.log_softmax(output, dim=1)
        output = output.transpose(0,1)
        #print(output.shape)
        loss = criterion(output, labels, input_lengths, label_lengths)
        #print(loss)
        total_loss+=loss
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        iter_meter.step()
        
        writer.add_scalar('Loss', loss, epoch*len(train_loader)+1)
        writer.add_scalar('TLoss', total_loss, epoch*len(train_loader)+1)
        if batch_idx % 100 == 0 or batch_idx == data_len:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(wav), data_len,
                    100. * batch_idx / len(train_loader), loss.item()))
    if (epoch+1)%2 == 0:
        model.eval().cpu()
        ckpt_model_filename = "ckpt_epoch_" + str(epoch+1) + "_batch_id_" + str(batch_idx+1) + ".pth"
        ckpt_model_path = os.path.join("checkpoints/", ckpt_model_filename)
        torch.save(model.state_dict(), ckpt_model_path)
        model.to(device).train()

In [10]:
def test(model, device, test_loader, criterion, epoch, writer):
    model.eval()
    training_loss, train_acc = 0, 0
    eer, total_eer = 0, 0
    test_loss=0
    with torch.no_grad():
        for batch_idx, _data in enumerate(test_loader):
            frr_far, eer = 0, 0
            inputs, labels, input_lengths, label_lengths = _data 
            inputs, labels = inputs.to(device), labels.to(device)

            output, _ = model(inputs)  # (batch, time, n_class)
            output = F.log_softmax(output, dim=1)
            output = output.transpose(0, 1) # (time, batch, n_class)
            loss = criterion(output, labels, input_lengths, label_lengths)
            test_loss += loss.item() / len(test_loader)

            decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
            for j in range(len(decoded_preds)):
                if len(set(decoded_preds[j])) != len(set(decoded_targets[j])):
                    frr_far+=1
            eer = frr_far/len(decoded_preds)
            total_eer+=eer
            writer.add_scalar('TLoss', total_eer, epoch*len(test_loader)+1)
            print("EER: ",eer)
            print("Total EER: ", total_eer)

In [11]:
class IterMeter(object):
    """keeps track of total iterations"""
    def __init__(self):
        self.val = 0

    def step(self):
        self.val += 1

    def get(self):
        return self.val

In [12]:
model = Model(input_dim=1025,
              hidden_dim=512,
              batch_size=8,
              output_dim=29,
              num_layers=4)
#model.half()
model = model.to(device)
criterion = nn.CTCLoss().to(device)
epochs = 40
optimizer = optim.Adam(model.parameters(), 5e-4)

scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=5e-4, 
                                            steps_per_epoch=int(len(train_loader)),
                                            epochs=40,
                                            anneal_strategy='linear')

In [13]:
checkpoints = torch.load('checkpoints/ckpt_epoch_38_batch_id_1699.pth')

In [14]:
model.load_state_dict(checkpoints)

<All keys matched successfully>

In [16]:
dataiter = iter(train_loader)

In [20]:
inputs, labels, input_lengths, label_lengths = dataiter.next()

In [40]:
labels

tensor([[21., 21.,  1.,  ...,  0.,  0.,  0.],
        [21., 21., 21.,  ...,  0.,  0.,  0.],
        [ 6.,  6.,  6.,  ...,  0.,  0.,  0.],
        ...,
        [21., 21., 21.,  ..., 21., 21., 21.],
        [21., 21., 21.,  ...,  0.,  0.,  0.],
        [21., 21., 21.,  ...,  0.,  0.,  0.]])

In [41]:
label_lengths

[183, 184, 184, 236, 239, 388, 178, 306]

In [72]:
input_lengths

[2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409]

In [23]:
model.eval()

Model(
  (lstm): LSTM(1025, 512, num_layers=4)
  (linear): Linear(in_features=512, out_features=29, bias=True)
)

In [30]:
out, _ = model(inputs.to(device))

In [39]:
out

tensor([[[ 3.2835,  1.3164, -0.1522,  ..., -0.1437, -0.1188, -0.1535],
         [ 3.2497,  0.4497,  0.3604,  ...,  0.3275,  0.3461,  0.3395],
         [ 3.3058,  1.5583, -0.3964,  ..., -0.3932, -0.3615, -0.3753],
         ...,
         [ 3.4495,  1.8411, -0.9489,  ..., -0.9906, -0.8959, -0.9429],
         [ 3.2477,  2.5675, -1.3726,  ..., -1.4047, -1.3332, -1.3775],
         [ 3.2150,  2.4898, -0.9548,  ..., -0.9558, -0.9080, -0.9752]],

        [[ 3.5076,  2.3428, -1.1568,  ..., -1.0859, -1.0337, -1.0074],
         [ 3.7234,  1.8242, -1.2044,  ..., -1.1536, -1.0829, -1.0382],
         [ 3.7028,  2.3307, -1.9615,  ..., -1.8326, -1.7163, -1.7376],
         ...,
         [ 2.1062,  2.3658, -4.3977,  ..., -4.3685, -4.3655, -4.2692],
         [ 1.7889,  3.0045, -4.6873,  ..., -4.6058, -4.6580, -4.5415],
         [ 2.3047,  2.1133, -3.0806,  ..., -3.0224, -3.0906, -2.9584]],

        [[ 3.6262,  1.9262, -0.1249,  ..., -0.1116, -0.1179,  0.0651],
         [ 3.6798,  2.2807, -0.7161,  ..., -0

In [43]:
ans = GreedyDecoder(out, labels, label_lengths)

In [73]:
ans[0][0]

"eeee 'eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'eee''eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee''                         e '        '        '''                                                   '   ''''   '  e tt              ttttt''''''ttttttttttttttttttttt  tt'  ''                 ttttttttttttt                      ttt'''''''''ttttttttttttt'''''      ''''''''                                                     ''ttttttttt''''t''''tt     ' e                 t                              ttttttttttttttttttttttttttttttttt''''''eee'                    ttttt ttttttttt'tttttt tt'''''''''''             tt     ttttttt'''e 'eeeeee'''tttt'ttttttttttttttttttt'''''             ee'    tttttttttt                  t                                    t         ttt'tttttttttttttt'ttt''''eeee'''''''''''t'ttttttttttttttttttttttte'''''''''''''''''''''''''''''''eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee 'eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'eee''eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee''            

In [56]:
ans[1][0]

'tt tttt ttt tttttt tttttttt tttttt ttttttttttttttt ttttt tttttttt ttttttttttttttt tttt ttt tttttt tttttttt tttttt ttttttttttttttt ttttt tttttttt ttt tttt ttt tttttt tttttttt tttttt tt'

In [70]:
len(ans[0][1])

2409

In [68]:
set(ans[0][1])

{' ', "'", 'e', 't'}

In [71]:
len(ans[1][1])

184