In [1]:
import numpy as np 
import random 
import os 
import torch 
from utils import _load_vocab,_load_config
from torch.utils.data import Dataset,DataLoader
import librosa
import numpy as np 
import pandas as pd 
import librosa.display
import ast
import random

In [2]:
VOCAB_DICT=_load_vocab()
config_file=_load_config()

In [3]:
class AudioDataset(Dataset):
    def __init__(self,csv_file,loss_fn,n_mfcc=13):
        self.data=pd.read_csv(csv_file)
        self.dir_path=os.path.dirname(csv_file)
        self.vocab_dict=VOCAB_DICT
        self.n_mfcc=n_mfcc
        self.loss_fn=loss_fn 

    def __len__(self):
        return len(self.data)
    
    def char_to_idx(self,transcript):
        
        one_hot=torch.zeros(len(transcript),len(self.vocab_dict))
        for i,char in enumerate(transcript):
            one_hot[i,self.vocab_dict[char]]=1 
        
        return one_hot
    
    def compute_mfcc(self,audio_path):

        y,sr=librosa.load(audio_path)

        n_fft = min(2048, len(y))
        hop_length = n_fft // 4

        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc, n_fft=n_fft, hop_length=hop_length)

        width = min(9, mfccs.shape[1])
        if width < 3:
            width = 3
        
        width = min(width, mfccs.shape[1])

        if width % 2 == 0:
            width -= 1

        
        delta1=librosa.feature.delta(mfccs,order=1,width=width)
        delta2=librosa.feature.delta(mfccs,order=2,width=width)

        mfccs_combined=np.concatenate((mfccs,delta1,delta2),axis=0)

        return mfccs_combined
    
    def __getitem__(self,idx):
        
        audio_path_x1=self.data["audio_path"][idx]
        audio_path_x1=os.path.join(self.dir_path,str(audio_path_x1))
        transcript_c1=self.data["transcript"][idx]

        sample_list=ast.literal_eval(self.data["negative_samples"][idx])
        
        one_hot_c2=None
        if 0 in self.loss_fn or 1 in self.loss_fn:
            transcript_c2=random.choice(sample_list)[1]
            one_hot_c2=self.char_to_idx(transcript_c2)
        
        mfccs_x2=None
        if 2 in self.loss_fn or 3 in self.loss_fn:
            audio_path_x2=random.choice(sample_list)[0]
            audio_path_x2=os.path.join(self.dir_path,str(audio_path_x2))

            mfccs_x2=self.compute_mfcc(audio_path_x2)
        
        #computing mfcc
        mfcc_x1=self.compute_mfcc(audio_path_x1) 
        
        #computiing one hot for transcript 
        one_hot_c1=self.char_to_idx(transcript_c1)

        output_tensors= [torch.tensor(mfcc_x1), one_hot_c1]

        if 0 in self.loss_fn or 1 in self.loss_fn:
            output_tensors.append(one_hot_c2)
    
        if 2 in self.loss_fn or 3 in self.loss_fn:
            output_tensors.append(torch.tensor(mfccs_x2))
    
        return output_tensors


In [4]:
def pad_mfccs(mfccs, max_len):
    padded_mfccs = []
    for mfcc in mfccs:
        # Padding to the right with zeros
        pad_width = max_len - mfcc.shape[1]
        padded_mfcc = torch.nn.functional.pad(mfcc, (0, pad_width), 'constant', 0)
        padded_mfccs.append(padded_mfcc)
    return torch.stack(padded_mfccs)

def pad_sequence(sequences, batch_first=False, padding_value=0):
    return torch.nn.utils.rnn.pad_sequence(sequences, batch_first=batch_first, padding_value=padding_value)

def collate_fn(batch):

    mfccs_x1= []
    one_hot_c1=[]
    one_hot_c2=[]
    mfccs_x2=[]

    for item in batch:
        mfcc_x1,oh_c1 = item[0], item[1]
        mfccs_x1.append(mfcc_x1)
        one_hot_c1.append(oh_c1)

        if len(item) == 4:
            oh_c2,mfcc_x2= item[2], item[3]
            one_hot_c2.append(oh_c2)
            mfccs_x2.append(mfcc_x2)
        
        elif len(item) == 3:
            if item[2].shape[1] == len(VOCAB_DICT):
                oh_c2=item[2]
                one_hot_c2.append(oh_c2)
            else:
                mfcc_x2=item[2]
                mfccs_x2.append(mfcc_x2)
    
    max_mfcc_len_x1=max(mfcc.shape[1] for mfcc in mfccs_x1)
    max_mfcc_len_x2=max(mfcc.shape[1] for mfcc in mfccs_x2) if mfccs_x2 else 0

    mfccs_x1=pad_mfccs(mfccs_x1,max_mfcc_len_x1)
    one_hot_c1=pad_sequence(one_hot_c1, batch_first=True)

    result={'view1_x1':mfccs_x1,'view2_c1':one_hot_c1}

    if one_hot_c2:
        one_hot_c2=pad_sequence(one_hot_c2,batch_first=True)
        result['view2_c2']=one_hot_c2
    
    if mfccs_x2:
        mfccs_x2=pad_mfccs(mfccs_x2, max_mfcc_len_x2)
        result['view1_x2']=mfccs_x2
    
    return result 
            

In [5]:
csv_file='/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/sampled_metadata.csv'
dataset=AudioDataset(csv_file=csv_file,loss_fn=config_file["loss_fn"])

dataloader=DataLoader(dataset, batch_size=3, collate_fn=collate_fn)


In [6]:
len(dataset[0])

3

In [7]:
data=iter(dataloader)

batch=next(data)

In [8]:
batch.keys()

dict_keys(['view1_x1', 'view2_c1', 'view2_c2'])

In [9]:
batch['view1_x1']=batch['view1_x1'].view(3,45,39)
batch['view1_x1'].shape

torch.Size([3, 45, 39])

In [10]:
batch['view2_c1'].shape

torch.Size([3, 4, 83])

In [11]:
batch['view2_c2'].shape

torch.Size([3, 5, 83])

In [12]:
from model import MultiViewRNN
model=MultiViewRNN(config_file)

In [13]:
model

MultiViewRNN(
  (net): ModuleDict(
    (view1): RNN_default(
      (rnn): LSTM(39, 512, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
    )
    (view2): RNN_default(
      (rnn): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
    )
  )
)

In [14]:
out=model(batch)

In [15]:
out.keys()

dict_keys(['x2', 'c2', 'x1', 'c1'])

In [16]:
out['x1'].shape

torch.Size([3, 1024])

In [17]:
out['c1'].shape

torch.Size([3, 1024])

In [18]:
out['c2'].shape

torch.Size([3, 1024])

In [19]:
from loss import contrastive_loss
loss=contrastive_loss(obj=config_file["loss_fn"],margin=config_file["margin"],x1=out['x1'],c1=out['c1'],c2=out['c2'])

In [20]:
loss

tensor(0.5061, grad_fn=<AddBackward0>)

In [1]:
import pandas as pd 
from itertools import combinations 

In [13]:
metadata=pd.read_csv('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/sampled_metadata.csv')
grouped = metadata.groupby('transcript')

In [17]:
gr_list=list(grouped['audio_path'])

In [18]:
gr_list

[('अपने',
  1    0116_003_segment_1.wav
  Name: audio_path, dtype: object),
 ('अप्पा',
  42    0139_003_segment_7.wav
  Name: audio_path, dtype: object),
 ('उन्हें',
  12    0128_003_segment_2.wav
  Name: audio_path, dtype: object),
 ('उसे',
  52    0153_003_segment_3.wav
  Name: audio_path, dtype: object),
 ('एक',
  30    0136_003_segment_8.wav
  53    0153_003_segment_4.wav
  Name: audio_path, dtype: object),
 ('और',
  0      0116_003_segment_0.wav
  32    0136_003_segment_10.wav
  Name: audio_path, dtype: object),
 ('कर',
  18    0128_003_segment_8.wav
  Name: audio_path, dtype: object),
 ('कहा',
  39    0139_003_segment_4.wav
  Name: audio_path, dtype: object),
 ('की',
  5     0116_003_segment_5.wav
  15    0128_003_segment_5.wav
  Name: audio_path, dtype: object),
 ('के',
  43    0139_003_segment_8.wav
  Name: audio_path, dtype: object),
 ('को',
  3    0116_003_segment_3.wav
  Name: audio_path, dtype: object),
 ('कोलम',
  54    0153_003_segment_5.wav
  Name: audio_path, dtype: obj

In [19]:
gr_dict=dict(gr_list)

In [31]:
type(metadata)

pandas.core.frame.DataFrame

In [38]:
unique_words=metadata['transcript'].unique()

In [39]:
labels=[]

In [54]:
audio_pairs=combinations(metadata['audio_path'],2)

In [55]:
audio_pairs

<itertools.combinations at 0x7696c11eb9c0>

In [59]:
for pair in audio_pairs:
    audio1,audio2=pair
    transcript1=gr_dict[audio1].values[0]
    transcript2=gr_dict[audio2].values[0]

    if transcript1==transcript2:
        labels.append(1)
    else:
        labels.append(0)
labels=np.array(labels)    

KeyError: '0116_003_segment_0.wav'