In [7]:
import pandas as pd 

train_df=pd.read_csv('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/metadata - Sheet1.csv')

In [8]:
dev_df = train_df.sample(frac=0.3, random_state=42)
train_set = train_df.drop(dev_df.index)
train_set.reset_index(drop=True, inplace=True)
dev_df.reset_index(drop=True, inplace=True)
train_set.to_csv('/home/ubuntu/acoustic        # Pad the inner lists to have the same number of sequences
_stuff/hindi-acoustic-word-embedding/dataset/train_set.csv', index=False)
dev_df.to_csv('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/dev_set.csv', index=False)



In [9]:
dev_df['transcript'].head()

0      और
1      की
2     गोल
3    मछली
4      से
Name: transcript, dtype: object

In [10]:
train_set.head()

Unnamed: 0.1,Unnamed: 0,audio_path,transcript
0,1,0116_003_segment_1.wav,अपने
1,2,0116_003_segment_2.wav,पेट
2,7,0116_003_segment_7.wav,गरमगरम
3,9,0116_003_segment_9.wav,हड़पते
4,10,0128_003_segment_0.wav,मुनिया


In [11]:
import Levenshtein
import pandas as pd 
import random 
import os 

def edit_distance(word1,word2):
    distance=Levenshtein.distance(word1,word2)
    return distance

def sample_words_with_lev_scores(df, lev_score, word):
    filtered_words = df[df.apply(lambda row: edit_distance(row['transcript'], word), axis=1) == lev_score]
    
    if len(filtered_words) >= 2:
        sampled_words = random.sample(list(filtered_words['transcript']), 2)
    else:
        sampled_words = list(filtered_words['transcript'])
    
    return sampled_words

def sample_dev_words(path):
    df=pd.read_csv(path)
    sampled_list=[]
    
    for i in range(len(df)):
        out_dict={}
        for j in range(len(df)):
            lev_distance=edit_distance(df['transcript'][i],df['transcript'][j])
            if lev_distance in out_dict:
                out_dict[lev_distance].append(df['transcript'][j])
            else:
                out_dict[lev_distance]=[df['transcript'][j]]
        
        sampled_list.append(out_dict)
    """"for i in range(len(df)):
        score=0
        out_dict={}
        count_list=[]
        while len(count_list)<8:
            lev_score_i_words = sample_words_with_lev_scores(df, score, df['transcript'][i])
            out_dict[score]=lev_score_i_words
            
            count_list+=lev_score_i_words
            score+=1

        sampled_list.append(out_dict)"""

    df['sampled_words']=sampled_list

    root_path=path.split('dataset/')[0] + 'dataset/'
    save_path=os.path.join(root_path,"sampled_devset.csv")

    df.to_csv(save_path)

    return df 

In [12]:
from sampler import sample_negatives

sampled_trainset=sample_negatives('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_set.csv')
sampled_dev_set=sample_dev_words('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/dev_set.csv')

# Dev Loader

In [13]:
import numpy as np 
import random, time, operator 
import os 
import torch 
from utils import _load_vocab
from utils import load_audio
from torch.utils.data import Dataset,DataLoader
import librosa
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import librosa.display
import ast 

VOCAB_DICT=_load_vocab()

In [14]:
class MultiviewDevDataset(Dataset):

    def __init__(self,csv_file,n_mfcc=13):
        self.data=pd.read_csv(csv_file)
        self.dir_path=os.path.dirname(csv_file)
        self.vocab_dict=VOCAB_DICT
        self.n_mfcc=n_mfcc

    def __len__(self):
        return len(self.data)
    
    def char_to_idx(self,transcript):
        
        one_hot=torch.zeros(len(transcript),len(self.vocab_dict))
        for i,char in enumerate(transcript):
            one_hot[i,self.vocab_dict[char]]=1 
        
        return one_hot
    
    def compute_mfcc(self,audio_path):

        y,sr=librosa.load(audio_path)

        n_fft = min(2048, len(y))
        hop_length = n_fft // 4

        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc, n_fft=n_fft, hop_length=hop_length)

        width = min(9, mfccs.shape[1])
        if width < 3:
            width = 3
        
        width = min(width, mfccs.shape[1])

        if width % 2 == 0:
            width -= 1

        
        delta1=librosa.feature.delta(mfccs,order=1,width=width)
        delta2=librosa.feature.delta(mfccs,order=2,width=width)

        mfccs_combined=np.concatenate((mfccs,delta1,delta2),axis=0)

        return mfccs_combined
    
    def __getitem__(self,idx):
        
        audio_path_x1=self.data["audio_path"][idx]
        audio_path_x1=os.path.join(self.dir_path,str(audio_path_x1))

        #mfcc 
        audio_mfcc=self.compute_mfcc(audio_path_x1)

        sample_dict=ast.literal_eval(self.data["sampled_words"][idx])
        lev_scores=[]
        for score in sample_dict.keys():
            for _ in range(len(sample_dict[score])):
                lev_scores.append(score)

        one_hot=[]
        for transcripts in sample_dict.values():
            for transcript in transcripts:
                one_hot.append(self.char_to_idx(transcript))
        
        output_tensor=[torch.tensor(audio_mfcc),one_hot,torch.tensor(lev_scores)]

        return output_tensor

In [15]:
dev_dataset=MultiviewDevDataset('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/sampled_devset.csv')

In [25]:
dev_dataset[0][2]


tensor([0, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6, 1, 8, 9])

In [50]:
def pad_sequence(sequences, batch_first=True, padding_value=0):
    return torch.nn.utils.rnn.pad_sequence(sequences, batch_first=batch_first, padding_value=padding_value)

def pad_mfccs(mfccs, max_len):
    padded_mfccs = []
    for mfcc in mfccs:
        # Padding to the right with zeros
        pad_width = max_len - mfcc.shape[1]
        padded_mfcc = torch.nn.functional.pad(mfcc, (0, pad_width), 'constant', 0)
        padded_mfccs.append(padded_mfcc)
    return torch.stack(padded_mfccs)

def pad_list_of_lists(batch_of_sequences, padding_value=0):
    padded_batch = []
    for sequences in batch_of_sequences:
        sequences = [torch.tensor(seq) for seq in sequences]
        padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=padding_value)
        padded_batch.append(padded_sequences)
    
    return torch.stack(padded_batch)

In [30]:
out_tensor=pad_sequence(dev_dataset[0][1])


In [31]:
out_tensor.shape

torch.Size([18, 9, 83])

In [52]:
def dev_collate_fn(batch):
    
    mfccs_x1=[]
    one_hot=[]
    lev_scores=[]

    for item in batch:
        mfcc_x1,oh,lev_score=item[0],item[1],item[2]
        mfccs_x1.append(mfcc_x1)
        one_hot.append(oh)
        lev_scores.append(lev_score)
    
    max_mfcc_len_x1=max(mfcc.shape[1] for mfcc in mfccs_x1)
    mfccs_x1=pad_mfccs(mfccs_x1,max_mfcc_len_x1)

    one_hot=pad_list_of_lists(one_hot)

    results={"mfcc":mfccs_x1,"sampled_one_hot":one_hot,"lev_scores":torch.stack(lev_scores)}

    return results

In [55]:
dev_loader=DataLoader(dev_dataset, batch_size=2, collate_fn=dev_collate_fn)


In [56]:
batches=iter(dev_loader)
batch=next(batches)

  sequences = [torch.tensor(seq) for seq in sequences]


In [58]:
batch["sampled_one_hot"].shape

torch.Size([2, 18, 9, 83])

In [61]:
from utils import _load_config

In [62]:
from model import MultiViewRNN

config_file=_load_config()
model=MultiViewRNN(config_file)

In [63]:
model

MultiViewRNN(
  (net): ModuleDict(
    (view1): RNN_default(
      (rnn): LSTM(39, 512, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
    )
    (view2): RNN_default(
      (rnn): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
    )
  )
)

In [75]:
input_tensor=batch["sampled_one_hot"]
input_tensor=input_tensor.view(input_tensor.shape[0]*18,input_tensor.shape[2],input_tensor.shape[3])

In [76]:
input_tensor.shape

torch.Size([36, 9, 83])

In [81]:
input_one_hot={"view2_c1":input_tensor}
out_one_hot=model(input_one_hot)["c1"]

In [82]:
out_one_hot.shape

torch.Size([36, 1024])

In [83]:
out_one_hot=out_one_hot.view(2,18,1024)

In [84]:
out_one_hot.shape 

torch.Size([2, 18, 1024])

In [86]:
mfcc=batch["mfcc"]
mfcc=mfcc.view(-1,mfcc.shape[2],mfcc.shape[1])
mfcc_input={"view1_x1":mfcc}
audio_emb=model(mfcc_input)["x1"]


In [88]:
audio_emb.shape

torch.Size([2, 1024])

In [89]:
sampled_text_emb=out_one_hot

In [91]:
sampled_text_emb.shape

torch.Size([2, 18, 1024])

In [92]:
audio_emb=audio_emb.unsqueeze(1)

In [99]:
audio_emb=audio_emb.squeeze(1)
audio_emb.shape

torch.Size([2, 1024])

In [94]:
from metrics import ranked_batch_ap

In [98]:
audio_emb.shape

torch.Size([2, 1, 1024])

In [100]:
import torch.nn.functional as F 

normalized_audio_embeddings = F.normalize(audio_emb, p=2, dim=1)  
normalized_text_embeddings = F.normalize(sampled_text_emb, p=2, dim=2)

In [101]:
expanded_audio_embeddings = normalized_audio_embeddings.unsqueeze(1)
cosine_similarities = torch.sum(expanded_audio_embeddings * normalized_text_embeddings, dim=2)

In [102]:
cosine_similarities.shape 

torch.Size([2, 18])

In [103]:
indices=torch.argsort(cosine_similarities,dim=1)

In [104]:
indices.shape

torch.Size([2, 18])

In [105]:
indices

tensor([[12,  5,  6, 16,  1, 14,  4,  7, 15,  8,  2, 11,  3, 13, 17,  0, 10,  9],
        [11,  7,  6,  9, 10,  8, 14, 15,  2,  3,  1, 17,  5,  0,  4, 16, 12, 13]])

In [109]:
lev_score=torch.stack(batch["lev_scores"])

In [110]:
ap=ranked_batch_ap(lev_score,indices)

In [111]:
ap

0.24358975887298584

In [1]:
import pandas as pd 
from dataset import MultiviewDevDataset
dataset_path='/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_aligned_dataset/sampled_devset.csv'
dataset=MultiviewDevDataset(dataset_path)

In [8]:
len(dataset[0][1][19])

3

In [17]:
import torch 
batch=[[torch.randn(2,83),torch.randn(3,83)],[torch.randn(2,83),torch.randn(4,83)],[torch.randn(1,83),torch.randn(5,83)]]



In [19]:
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_batch_sequence(batch, padding_value=0):
    padded_batch = []

    max_seq_length = max(len(seq) for sequences in batch for seq in sequences)
    max_num_sequences = max(len(sequences) for sequences in batch)
    
    for sequences in batch:
        padded_sequences = [
            torch.nn.functional.pad(torch.tensor(seq), (0, 0, 0, max_seq_length - len(seq)), 'constant', padding_value)
            for seq in sequences
        ]
        
        padded_sequences = pad_sequence(padded_sequences, batch_first=True, padding_value=padding_value)
        
        if len(padded_sequences) < max_num_sequences:
            padding = torch.full((max_num_sequences - len(padded_sequences), max_seq_length, padded_sequences.shape[2]), padding_value)
            padded_sequences = torch.cat((padded_sequences, padding), dim=0)
        
        padded_batch.append(padded_sequences)
    
    stacked_tensor = torch.stack(padded_batch)

    return stacked_tensor

In [20]:
padded=pad_batch_sequence(batch)

  torch.nn.functional.pad(torch.tensor(seq), (0, 0, 0, max_seq_length - len(seq)), 'constant', padding_value)


In [21]:
padded.shape

torch.Size([3, 2, 5, 83])

In [1]:
from dataset import get_dev_loader

dev_loader=get_dev_loader('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_aligned_dataset/sampled_devset.csv',32)

In [2]:
batch=iter(dev_loader)


In [3]:
batch_element=next(batch)

  torch.nn.functional.pad(torch.tensor(seq), (0, 0, 0, max_seq_length - len(seq)), 'constant', padding_value)


In [7]:
batch_element['lev_scores'].shape

torch.Size([32, 20])

In [8]:
from model import MultiViewRNN
from utils import _load_config
config_file=_load_config()
model=MultiViewRNN(config_file)

In [9]:
mfcc=batch_element["mfcc"]
mfcc=mfcc.view(-1,mfcc.shape[2],mfcc.shape[1])
mfcc_input={"view1_x1":mfcc}
audio_emb=model(mfcc_input)["x1"] 

input_text_tensor=batch_element["sampled_one_hot"]
batch_size=input_text_tensor.shape[0]
sampled_shape=input_text_tensor.shape[1]
input_text_tensor=input_text_tensor.view(input_text_tensor.shape[0]*sampled_shape,
                                                        input_text_tensor.shape[2],
                                                        input_text_tensor.shape[3])
input_one_hot={"view2_c1":input_text_tensor}
out_one_hot=model(input_one_hot)["c1"]
text_emb=out_one_hot.view(batch_size,
                                    sampled_shape,
                                    out_one_hot.shape[1])
lev_distances=batch_element["lev_scores"]

In [10]:
audio_emb.shape

torch.Size([32, 1024])

In [11]:
text_emb.shape

torch.Size([32, 20, 1024])

In [12]:
lev_distances.shape

torch.Size([32, 20])

In [20]:
import torch  
import torch.nn.functional as F 

def get_indices(audio_embedding,text_embedding):
    
    normalized_audio_embeddings = F.normalize(audio_embedding, p=2, dim=1)
    normalized_text_embeddings = F.normalize(text_embedding, p=2, dim=2)

    expanded_audio_embeddings = normalized_audio_embeddings.unsqueeze(1)

    cosine_similarities = torch.sum(expanded_audio_embeddings * normalized_text_embeddings, dim=2)

    del normalized_audio_embeddings
    del normalized_text_embeddings
    del expanded_audio_embeddings
    torch.cuda.empty_cache() 

    indices=torch.argsort(cosine_similarities,dim=1)

    del cosine_similarities
    torch.cuda.empty_cache()

    return indices 

"""def crossview_ap(audio_embedding,text_embedding,lev_distances):

    indices=get_indices(audio_embedding=audio_embedding,text_embedding=text_embedding)
    
    average_precission=ranked_batch_ap(lev_distances,indices)

    return average_precission

def ranked_batch_ap(lev_distances, cosine_ranks):
    
    relevant_ranks = cosine_ranks.masked_select(lev_distances == 0).sort()[0]
    device=relevant_ranks.device

    pos_indices = torch.arange(1, relevant_ranks.size(0) + 1,device=device).float()

    precision_at_k = pos_indices / (relevant_ranks.float() + 1)

    batch_ap = precision_at_k.sum() / relevant_ranks.size(0)
    return batch_ap.item()"""

'def crossview_ap(audio_embedding,text_embedding,lev_distances):\n\n    indices=get_indices(audio_embedding=audio_embedding,text_embedding=text_embedding)\n    \n    average_precission=ranked_batch_ap(lev_distances,indices)\n\n    return average_precission\n\ndef ranked_batch_ap(lev_distances, cosine_ranks):\n    \n    relevant_ranks = cosine_ranks.masked_select(lev_distances == 0).sort()[0]\n    device=relevant_ranks.device\n\n    pos_indices = torch.arange(1, relevant_ranks.size(0) + 1,device=device).float()\n\n    precision_at_k = pos_indices / (relevant_ranks.float() + 1)\n\n    batch_ap = precision_at_k.sum() / relevant_ranks.size(0)\n    return batch_ap.item()'

In [66]:
import torch  
import torch.nn.functional as F 

def crossview_ap(audio_embedding,text_embedding,lev_distances):

    normalized_audio_embeddings = F.normalize(audio_embedding, p=2, dim=1)
    normalized_text_embeddings = F.normalize(text_embedding, p=2, dim=2)

    expanded_audio_embeddings = normalized_audio_embeddings.unsqueeze(1)
    
    cosine_similarities = torch.sum(expanded_audio_embeddings * normalized_text_embeddings, dim=2)

    #freeing the memory 
    del normalized_audio_embeddings
    del normalized_text_embeddings
    del expanded_audio_embeddings
    torch.cuda.empty_cache() 

    indices=torch.argsort(cosine_similarities,dim=1)

    average_precission=ranked_batch_ap(lev_distances,indices)

    del cosine_similarities
    torch.cuda.empty_cache() 

    return average_precission



In [67]:
ranked_ap=crossview_ap(audio_embedding=audio_emb,text_embedding=text_emb,lev_distances=lev_distances)

In [68]:
ranked_ap

0.13431194424629211

In [31]:
indices=get_indices(audio_embedding=audio_emb,text_embedding=text_emb)

In [32]:
indices 

tensor([[15,  5, 16,  3, 17,  9,  1,  0, 12, 13, 19, 11, 14,  6,  2, 10,  8, 18,
          7,  4],
        [17, 14, 12, 19,  5, 15,  3,  9,  8, 18,  6, 11,  2, 10, 16, 13,  4,  1,
          0,  7],
        [13, 17,  0,  3, 14,  6,  5,  7,  2, 19, 16,  1, 18,  9, 15,  8, 12,  4,
         10, 11],
        [ 2,  0, 11,  4,  7, 13,  5, 10, 12,  6, 18,  8, 14,  1, 19, 15, 16, 17,
          3,  9],
        [10, 18, 17,  9, 14,  7, 19,  1,  5,  0,  4, 15, 12, 16,  3,  2, 11, 13,
          8,  6],
        [15, 13,  7,  6,  9, 17,  5, 18,  3, 16,  8,  4,  2, 12,  1, 14, 10,  0,
         19, 11],
        [ 5,  1, 14, 18,  6, 19,  8, 13,  7, 10,  4,  2,  3,  0, 15,  9, 17, 16,
         11, 12],
        [11,  3,  2, 16,  0,  9,  7,  5, 15, 12, 17,  8,  1,  6, 10, 13,  4, 19,
         14, 18],
        [14, 13, 17,  3,  6, 11,  1,  5,  0, 19,  8,  7, 18,  2,  9,  4, 10, 12,
         15, 16],
        [13,  3, 17, 12,  2, 15, 16, 18,  6,  5, 14,  4,  1, 11, 10,  0, 19,  9,
          8,  7],
        [1

In [33]:
lev_distances

tensor([[0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 0, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5],
        [0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
        [0, 0, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
        [0, 0, 1, 2,

In [36]:
relevant_ranks = indices.masked_select(lev_distances == 0)


In [37]:
relevant_ranks

tensor([15,  5, 17, 14, 13, 17,  2, 10, 15, 13,  5, 11,  3, 14, 13,  3, 14, 16,
         6,  6, 12,  8, 16, 19, 12,  2, 16, 12, 13, 16, 12, 18,  6, 12,  5, 18,
        10, 16, 12, 14, 10,  7, 15, 12, 10,  2, 14,  3, 19, 10, 19,  3, 14])

In [38]:
relevant_ranks.size(0)

53

In [39]:
pos_indices=torch.arange(1,relevant_ranks.size(0)+1).float()

In [40]:
pos_indices

tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
        15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,
        29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42.,
        43., 44., 45., 46., 47., 48., 49., 50., 51., 52., 53.])

In [56]:
rel_ranks=indices[0].masked_select(lev_distances[0]==0).sort(0)[0]

In [57]:
pos_indices=torch.arange(1,rel_ranks.size(0)+1).float()

In [58]:
pos_indices

tensor([1., 2.])

In [59]:
precision_at_k = pos_indices / (rel_ranks.float() + 1)


In [60]:
precision_at_k

tensor([0.1667, 0.1250])

In [65]:
def ranked_batch_ap(lev_distances, cosine_ranks):

    batch_ap=0.0
    num_elements=lev_distances.size(0)

    for i in range(num_elements):
        
        relevant_ranks=cosine_ranks[i].masked_select(lev_distances[i]==0).sort()[0]
        if relevant_ranks.numel()==0:
            continue 

        pos_indices=torch.arange(1,relevant_ranks.size(0)+1,device=relevant_ranks.device).float()
        precision_at_k=pos_indices/(relevant_ranks+1)

        average_precission_i=precision_at_k.sum()/relevant_ranks.size(0)
        batch_ap+=average_precission_i
    
    if num_elements>0:
        batch_ap/=num_elements
    
    return batch_ap.item()