In [7]:
import pandas as pd 

train_df=pd.read_csv('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/metadata - Sheet1.csv')

In [8]:
dev_df = train_df.sample(frac=0.3, random_state=42)
train_set = train_df.drop(dev_df.index)
train_set.reset_index(drop=True, inplace=True)
dev_df.reset_index(drop=True, inplace=True)
train_set.to_csv('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_set.csv', index=False)
dev_df.to_csv('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/dev_set.csv', index=False)



In [9]:
dev_df['transcript'].head()

0      और
1      की
2     गोल
3    मछली
4      से
Name: transcript, dtype: object

In [10]:
train_set.head()

Unnamed: 0.1,Unnamed: 0,audio_path,transcript
0,1,0116_003_segment_1.wav,अपने
1,2,0116_003_segment_2.wav,पेट
2,7,0116_003_segment_7.wav,गरमगरम
3,9,0116_003_segment_9.wav,हड़पते
4,10,0128_003_segment_0.wav,मुनिया


In [11]:
import Levenshtein
import pandas as pd 
import random 
import os 

def edit_distance(word1,word2):
    distance=Levenshtein.distance(word1,word2)
    return distance

def sample_words_with_lev_scores(df, lev_score, word):
    filtered_words = df[df.apply(lambda row: edit_distance(row['transcript'], word), axis=1) == lev_score]
    
    if len(filtered_words) >= 2:
        sampled_words = random.sample(list(filtered_words['transcript']), 2)
    else:
        sampled_words = list(filtered_words['transcript'])
    
    return sampled_words

def sample_dev_words(path):
    df=pd.read_csv(path)
    sampled_list=[]
    
    for i in range(len(df)):
        out_dict={}
        for j in range(len(df)):
            lev_distance=edit_distance(df['transcript'][i],df['transcript'][j])
            if lev_distance in out_dict:
                out_dict[lev_distance].append(df['transcript'][j])
            else:
                out_dict[lev_distance]=[df['transcript'][j]]
        
        sampled_list.append(out_dict)
    """"for i in range(len(df)):
        score=0
        out_dict={}
        count_list=[]
        while len(count_list)<8:
            lev_score_i_words = sample_words_with_lev_scores(df, score, df['transcript'][i])
            out_dict[score]=lev_score_i_words
            
            count_list+=lev_score_i_words
            score+=1

        sampled_list.append(out_dict)"""

    df['sampled_words']=sampled_list

    root_path=path.split('dataset/')[0] + 'dataset/'
    save_path=os.path.join(root_path,"sampled_devset.csv")

    df.to_csv(save_path)

    return df 

In [12]:
from sampler import sample_negatives

sampled_trainset=sample_negatives('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_set.csv')
sampled_dev_set=sample_dev_words('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/dev_set.csv')

# Dev Loader

In [13]:
import numpy as np 
import random, time, operator 
import os 
import torch 
from utils import _load_vocab
from utils import load_audio
from torch.utils.data import Dataset,DataLoader
import librosa
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import librosa.display
import ast 

VOCAB_DICT=_load_vocab()

In [14]:
class MultiviewDevDataset(Dataset):

    def __init__(self,csv_file,n_mfcc=13):
        self.data=pd.read_csv(csv_file)
        self.dir_path=os.path.dirname(csv_file)
        self.vocab_dict=VOCAB_DICT
        self.n_mfcc=n_mfcc

    def __len__(self):
        return len(self.data)
    
    def char_to_idx(self,transcript):
        
        one_hot=torch.zeros(len(transcript),len(self.vocab_dict))
        for i,char in enumerate(transcript):
            one_hot[i,self.vocab_dict[char]]=1 
        
        return one_hot
    
    def compute_mfcc(self,audio_path):

        y,sr=librosa.load(audio_path)

        n_fft = min(2048, len(y))
        hop_length = n_fft // 4

        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc, n_fft=n_fft, hop_length=hop_length)

        width = min(9, mfccs.shape[1])
        if width < 3:
            width = 3
        
        width = min(width, mfccs.shape[1])

        if width % 2 == 0:
            width -= 1

        
        delta1=librosa.feature.delta(mfccs,order=1,width=width)
        delta2=librosa.feature.delta(mfccs,order=2,width=width)

        mfccs_combined=np.concatenate((mfccs,delta1,delta2),axis=0)

        return mfccs_combined
    
    def __getitem__(self,idx):
        
        audio_path_x1=self.data["audio_path"][idx]
        audio_path_x1=os.path.join(self.dir_path,str(audio_path_x1))

        #mfcc 
        audio_mfcc=self.compute_mfcc(audio_path_x1)

        sample_dict=ast.literal_eval(self.data["sampled_words"][idx])
        lev_scores=[]
        for score in sample_dict.keys():
            for _ in range(len(sample_dict[score])):
                lev_scores.append(score)

        one_hot=[]
        for transcripts in sample_dict.values():
            for transcript in transcripts:
                one_hot.append(self.char_to_idx(transcript))
        
        output_tensor=[torch.tensor(audio_mfcc),one_hot,torch.tensor(lev_scores)]

        return output_tensor

In [15]:
dev_dataset=MultiviewDevDataset('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/sampled_devset.csv')

In [25]:
dev_dataset[0][2]


tensor([0, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6, 1, 8, 9])

In [50]:
def pad_sequence(sequences, batch_first=True, padding_value=0):
    return torch.nn.utils.rnn.pad_sequence(sequences, batch_first=batch_first, padding_value=padding_value)

def pad_mfccs(mfccs, max_len):
    padded_mfccs = []
    for mfcc in mfccs:
        # Padding to the right with zeros
        pad_width = max_len - mfcc.shape[1]
        padded_mfcc = torch.nn.functional.pad(mfcc, (0, pad_width), 'constant', 0)
        padded_mfccs.append(padded_mfcc)
    return torch.stack(padded_mfccs)

def pad_list_of_lists(batch_of_sequences, padding_value=0):
    padded_batch = []
    for sequences in batch_of_sequences:
        sequences = [torch.tensor(seq) for seq in sequences]
        padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=padding_value)
        padded_batch.append(padded_sequences)
    
    return torch.stack(padded_batch)

In [30]:
out_tensor=pad_sequence(dev_dataset[0][1])


In [31]:
out_tensor.shape

torch.Size([18, 9, 83])

In [52]:
def dev_collate_fn(batch):
    
    mfccs_x1=[]
    one_hot=[]
    lev_scores=[]

    for item in batch:
        mfcc_x1,oh,lev_score=item[0],item[1],item[2]
        mfccs_x1.append(mfcc_x1)
        one_hot.append(oh)
        lev_scores.append(lev_score)
    
    max_mfcc_len_x1=max(mfcc.shape[1] for mfcc in mfccs_x1)
    mfccs_x1=pad_mfccs(mfccs_x1,max_mfcc_len_x1)

    one_hot=pad_list_of_lists(one_hot)

    results={"mfcc":mfccs_x1,"sampled_one_hot":one_hot,"lev_scores":torch.stack(lev_scores)}

    return results

In [55]:
dev_loader=DataLoader(dev_dataset, batch_size=2, collate_fn=dev_collate_fn)


In [56]:
batches=iter(dev_loader)
batch=next(batches)

  sequences = [torch.tensor(seq) for seq in sequences]


In [58]:
batch["sampled_one_hot"].shape

torch.Size([2, 18, 9, 83])

In [61]:
from utils import _load_config

In [62]:
from model import MultiViewRNN

config_file=_load_config()
model=MultiViewRNN(config_file)

In [63]:
model

MultiViewRNN(
  (net): ModuleDict(
    (view1): RNN_default(
      (rnn): LSTM(39, 512, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
    )
    (view2): RNN_default(
      (rnn): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
    )
  )
)

In [75]:
input_tensor=batch["sampled_one_hot"]
input_tensor=input_tensor.view(input_tensor.shape[0]*18,input_tensor.shape[2],input_tensor.shape[3])

In [76]:
input_tensor.shape

torch.Size([36, 9, 83])

In [81]:
input_one_hot={"view2_c1":input_tensor}
out_one_hot=model(input_one_hot)["c1"]

In [82]:
out_one_hot.shape

torch.Size([36, 1024])

In [83]:
out_one_hot=out_one_hot.view(2,18,1024)

In [84]:
out_one_hot.shape 

torch.Size([2, 18, 1024])

In [86]:
mfcc=batch["mfcc"]
mfcc=mfcc.view(-1,mfcc.shape[2],mfcc.shape[1])
mfcc_input={"view1_x1":mfcc}
audio_emb=model(mfcc_input)["x1"]


In [88]:
audio_emb.shape

torch.Size([2, 1024])

In [89]:
sampled_text_emb=out_one_hot

In [91]:
sampled_text_emb.shape

torch.Size([2, 18, 1024])

In [92]:
audio_emb=audio_emb.unsqueeze(1)

In [99]:
audio_emb=audio_emb.squeeze(1)
audio_emb.shape

torch.Size([2, 1024])

In [94]:
from metrics import ranked_batch_ap

In [98]:
audio_emb.shape

torch.Size([2, 1, 1024])

In [100]:
import torch.nn.functional as F 

normalized_audio_embeddings = F.normalize(audio_emb, p=2, dim=1)  
normalized_text_embeddings = F.normalize(sampled_text_emb, p=2, dim=2)

In [101]:
expanded_audio_embeddings = normalized_audio_embeddings.unsqueeze(1)
cosine_similarities = torch.sum(expanded_audio_embeddings * normalized_text_embeddings, dim=2)

In [102]:
cosine_similarities.shape 

torch.Size([2, 18])

In [103]:
indices=torch.argsort(cosine_similarities,dim=1)

In [104]:
indices.shape

torch.Size([2, 18])

In [105]:
indices

tensor([[12,  5,  6, 16,  1, 14,  4,  7, 15,  8,  2, 11,  3, 13, 17,  0, 10,  9],
        [11,  7,  6,  9, 10,  8, 14, 15,  2,  3,  1, 17,  5,  0,  4, 16, 12, 13]])

In [109]:
lev_score=torch.stack(batch["lev_scores"])

In [110]:
ap=ranked_batch_ap(lev_score,indices)

In [111]:
ap

0.24358975887298584