In [1]:
import pandas as pd 

In [2]:
from model import MultiViewRNN
from utils import _load_config,load_checkpoint,_load_vocab
import torch.optim as optim 

config_file=_load_config()
model=MultiViewRNN(config_file)

In [3]:
optimizer = optim.Adam(model.parameters(), lr=config_file["lr"])

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch 
def load_checkpoint(filepath, model, optimizer=None):
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer:
        optimizer.load_state_dict(checkpoint['optimizer'])
    step = checkpoint['step']
    return model, optimizer, step

checkpoint_path='/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/checkpoints2/checkpoint_epoch_97.pth.tar'
model,optimizer,step=load_checkpoint(checkpoint_path,model,optimizer)

In [5]:
model

MultiViewRNN(
  (net): ModuleDict(
    (view1): RNN_default(
      (rnn): LSTM(39, 512, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
    )
    (view2): RNN_default(
      (rnn): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
    )
  )
)

In [6]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0003
    maximize: False
    weight_decay: 0
)

In [11]:
step

32219

In [7]:
DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=model.to(DEVICE)

In [31]:
model

MultiViewRNN(
  (net): ModuleDict(
    (view1): RNN_default(
      (rnn): LSTM(39, 512, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
    )
    (view2): RNN_default(
      (rnn): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
    )
  )
)

In [8]:
vocab_dict=_load_vocab()
def char_to_index(transcript):
    one_hot=torch.zeros(len(transcript),len(vocab_dict))
    for i,char in enumerate(transcript):
        one_hot[i,vocab_dict[char]]=1 
        
    return one_hot

In [9]:
import librosa
import numpy as np 
def compute_mfcc(audio_path,n_mfcc=13):
    y,sr=librosa.load(audio_path)

    n_fft = min(2048, len(y))
    hop_length = n_fft // 4

    mfccs = librosa.feature.mfcc(y=y, sr=sr,n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)

    width = min(9, mfccs.shape[1])
    if width < 3:
        width = 3
        
    width = min(width, mfccs.shape[1])

    if width % 2 == 0:
        width -= 1

        
    delta1=librosa.feature.delta(mfccs,order=1,width=width)
    delta2=librosa.feature.delta(mfccs,order=2,width=width)

    mfccs_combined=np.concatenate((mfccs,delta1,delta2),axis=0)
    

    return torch.tensor(mfccs_combined)

In [10]:
def prepare_input_audio(audio_path):
    
    mfcc=compute_mfcc(audio_path)
    mfcc=mfcc.view(mfcc.shape[1],mfcc.shape[0])
    mfcc=mfcc.to(DEVICE)
    mfcc=mfcc.unsqueeze(0)
    return {"view1_x1":mfcc}

def prepare_input_transcript(transcript):

    one_hot=char_to_index(transcript=transcript)
    one_hot=one_hot.to(DEVICE)
    one_hot=one_hot.unsqueeze(0)

    return {"view2_c1":one_hot}


In [14]:
csv_path='/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_aligned_dataset/sampled_devset.csv'
df=pd.read_csv(csv_path)

In [14]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,audio_path,transcript,duration,sampled_words
0,0,4934,844424931200572-977-m_seg_1.wav,को,1.726-1.926 sec,"{0: ['को', 'को'], 1: ['हो', 'का'], 2: ['है', '..."
1,1,4013,844424930703499-329-f_seg_7.wav,हो,2.551-2.673 sec,"{0: ['हो', 'हो'], 1: ['को', 'है'], 2: ['एक', '..."
2,2,1056,844424933458787-572-m_seg_8.wav,है,2.560-2.926 sec,"{0: ['है', 'है'], 1: ['हो', 'ही'], 2: ['को', '..."
3,3,6619,844424933457774-1191-m_seg_0.wav,बैजू,0.000-1.893 sec,"{0: ['बैजू'], 2: ['बैठक', 'बैंक'], 3: ['है', '..."
4,4,17194,844424931170952-193-f_seg_2.wav,दैनिक,1.048-1.290 sec,"{0: ['दैनिक'], 3: ['दोनों', 'दिनों', 'देना', '..."


In [11]:
df_train=pd.read_csv('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_aligned_dataset/sampled_trainset.csv')

In [12]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,audio_path,transcript,duration,word_length,distances,negative_samples
0,20196,844424933460441-934-m_seg_4.wav,देशविशेष,2.296-2.940 sec,8,"[0, 6, 8, 8, 8, 7, 7, 9, 8, 8, 7, 8, 8, 9, 7, ...","[('844424932719302-126-m_seg_7.wav', 'दैनिक'),..."
1,11092,844424932719302-126-m_seg_7.wav,दैनिक,3.700-4.142 sec,5,"[6, 0, 7, 6, 6, 7, 5, 8, 8, 6, 5, 7, 6, 8, 5, ...","[('844424933460441-934-m_seg_4.wav', 'देशविशेष..."
2,21998,844424931188030-1133-m_seg_7.wav,मुहूर्त,3.252-3.613 sec,7,"[8, 7, 0, 7, 7, 9, 6, 6, 10, 6, 7, 6, 7, 7, 6,...","[('844424933460441-934-m_seg_4.wav', 'देशविशेष..."
3,8962,844424933563590-558-m_seg_3.wav,स्थानीय,1.933-2.537 sec,7,"[8, 6, 7, 0, 5, 8, 7, 8, 10, 6, 6, 8, 6, 7, 7,...","[('844424933460441-934-m_seg_4.wav', 'देशविशेष..."
4,795,844424931282533-277-f_seg_9.wav,ज्यादा,3.379-3.741 sec,6,"[8, 6, 7, 5, 0, 7, 6, 9, 10, 5, 6, 8, 6, 8, 7,...","[('844424933460441-934-m_seg_4.wav', 'देशविशेष..."


In [28]:
import random 
import ast 

choice=random.choice(ast.literal_eval(df_train['negative_samples'][10])
                    )
choice 


('844424932906135-1128-f_seg_1.wav', 'पुलिस')

In [35]:
import os 
root_path=os.path.dirname(csv_path)

audio_path=os.path.join(root_path,df_train['audio_path'][100])
#transcript=choice[1]
transcript=df_train['transcript'][100]

In [36]:
input_audio=prepare_input_audio(audio_path)
input_text=prepare_input_transcript(transcript=transcript)

In [37]:
audio_emb=model(input_audio)

In [38]:
text_emb=model(input_text)


In [39]:
import torch.nn.functional as F
sim=F.cosine_similarity(audio_emb['x1'],text_emb['c1'])

In [40]:
sim

tensor([-0.2302], device='cuda:0', grad_fn=<SumBackward1>)

In [13]:
from sampler import sample_dev_words


In [14]:
path=sample_dev_words('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_aligned_dataset/train_metadata.csv')

100%|██████████| 18844/18844 [58:46<00:00,  5.34it/s] 


dev_sampling complete


Unnamed: 0.1,Unnamed: 0,audio_path,transcript,duration,sampled_words
0,651,844424932602773-645-f_seg_9.wav,घंटी,2.732-3.013 sec,"{0: ['घंटी', 'घंटी'], 1: ['घंटे', 'घाटी'], 2: ..."
1,9070,844424930766406-261-f_seg_7.wav,के,3.498-3.699 sec,"{0: ['के', 'के'], 1: ['को', 'थे'], 2: ['है', '..."
2,20196,844424933460441-934-m_seg_4.wav,देशविशेष,2.296-2.940 sec,"{0: ['देशविशेष'], 3: ['विशेष', 'विशेष', 'विशेष..."
3,177,844424932581205-645-f_seg_10.wav,रवाना,3.508-3.790 sec,"{0: ['रवाना', 'रवाना'], 2: ['रोकना', 'करवाया']..."
4,3,844424930620027-1177-f_seg_3.wav,हिंसा,2.166-2.687 sec,"{0: ['हिंसा', 'हिंसा'], 2: ['हादसा', 'हिस्सा']..."
...,...,...,...,...,...
18839,11964,844424931102635-229-f_seg_8.wav,अपने,2.627-2.870 sec,"{0: ['अपने', 'अपने'], 1: ['सपने', 'अपनी'], 2: ..."
18840,21575,844424933559042-558-m_seg_7.wav,लाभ,5.095-5.416 sec,"{0: ['लाभ', 'लाभ'], 1: ['लाख', 'लाई'], 2: ['बा..."
18841,5390,844424932730849-579-f_seg_9.wav,होने,3.819-4.060 sec,"{0: ['होने', 'होने'], 1: ['होते', 'होना'], 2: ..."
18842,860,844424933525081-390-m_seg_9.wav,अपनी,7.899-8.220 sec,"{0: ['अपनी', 'अपनी'], 1: ['अपने', 'अपना'], 2: ..."


In [17]:
from dataset import get_dev_loader

loader=get_dev_loader('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_aligned_dataset/sampled_trainset(2).csv',batch_size=config_file['dev_batch_size'])



In [37]:
from tqdm import tqdm 
from metrics import crossview_ap,crossview_corr
DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Evaluating Train")
model=model.to(DEVICE)
model.eval()
average_precision=0.0
average_corr=0.0
for idx,batch in tqdm(
                enumerate(loader), total=len(loader), leave=False
            ):
            
                mfcc=batch["mfcc"]
                mfcc=mfcc.view(-1,mfcc.shape[2],mfcc.shape[1])
                mfcc=mfcc.to(DEVICE)
                mfcc_input={"view1_x1":mfcc}
                audio_emb=model(mfcc_input)["x1"] 

                input_text_tensor=batch["sampled_one_hot"]
                batch_size=input_text_tensor.shape[0]
                sampled_shape=input_text_tensor.shape[1]
                input_text_tensor=input_text_tensor.view(input_text_tensor.shape[0]*sampled_shape,
                                                        input_text_tensor.shape[2],
                                                        input_text_tensor.shape[3])
                input_text_tensor=input_text_tensor.to(DEVICE)
                input_one_hot={"view2_c1":input_text_tensor}
                out_one_hot=model(input_one_hot)["c1"]
                text_emb=out_one_hot.view(batch_size,
                                        sampled_shape,
                                        out_one_hot.shape[1])
            
                lev_distances=batch["lev_scores"].to(DEVICE)
            
                ranked_ap=crossview_ap(audio_embedding=audio_emb,
                                    text_embedding=text_emb,
                                    lev_distances=lev_distances)
                ranked_corr=crossview_corr(audio_embedding=audio_emb,
                                        text_embedding=text_emb,
                                        lev_distances=lev_distances)
                average_precision+=ranked_ap
                average_corr+=ranked_corr
            

average_precision=average_precision/len(loader)
average_corr=average_corr/len(loader)


Evaluating Train


                                                 

In [38]:
average_precision

0.23969043265073126

In [22]:
average_corr

0.036177764585042446

In [26]:
import os 
checkpoint_path_list=os.listdir('/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/checkpoints')
root_path='/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/checkpoints'

sample_path_list=checkpoint_path_list[300:]
average_ap_list=[]
average_corr_list=[]

for i in range(len(sample_path_list)):

    model=MultiViewRNN(config_file)
    optimizer = optim.Adam(model.parameters(), lr=config_file["lr"])
    
    model,optimizer,step=load_checkpoint(os.path.join(root_path,sample_path_list[i]),model,optimizer)
    model=model.to(DEVICE)

    print(f"{i}")
    model.eval()
    average_precision=0.0
    average_corr=0.0
    for idx,batch in tqdm(
                enumerate(loader), total=len(loader), leave=False
            ):
            
                mfcc=batch["mfcc"]
                mfcc=mfcc.view(-1,mfcc.shape[2],mfcc.shape[1])
                mfcc=mfcc.to(DEVICE)
                mfcc_input={"view1_x1":mfcc}
                audio_emb=model(mfcc_input)["x1"] 

                input_text_tensor=batch["sampled_one_hot"]
                batch_size=input_text_tensor.shape[0]
                sampled_shape=input_text_tensor.shape[1]
                input_text_tensor=input_text_tensor.view(input_text_tensor.shape[0]*sampled_shape,
                                                        input_text_tensor.shape[2],
                                                        input_text_tensor.shape[3])
                input_text_tensor=input_text_tensor.to(DEVICE)
                input_one_hot={"view2_c1":input_text_tensor}
                out_one_hot=model(input_one_hot)["c1"]
                text_emb=out_one_hot.view(batch_size,
                                        sampled_shape,
                                        out_one_hot.shape[1])
            
                lev_distances=batch["lev_scores"].to(DEVICE)
            
                ranked_ap=crossview_ap(audio_embedding=audio_emb,
                                    text_embedding=text_emb,
                                    lev_distances=lev_distances)
                ranked_corr=crossview_corr(audio_embedding=audio_emb,
                                        text_embedding=text_emb,
                                        lev_distances=lev_distances)
                average_precision+=ranked_ap
                average_corr+=ranked_corr
            

    average_precision=average_precision/len(loader)
    average_corr=average_corr/len(loader)
    
    average_ap_list.append(average_precision)
    average_corr_list.append(average_corr)

0


  torch.nn.functional.pad(torch.tensor(seq), (0, 0, 0, max_seq_length - len(seq)), 'constant', padding_value)
                                                 

1


                                                 

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt 
x=[i for i in range(300,300+len(sample_path_list))]
y=average_ap_list

plt.plot(x,y)
plt.title('Epoch vs Train AP')
plt.xlabel('Epochs')
plt.ylabel('Train_AP_per_epoch')



In [None]:
import matplotlib.pyplot as plt 
y=average_corr_list

plt.plot(x,y)
plt.title('Epoch vs Train corr')
plt.xlabel('Epochs')
plt.ylabel('Train_corr_per_epoch')



In [1]:
from dataset import get_train_loader
from utils import _load_config

config_file=_load_config()

csv_path='/home/ubuntu/acoustic_stuff/hindi-acoustic-word-embedding/dataset/train_aligned_dataset/sampled_trainset.csv'

loader=get_train_loader(csv_file=csv_path,batch_size=config_file['train_batch_size'],loss_fn=config_file['loss_fn'])
batch=next(iter(loader))

In [2]:
batch.keys()

dict_keys(['view1_x1', 'view2_c1', 'view2_c2', 'edit_distance', 'view1_x2'])

In [5]:
len(batch['edit_distance'])


64