## A deeper dive into the model

This notebook contains the code for creating saliency maps for each speaker condition, i.e. the number of speakers in a sample. The saliency maps visualize which parts of the input the model relies on for each speaker condition.

In [None]:
import tensorflow as tf 
import tensorflow.keras as keras

from model import crnn 
from src import data
from src import error_function
from src.data import DataSet
import wandb
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import glob 
import datetime
import json
import pdb
from model import crnn

In [None]:
# Path to the saved Keras model (.h5) that the saliency analysis below loads.
# NOTE: despite the "_DIR" suffix this is a path to a single file, not a directory.
SAVED_MODEL_DIR = '/vol/tensusers3/camghane/ASR/MLS/mls_experiment1_weights/MLS_exp1_model-best.h5'
# Alternative checkpoint kept for reference; uncomment to analyse that model instead.
# SAVED_MODEL_DIR = 'pretrained_models/model-best-baseline.h5'

In [None]:
# Restore the trained network from disk. `class_mae` is a project-defined
# function, so it has to be handed to Keras through `custom_objects` for the
# saved model to deserialize.
model = keras.models.load_model(
    SAVED_MODEL_DIR,
    custom_objects=dict(class_mae=error_function.class_mae),
)

In [None]:
# Glob pattern (not a plain directory) matching every .wav file; the parent
# directory of each file is used below as its group key.
DATA_DIR = '/vol/tensusers3/camghane/ASR/LibriSpeech_test_clean/data/test-clean/merged/train/*/*.wav'
# NOTE(review): BATCH_SIZE is not referenced in the visible cells — confirm
# whether DataSet should receive it.
BATCH_SIZE = 32

### loading the data
# glob returns matches in arbitrary order; sorting keeps the file order (and
# therefore the batches built from it) reproducible between runs.
filenames = sorted(glob.glob(DATA_DIR))

# One bucket per expected group "0" .. "10", pre-created so that iteration
# order is numeric and every group exists even when it has no files.
filenames_alt = {str(k): [] for k in range(0, 11)}
for filename in filenames:
    group_key = filename.split('/')[-2]
    if group_key not in filenames_alt:
        # Same exception type as the plain dict lookup would raise, but with
        # enough context to find the offending file.
        raise KeyError(
            f"unexpected directory {group_key!r} for file {filename!r}; "
            f"expected one of {list(filenames_alt)}"
        )
    filenames_alt[group_key].append(filename)

In [None]:
# Build, plot and save one accumulated saliency map per group in
# `filenames_alt` (keyed by directory name, used here as the speaker count).
for nr_of_speakers, files_list in filenames_alt.items():
    # Buckets are pre-created for every key, so some may be empty; there is
    # nothing to visualise for those and the code below would fail on them.
    if not files_list:
        continue

    test_dataset = DataSet(files_list, scale_data=True).get_data()

    # Running sum of the per-sample, min-max normalised |gradient| maps.
    saliency_map = np.zeros((500, 201))

    # Accumulate over every batch the dataset yields (previously only the last
    # yielded batch was kept and all earlier ones were discarded).
    for audio_batch, _ in test_dataset.as_numpy_iterator():
        audio = tf.convert_to_tensor(audio_batch)
        with tf.GradientTape() as tape:
            # `audio` is a plain tensor, so it must be watched explicitly.
            tape.watch(audio)
            pred = model(audio, training=False)
            # Score of the predicted (arg-max) class for each sample.
            # Assumes `pred` is (batch, classes) — TODO confirm.
            predicted_class = tf.argmax(pred, axis=1)
            top_scores = tf.gather(pred, predicted_class, axis=1, batch_dims=1)
            # One backward pass for the whole batch instead of one full-batch
            # forward/backward pass per sample. Summing gives each sample the
            # gradient of its own score, assuming the model treats samples
            # independently when training=False — TODO confirm for this model.
            loss = tf.reduce_sum(top_scores)

        grads = np.abs(tape.gradient(loss, audio).numpy())
        for sample_grad in grads:
            # Min-max normalise each sample to [0, 1]; the epsilon avoids a
            # division by zero when the gradient is constant.
            arr_min, arr_max = sample_grad.min(), sample_grad.max()
            grad_eval = (sample_grad - arr_min) / (arr_max - arr_min + 1e-18)
            saliency_map += grad_eval.reshape(500, 201)

    # NOTE(review): the map is a sum over samples but is displayed with
    # vmax=1, so it may saturate when a group has many samples — consider
    # dividing by the sample count if a mean map is intended.
    plt.imshow(saliency_map.T, vmin=0, vmax=1)
    plt.colorbar()
    # The title and filename use the directory key of the group; the loop
    # variable is no longer overwritten with the last sample's label.
    plt.title(f'Saliency map for {nr_of_speakers} speakers')
    plt.ylim([0, 200])
    plt.xlim([0, 500])
    # NOTE(review): axes are indices into the (500, 201) map; confirm that the
    # ms / Hz units in the labels match the feature extraction.
    plt.xlabel('Time (ms)')
    plt.ylabel('Frequency (Hz)')
    plt.savefig(f'english_Multilingual_model_Saliency_{nr_of_speakers}_speakers.png', bbox_inches='tight')
    plt.show()