In [1]:
import os
import torch
import torch.nn as nn
import torch

import numpy as np

import torchvision.transforms as transforms
from torchvision.utils import save_image

from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torch.autograd import Variable
import torch.autograd as autograd
import dycomutils
from sentence_transformers import SentenceTransformer
import pandas as pd
import torch.nn.functional as F
import math


import os
import numpy as np
import matplotlib.pyplot as plt
import neptune
from dotenv import load_dotenv
import dycomutils
plt.rcParams.update({'font.size': 24})

import warnings
warnings.filterwarnings('ignore')

# Torch
import torch
import torchaudio
#from torch.utils import tensorboard
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.nn as nn
from typing import Tuple, List
from tqdm import tqdm

# Load settings from a local .env file (if present) and record whether a
# CUDA device can be used.
load_dotenv()
cuda = bool(torch.cuda.is_available())

# run = neptune.init_run(
#     project="Botz/Audio-MI",
#     name="sinc-net-training",
#     api_token=os.getenv("NEPTUNE_API_TOKEN")
# )

In [2]:
def flip(x, dim):
    """Return a copy of ``x`` with the entries along ``dim`` reversed.

    Parameters
    ----------
    x : `torch.Tensor`
        Tensor to reverse.
    dim : `int`
        Axis to reverse; negative values count from the last axis.

    Returns
    -------
    `torch.Tensor`
        Tensor with the same shape, dtype and device as ``x``.
    """
    # torch.flip runs on whatever device ``x`` lives on, so no index tensor
    # has to be created (and possibly placed on a different GPU) by hand.
    return torch.flip(x, dims=[dim])


def sinc(band,t_right):
    """Evaluate ``sin(2*pi*band*t) / (2*pi*band*t)`` on a symmetric time axis.

    Parameters
    ----------
    band : scalar or `torch.Tensor`
        Frequency multiplied with the time axis.
    t_right : `torch.Tensor` (n,)
        Strictly positive time instants (the right half of the axis); a zero
        entry would divide by zero.

    Returns
    -------
    `torch.Tensor` (2 * n + 1,)
        Mirrored left half, a centre tap equal to 1, then the right half.
        The result lives on the same device as ``t_right``.
    """
    angle = 2*math.pi*band*t_right
    y_right = torch.sin(angle)/angle
    y_left = torch.flip(y_right, dims=[0])

    # The value at t = 0 is 1; build it next to the other taps instead of
    # forcing it onto a CUDA device, so CPU inputs work as well.
    center = torch.ones(1, device=y_right.device, dtype=y_right.dtype)

    return torch.cat([y_left, center, y_right])
    

class SincConv_fast(nn.Module):
    """Sinc-based convolution

    For every output channel only two values are learned: a lower cut-off
    frequency and a bandwidth. The band-pass filter taps are rebuilt from
    them on every forward pass.

    Parameters
    ----------
    out_channels : `int`
        Number of filters.
    kernel_size : `int`
        Filter length. Even values are increased by one so that the filters
        are symmetric around a centre tap.
    sample_rate : `int`, optional
        Sample rate. Defaults to 16000.
    in_channels : `int`, optional
        Number of input channels. Must be 1.
    stride, padding, dilation : `int`, optional
        Forwarded to `torch.nn.functional.conv1d`.
    bias : `bool`, optional
        Must be False.
    groups : `int`, optional
        Values above 1 are rejected.
    min_low_hz : `int`, optional
        Offset added to the absolute value of the learned lower cut-off.
    min_band_hz : `int`, optional
        Offset added to the absolute value of the learned bandwidth.

    Raises
    ------
    ValueError
        If ``in_channels != 1``, ``bias`` is truthy or ``groups > 1``.

    Usage
    -----
    See `torch.nn.Conv1d`
    Reference
    ---------
    Mirco Ravanelli, Yoshua Bengio,
    "Speaker Recognition from raw waveform with SincNet".
    https://arxiv.org/abs/1808.00158
    """

    @staticmethod
    def to_mel(hz):
        """Convert a frequency in Hz to the mel scale."""
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        """Convert a mel-scale value back to Hz (inverse of `to_mel`)."""
        return 700 * (10 ** (mel / 2595) - 1)

    def __init__(self, out_channels, kernel_size, sample_rate=16000, in_channels=1,
                 stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50):

        super(SincConv_fast,self).__init__()

        if in_channels != 1:
            msg = "SincConv only support one input channel (here, in_channels = {%i})" % (in_channels)
            raise ValueError(msg)

        self.out_channels = out_channels
        self.kernel_size = kernel_size

        # Forcing the filters to be odd (i.e, perfectly symmetrics)
        if kernel_size%2==0:
            self.kernel_size=self.kernel_size+1

        self.stride = stride
        self.padding = padding
        self.dilation = dilation

        if bias:
            raise ValueError('SincConv does not support bias.')
        if groups > 1:
            raise ValueError('SincConv does not support groups.')

        self.sample_rate = sample_rate
        self.min_low_hz = min_low_hz
        self.min_band_hz = min_band_hz

        # initialize filterbanks such that they are equally spaced in Mel scale
        low_hz = 30
        high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)

        mel = np.linspace(self.to_mel(low_hz),
                          self.to_mel(high_hz),
                          self.out_channels + 1)
        hz = self.to_hz(mel)

        # filter lower frequency (out_channels, 1)
        self.low_hz_ = nn.Parameter(torch.Tensor(hz[:-1]).view(-1, 1))

        # filter frequency band (out_channels, 1)
        self.band_hz_ = nn.Parameter(torch.Tensor(np.diff(hz)).view(-1, 1))

        # Hamming window, computing only half of it (the filter is symmetric).
        n_lin=torch.linspace(0, (self.kernel_size/2)-1, steps=int((self.kernel_size/2)))
        window=0.54-0.46*torch.cos(2*math.pi*n_lin/self.kernel_size)

        # Half of the time axis, shape (1, (kernel_size - 1) / 2); the other
        # half follows from symmetry.
        n = (self.kernel_size - 1) / 2.0
        time_axis = 2*math.pi*torch.arange(-n, 0).view(1, -1) / self.sample_rate

        # Registered as buffers so that .to()/.cuda()/.double() move them
        # together with the parameters. persistent=False keeps them out of
        # the state_dict, so checkpoints holding only low_hz_/band_hz_ still
        # load with strict key checking.
        self.register_buffer("window_", window, persistent=False)
        self.register_buffer("n_", time_axis, persistent=False)

    def forward(self, waveforms):
        """
        Parameters
        ----------
        waveforms : `torch.Tensor` (batch_size, 1, n_samples)
            Batch of waveforms.
        Returns
        -------
        features : `torch.Tensor` (batch_size, out_channels, n_samples_out)
            Batch of sinc filters activations.
        """

        # Local device casts: no module state is rewritten during forward.
        n_ = self.n_.to(waveforms.device)
        window_ = self.window_.to(waveforms.device)

        # abs() plus the minimum offsets keep both cut-offs positive and the
        # band non-empty whatever values the optimiser reaches.
        low = self.min_low_hz  + torch.abs(self.low_hz_)

        high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_),self.min_low_hz,self.sample_rate/2)
        band=(high-low)[:,0]

        f_times_t_low = torch.matmul(low, n_)
        f_times_t_high = torch.matmul(high, n_)

        # Equivalent of Eq.4 of the reference paper with the sinc expanded
        # and the terms simplified, which avoids several useless computations.
        band_pass_left=((torch.sin(f_times_t_high)-torch.sin(f_times_t_low))/(n_/2))*window_
        band_pass_center = 2*band.view(-1,1)
        band_pass_right= torch.flip(band_pass_left,dims=[1])

        band_pass=torch.cat([band_pass_left,band_pass_center,band_pass_right],dim=1)

        # Normalise so that the centre tap of every filter equals 1.
        band_pass = band_pass / (2*band[:,None])

        # Kept as an attribute so the filters of the last call can be read
        # back after forward.
        self.filters = (band_pass).view(
            self.out_channels, 1, self.kernel_size)

        return F.conv1d(waveforms, self.filters, stride=self.stride,
                        padding=self.padding, dilation=self.dilation,
                         bias=None, groups=1)


        
        
class sinc_conv(nn.Module):
    """Band-pass filterbank convolution built from differences of sinc filters.

    Each of the ``N_filt`` filters is parameterised by a lower cut-off
    (``filt_b1``) and a bandwidth (``filt_band``), both stored divided by
    the sample rate. The taps are rebuilt on every forward pass on the
    device of the input, so the layer works on CPU as well as on GPU.

    Parameters
    ----------
    N_filt : `int`
        Number of filters.
    Filt_dim : `int`
        Filter length. Must be odd: the taps are assembled as left half,
        centre tap and right half.
    fs : `int` or `float`
        Sample rate in Hz.
    """

    def __init__(self, N_filt,Filt_dim,fs):
        super(sinc_conv,self).__init__()

        # Mel Initialization of the filterbanks
        low_freq_mel = 80
        high_freq_mel = (2595 * np.log10(1 + (fs / 2) / 700))  # Convert Hz to Mel
        mel_points = np.linspace(low_freq_mel, high_freq_mel, N_filt)  # Equally spaced in Mel scale
        f_cos = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz
        # Each filter initially spans from the previous centre to the next one.
        b1=np.roll(f_cos,1)
        b2=np.roll(f_cos,-1)
        b1[0]=30
        b2[-1]=(fs/2)-100

        self.freq_scale=fs*1.0
        self.filt_b1 = nn.Parameter(torch.from_numpy(b1/self.freq_scale))
        self.filt_band = nn.Parameter(torch.from_numpy((b2-b1)/self.freq_scale))

        self.N_filt=N_filt
        self.Filt_dim=Filt_dim
        self.fs=fs

    @staticmethod
    def _sinc(band, t_right):
        """Symmetric sinc: mirrored half, centre tap 1, half; on ``t_right``'s device."""
        angle = 2*math.pi*band*t_right
        y_right = torch.sin(angle)/angle
        center = torch.ones(1, device=y_right.device, dtype=y_right.dtype)
        return torch.cat([torch.flip(y_right, dims=[0]), center, y_right])

    def forward(self, x):
        """Filter ``x`` with the current filterbank.

        Parameters
        ----------
        x : `torch.Tensor` (batch_size, 1, n_samples)
            Input passed to `torch.nn.functional.conv1d`.

        Returns
        -------
        `torch.Tensor` (batch_size, N_filt, n_samples - Filt_dim + 1)
        """
        # Every helper tensor is created on the input's device instead of a
        # hard-coded CUDA device.
        device = x.device

        N=self.Filt_dim
        filters=torch.zeros((self.N_filt,self.Filt_dim), device=device)
        t_right=torch.linspace(1, (N-1)/2, steps=int((N-1)/2), device=device)/self.fs

        min_freq=50.0
        min_band=50.0

        # abs() plus the minimum offsets keep the cut-offs ordered and positive.
        filt_beg_freq=torch.abs(self.filt_b1)+min_freq/self.freq_scale
        filt_end_freq=filt_beg_freq+(torch.abs(self.filt_band)+min_band/self.freq_scale)

        n=torch.linspace(0, N, steps=N, device=device)

        # Filter window (hamming)
        window=(0.54-0.46*torch.cos(2*math.pi*n/N)).float()

        for i in range(self.N_filt):
            beg = filt_beg_freq[i].float()
            end = filt_end_freq[i].float()

            # A band-pass filter is the difference of two low-pass filters.
            low_pass1 = 2*beg*self._sinc(beg*self.freq_scale,t_right)
            low_pass2 = 2*end*self._sinc(end*self.freq_scale,t_right)
            band_pass=(low_pass2-low_pass1)

            band_pass=band_pass/torch.max(band_pass)

            filters[i,:]=band_pass*window

        out=F.conv1d(x, filters.view(self.N_filt,1,self.Filt_dim))

        return out
    

def act_fun(act_type):
    """Build the activation module named by ``act_type``.

    Parameters
    ----------
    act_type : `str`
        One of ``"relu"``, ``"tanh"``, ``"sigmoid"``, ``"leaky_relu"``
        (negative slope 0.2), ``"elu"``, ``"softmax"`` (a ``LogSoftmax``
        over dim 1) or ``"linear"``.

    Returns
    -------
    `torch.nn.Module`
        A freshly constructed activation module.

    Raises
    ------
    ValueError
        If ``act_type`` is not one of the supported names. Returning
        ``None`` here would only postpone the failure to the first forward
        pass, with a far less helpful message.
    """
    factories = {
        "relu": nn.ReLU,
        "tanh": nn.Tanh,
        "sigmoid": nn.Sigmoid,
        "leaky_relu": lambda: nn.LeakyReLU(0.2),
        "elu": nn.ELU,
        "softmax": lambda: nn.LogSoftmax(dim=1),
        # initialized like this, but not used in forward!
        "linear": lambda: nn.LeakyReLU(1),
    }

    try:
        factory = factories[act_type]
    except (KeyError, TypeError):
        raise ValueError(
            "Unknown activation %r; expected one of %s"
            % (act_type, ", ".join(sorted(factories)))
        )

    return factory()
            
            
class LayerNorm(nn.Module):
    """Normalise the last axis of the input with a learned scale and shift.

    Parameters
    ----------
    features : `int` or sequence of `int`
        Shape of the learned ``gamma`` (initialised to ones) and ``beta``
        (initialised to zeros) tensors.
    eps : `float`, optional
        Added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return ``gamma * (x - mean) / (std + eps) + beta`` over the last axis."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centered / spread + self.beta


class MLP(nn.Module):
    """Stack of fully connected layers configured through an ``options`` dict.

    Keys read from ``options``:

    * ``input_dim`` - size of the input features (converted with ``int``).
    * ``fc_lay`` - list with the output width of every layer.
    * ``fc_drop`` - list with the dropout probability of every layer.
    * ``fc_use_batchnorm`` / ``fc_use_laynorm`` - per-layer flags choosing
      the normalisation applied after the linear map.
    * ``fc_use_laynorm_inp`` / ``fc_use_batchnorm_inp`` - flags enabling a
      normalisation of the raw input.
    * ``fc_act`` - list with the activation name of every layer; the name
      ``'linear'`` means that no activation is applied.
    """

    def __init__(self, options):
        super(MLP, self).__init__()

        self.input_dim = int(options['input_dim'])
        self.fc_lay = options['fc_lay']
        self.fc_drop = options['fc_drop']
        self.fc_use_batchnorm = options['fc_use_batchnorm']
        self.fc_use_laynorm = options['fc_use_laynorm']
        self.fc_use_laynorm_inp = options['fc_use_laynorm_inp']
        self.fc_use_batchnorm_inp = options['fc_use_batchnorm_inp']
        self.fc_act = options['fc_act']

        # Per-layer building blocks; index i of every list belongs to layer i.
        self.wx = nn.ModuleList([])
        self.bn = nn.ModuleList([])
        self.ln = nn.ModuleList([])
        self.act = nn.ModuleList([])
        self.drop = nn.ModuleList([])

        # Optional normalisation of the raw input.
        if self.fc_use_laynorm_inp:
            self.ln0 = LayerNorm(self.input_dim)

        if self.fc_use_batchnorm_inp:
            self.bn0 = nn.BatchNorm1d([self.input_dim], momentum=0.05)

        self.N_fc_lay = len(self.fc_lay)

        fan_in = self.input_dim

        for idx, width in enumerate(self.fc_lay):
            self.drop.append(nn.Dropout(p=self.fc_drop[idx]))
            self.act.append(act_fun(self.fc_act[idx]))

            # Both normalisations are always built; forward picks the one
            # selected by the flags.
            self.ln.append(LayerNorm(width))
            self.bn.append(nn.BatchNorm1d(width, momentum=0.05))

            normalised = self.fc_use_laynorm[idx] or self.fc_use_batchnorm[idx]
            self.wx.append(nn.Linear(fan_in, width, bias=not normalised))

            # Small uniform weight initialisation and zero bias. Note that
            # the bias is assigned here even when the layer was built with
            # bias=False.
            bound = np.sqrt(0.01/(fan_in + width))
            self.wx[idx].weight = torch.nn.Parameter(
                torch.Tensor(width, fan_in).uniform_(-bound, bound))
            self.wx[idx].bias = torch.nn.Parameter(torch.zeros(width))

            fan_in = width

    def _layer(self, i, x, norm, apply_act):
        """Run layer ``i``: linear map, optional ``norm``, optional activation, dropout."""
        h = self.wx[i](x)
        if norm is not None:
            h = norm(h)
        if apply_act:
            h = self.act[i](h)
        return self.drop[i](h)

    def forward(self, x):
        """Apply the optional input normalisation and then every layer in order."""
        if bool(self.fc_use_laynorm_inp):
            x = self.ln0((x))

        if bool(self.fc_use_batchnorm_inp):
            x = self.bn0((x))

        for i in range(self.N_fc_lay):
            apply_act = self.fc_act[i] != 'linear'

            # The three checks are independent on purpose: they mirror the
            # per-flag behaviour layer by layer.
            if self.fc_use_laynorm[i]:
                x = self._layer(i, x, self.ln[i], apply_act)

            if self.fc_use_batchnorm[i]:
                x = self._layer(i, x, self.bn[i], apply_act)

            if self.fc_use_batchnorm[i]==False and self.fc_use_laynorm[i]==False:
                x = self._layer(i, x, None, apply_act)

        return x



class SincNet(nn.Module):
    """Convolutional feature extractor whose first layer is a `SincConv_fast`.

    Keys read from ``options``:

    * ``input_dim`` - number of samples per input waveform.
    * ``fs`` - sample rate handed to the sinc layer.
    * ``cnn_N_filt`` / ``cnn_len_filt`` / ``cnn_max_pool_len`` - per-layer
      number of filters, filter length and max-pooling length.
    * ``cnn_act`` / ``cnn_drop`` - per-layer activation name and dropout
      probability.
    * ``cnn_use_laynorm`` / ``cnn_use_batchnorm`` - per-layer flags choosing
      the normalisation applied after pooling.
    * ``cnn_use_laynorm_inp`` / ``cnn_use_batchnorm_inp`` - flags enabling a
      normalisation of the raw input.

    After construction ``out_dim`` holds the size of the flattened output
    produced for one input waveform.
    """

    def __init__(self,options):
        super(SincNet,self).__init__()

        self.cnn_N_filt=options['cnn_N_filt']
        self.cnn_len_filt=options['cnn_len_filt']
        self.cnn_max_pool_len=options['cnn_max_pool_len']

        self.cnn_act=options['cnn_act']
        self.cnn_drop=options['cnn_drop']

        self.cnn_use_laynorm=options['cnn_use_laynorm']
        self.cnn_use_batchnorm=options['cnn_use_batchnorm']
        self.cnn_use_laynorm_inp=options['cnn_use_laynorm_inp']
        self.cnn_use_batchnorm_inp=options['cnn_use_batchnorm_inp']

        self.input_dim=int(options['input_dim'])

        self.fs=options['fs']

        self.N_cnn_lay=len(options['cnn_N_filt'])
        self.conv  = nn.ModuleList([])
        self.bn  = nn.ModuleList([])
        self.ln  = nn.ModuleList([])
        self.act = nn.ModuleList([])
        self.drop = nn.ModuleList([])

        if self.cnn_use_laynorm_inp:
            self.ln0=LayerNorm(self.input_dim)

        if self.cnn_use_batchnorm_inp:
            self.bn0=nn.BatchNorm1d([self.input_dim],momentum=0.05)

        current_input=self.input_dim

        for i in range(self.N_cnn_lay):

            N_filt=int(self.cnn_N_filt[i])

            # dropout
            self.drop.append(nn.Dropout(p=self.cnn_drop[i]))

            # activation
            self.act.append(act_fun(self.cnn_act[i]))

            # Length of the time axis after the (unpadded) convolution and
            # the max pooling of this layer.
            pooled_len=int((current_input-self.cnn_len_filt[i]+1)/self.cnn_max_pool_len[i])

            # layer norm over the whole (channels, time) map
            self.ln.append(LayerNorm([N_filt,pooled_len]))

            # BatchNorm1d only takes the channel count. The pooled length
            # used to be passed as second positional argument, where it was
            # interpreted as ``eps`` and swamped the variance.
            self.bn.append(nn.BatchNorm1d(N_filt,momentum=0.05))

            if i==0:
                self.conv.append(SincConv_fast(self.cnn_N_filt[0],self.cnn_len_filt[0],self.fs))
            else:
                self.conv.append(nn.Conv1d(self.cnn_N_filt[i-1], self.cnn_N_filt[i], self.cnn_len_filt[i]))

            current_input=pooled_len

        self.out_dim=current_input*N_filt

    def forward(self, x):
        """Extract features from a batch of waveforms.

        Parameters
        ----------
        x : `torch.Tensor` (batch_size, n_samples)
            Raw waveforms; a channel axis of size 1 is inserted internally.

        Returns
        -------
        `torch.Tensor` (batch_size, -1)
            Output of the last layer flattened per example.
        """
        batch=x.shape[0]
        seq_len=x.shape[1]

        if bool(self.cnn_use_laynorm_inp):
            x=self.ln0((x))

        if bool(self.cnn_use_batchnorm_inp):
            x=self.bn0((x))

        x=x.view(batch,1,seq_len)

        for i in range(self.N_cnn_lay):
            pool_len=self.cnn_max_pool_len[i]

            if self.cnn_use_laynorm[i]:
                feat=self.conv[i](x)
                if i==0:
                    # Only this branch rectifies the sinc-layer output
                    # before pooling.
                    feat=torch.abs(feat)
                x = self.drop[i](self.act[i](self.ln[i](F.max_pool1d(feat, pool_len))))

            if self.cnn_use_batchnorm[i]:
                x = self.drop[i](self.act[i](self.bn[i](F.max_pool1d(self.conv[i](x), pool_len))))

            if self.cnn_use_batchnorm[i]==False and self.cnn_use_laynorm[i]==False:
                x = self.drop[i](self.act[i](F.max_pool1d(self.conv[i](x), pool_len)))

        x = x.view(batch,-1)

        return x
   
def str_to_bool(s):
    """Translate the literal ``'True'`` / ``'False'`` into the matching bool.

    Raises
    ------
    ValueError
        For every other value (the comparison is case-sensitive).
    """
    for literal, value in (('True', True), ('False', False)):
        if s == literal:
            return value
    raise ValueError

    
   

In [3]:
RESAMPLE_RATE = 6000


def _csv(raw, cast):
    """Split a comma-separated setting and convert every field with ``cast``."""
    return [cast(field) for field in raw.split(',')]


# Settings below that stay plain strings are converted where they are used
# (e.g. ``int(fs)``).
fs=f"{RESAMPLE_RATE}"
cw_len="1024"
cw_shift="10"

# %%
#[cnn]
cnn_N_filt=_csv("100,80,80", int)
cnn_len_filt=_csv("251,5,5", int)
cnn_max_pool_len=_csv("3,3,3", int)
cnn_use_laynorm_inp=str_to_bool("True")
cnn_use_batchnorm_inp=str_to_bool("False")
cnn_use_laynorm=_csv("True,True,True", str_to_bool)
cnn_use_batchnorm=_csv("False,False,False", str_to_bool)
cnn_act=_csv("leaky_relu,leaky_relu,leaky_relu", str)
cnn_drop=_csv("0.1,0.1,0.1", float)


#[dnn]
fc_lay=_csv("2048,2048,2048", int)
fc_drop=_csv("0.1,0.1,0.1", float)
fc_use_laynorm_inp=str_to_bool("True")
fc_use_batchnorm_inp=str_to_bool("False")
fc_use_batchnorm=_csv("True,True,True", str_to_bool)
fc_use_laynorm=_csv("False,False,False", str_to_bool)
fc_act=_csv("leaky_relu,leaky_relu,leaky_relu", str)

#[class]
class_lay=_csv("2", int)
class_drop=_csv("0.0", float)
class_use_laynorm_inp=str_to_bool("False")
class_use_batchnorm_inp=str_to_bool("False")
class_use_batchnorm=_csv("False", str_to_bool)
class_use_laynorm=_csv("False", str_to_bool)
class_act=_csv("softmax", str)

# Left as strings, exactly as written.
lr="0.0004"
batch_size="128"
N_epochs="1500"
N_batches="800"
N_eval_epoch="8"
seed="1234"

In [4]:
# Number of samples per input window: used as the CNN `input_dim` and as the
# length of the centre crop taken from longer audio before classification.
wlen = 4000

In [5]:
# %%
# Feature extractor CNN
CNN_arch = dict(
    input_dim=wlen,
    fs=int(fs),
    cnn_N_filt=cnn_N_filt,
    cnn_len_filt=cnn_len_filt,
    cnn_max_pool_len=cnn_max_pool_len,
    cnn_use_laynorm_inp=cnn_use_laynorm_inp,
    cnn_use_batchnorm_inp=cnn_use_batchnorm_inp,
    cnn_use_laynorm=cnn_use_laynorm,
    cnn_use_batchnorm=cnn_use_batchnorm,
    cnn_act=cnn_act,
    cnn_drop=cnn_drop,
)

CNN_net = SincNet(CNN_arch).cuda()

# Fully connected stack fed with the flattened CNN features.
DNN1_arch = dict(
    input_dim=CNN_net.out_dim,
    fc_lay=fc_lay,
    fc_drop=fc_drop,
    fc_use_batchnorm=fc_use_batchnorm,
    fc_use_laynorm=fc_use_laynorm,
    fc_use_laynorm_inp=fc_use_laynorm_inp,
    fc_use_batchnorm_inp=fc_use_batchnorm_inp,
    fc_act=fc_act,
)

DNN1_net = MLP(DNN1_arch).cuda()

# Classification layer(s) on top of the last fully connected layer.
DNN2_arch = dict(
    input_dim=fc_lay[-1],
    fc_lay=class_lay,
    fc_drop=class_drop,
    fc_use_batchnorm=class_use_batchnorm,
    fc_use_laynorm=class_use_laynorm,
    fc_use_laynorm_inp=class_use_laynorm_inp,
    fc_use_batchnorm_inp=class_use_batchnorm_inp,
    fc_act=class_act,
)

DNN2_net = MLP(DNN2_arch).cuda()

# %%
# Shape check on a random batch of two windows.
inp = torch.randn(2,  wlen).cuda()
out1 = CNN_net(inp)
print(out1.shape)

print(CNN_net.out_dim)
out2 = DNN1_net(out1)
print(out2.shape)

# %%
pout = DNN2_net(DNN1_net(CNN_net(inp)))
print(pout.shape)

KeyboardInterrupt: 

In [None]:
class FullSincNet(nn.Module):
    """SincNet feature extractor followed by two MLP stages in one module.

    Apart from the constructor arguments, the architecture is taken from the
    module-level settings (``cnn_*``, ``fc_*`` and ``class_*``) defined
    earlier in the notebook; they must exist before an instance is built.

    Parameters
    ----------
    num_class : `int`
        Width of the final layer.
    fs : `int`, optional
        Sample rate handed to the sinc layer. Defaults to 6000.
    wlen : `int`, optional
        Number of samples per input waveform. Defaults to 4000.

    Attributes
    ----------
    id_map
        ``None`` until `set_weights` is called, then the
        ``'speaker_to_id_map'`` entry of the loaded checkpoint.
    """

    def __init__(self, num_class, fs=6000, wlen=4000):
        super(FullSincNet, self).__init__()

        # Feature extractor CNN
        self.CNN_arch = {
            'input_dim': wlen,
            'fs': int(fs),
            'cnn_N_filt': cnn_N_filt,
            'cnn_len_filt': cnn_len_filt,
            'cnn_max_pool_len':cnn_max_pool_len,
            'cnn_use_laynorm_inp': cnn_use_laynorm_inp,
            'cnn_use_batchnorm_inp': cnn_use_batchnorm_inp,
            'cnn_use_laynorm':cnn_use_laynorm,
            'cnn_use_batchnorm':cnn_use_batchnorm,
            'cnn_act': cnn_act,
            'cnn_drop':cnn_drop,
        }

        self.CNN_net = SincNet(self.CNN_arch)

        # Fully connected stack fed with the flattened CNN features.
        self.DNN1_arch = {
            'input_dim': self.CNN_net.out_dim,
            'fc_lay': fc_lay,
            'fc_drop': fc_drop,
            'fc_use_batchnorm': fc_use_batchnorm,
            'fc_use_laynorm': fc_use_laynorm,
            'fc_use_laynorm_inp': fc_use_laynorm_inp,
            'fc_use_batchnorm_inp':fc_use_batchnorm_inp,
            'fc_act': fc_act,
        }

        self.DNN1_net=MLP(self.DNN1_arch)

        # Output stage: a single layer with ``num_class`` units.
        self.DNN2_arch = {'input_dim':fc_lay[-1] ,
                'fc_lay': [num_class],
                'fc_drop': class_drop,
                'fc_use_batchnorm': class_use_batchnorm,
                'fc_use_laynorm': class_use_laynorm,
                'fc_use_laynorm_inp': class_use_laynorm_inp,
                'fc_use_batchnorm_inp':class_use_batchnorm_inp,
                'fc_act': class_act,
                }

        self.DNN2_net=MLP(self.DNN2_arch)

        # Defined up front so that reading it before set_weights() yields
        # None instead of an AttributeError.
        self.id_map = None

    def set_weights(self,weights_path):
        """Load the three sub-network state dicts and the label map from a checkpoint.

        Parameters
        ----------
        weights_path : `str`
            File holding a dict with the keys ``'CNN_net'``, ``'DNN1_net'``,
            ``'DNN2_net'`` and ``'speaker_to_id_map'``.
        """
        # map_location="cpu" lets checkpoints saved on a GPU load on any
        # machine; load_state_dict copies the values into the parameters on
        # whatever device the module currently uses.
        # NOTE: weights_only=False unpickles arbitrary objects - only load
        # checkpoints from a trusted source.
        _weights = torch.load(weights_path, map_location="cpu", weights_only=False)
        self.CNN_net.load_state_dict(_weights['CNN_net'])
        self.DNN1_net.load_state_dict(_weights['DNN1_net'])
        self.DNN2_net.load_state_dict(_weights['DNN2_net'])

        self.id_map = _weights['speaker_to_id_map']

    def forward(self, x):
        """Run the CNN and both MLP stages on ``x`` and return the final output."""
        x = self.CNN_net(x)
        x = self.DNN1_net(x)
        x = self.DNN2_net(x)
        return x

In [None]:
# One network per speaker attribute; the numbers are the output widths
# (2 for gender, 3 for age group, 11 for accent).
gender_classfier, age_classfier, accent_classfier = (
    FullSincNet(num_class=n_labels) for n_labels in (2, 3, 11)
)

In [None]:
# Restore every classifier (weights and label map) from its checkpoint.
for _classifier, _checkpoint in (
    (gender_classfier, "/home/desild/work/academic/sem3/TrustworthyML-assignment/tacotron2/vctk/models/SINCNET_GENDER/20251129_160107/checkpoint.pth"),
    (age_classfier, "/home/desild/work/academic/sem3/TrustworthyML-assignment/tacotron2/vctk/models/SINCNET_AGEg/20251129_184806/checkpoint.pth"),
    (accent_classfier, "/home/desild/work/academic/sem3/TrustworthyML-assignment/tacotron2/vctk/models/SINCNET_ACCENTS/20251129_173841/checkpoint.pth"),
):
    _classifier.set_weights(_checkpoint)

In [None]:
# Move every classifier to the GPU and switch it to evaluation mode.
for _classifier in (gender_classfier, age_classfier, accent_classfier):
    _classifier.cuda()
    _classifier.eval()

In [None]:
# Checkpoint of the speaker-recognition model. map_location="cpu" keeps its
# tensors off the GPU and lets the file load on machines without CUDA.
# NOTE: weights_only=False unpickles arbitrary objects - only load
# checkpoints from a trusted source.
sr_weights = torch.load(
    "/home/desild/work/academic/sem3/TrustworthyML-assignment/tacotron2/vctk/models/SINCNET_SR/20251129_142613/checkpoint.pth",
    map_location="cpu",
    weights_only=False,
)

In [None]:
speaker_ids = sr_weights['speaker_to_id_map']

# Invert the map: class index -> speaker id (as an int).
id2speaker = {}
for _speaker, _class_id in speaker_ids.items():
    id2speaker[_class_id] = int(_speaker)


In [None]:
id2speaker

{0: 227,
 1: 244,
 2: 245,
 3: 248,
 4: 249,
 5: 251,
 6: 252,
 7: 253,
 8: 256,
 9: 261,
 10: 264,
 11: 268,
 12: 274,
 13: 275,
 14: 281,
 15: 288,
 16: 292,
 17: 293,
 18: 294,
 19: 295,
 20: 298,
 21: 301,
 22: 304,
 23: 306,
 24: 307,
 25: 311,
 26: 312,
 27: 314,
 28: 316,
 29: 323,
 30: 326,
 31: 335,
 32: 347,
 33: 374,
 34: 376}

In [None]:
# Per-speaker attributes as {speaker_id: {"AGEg": ..., "GENDER": ..., "ACCENTS": ...}}.
speaker_data = pd.read_csv("/home/desild/work/academic/sem3/TrustworthyML-assignment/data/raw/vctk/train_data_top.csv")
speaker_data = (
    speaker_data[["speaker_id", "AGEg", "GENDER", "ACCENTS"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .set_index("speaker_id")
    .to_dict(orient="index")
)
speaker_data

{227: {'AGEg': '28<', 'GENDER': 'M', 'ACCENTS': 'English'},
 244: {'AGEg': '<20', 'GENDER': 'F', 'ACCENTS': 'English'},
 245: {'AGEg': '20-28', 'GENDER': 'M', 'ACCENTS': 'Irish'},
 248: {'AGEg': '20-28', 'GENDER': 'F', 'ACCENTS': 'Indian'},
 249: {'AGEg': '<20', 'GENDER': 'F', 'ACCENTS': 'Scottish'},
 251: {'AGEg': '20-28', 'GENDER': 'M', 'ACCENTS': 'Indian'},
 252: {'AGEg': '<20', 'GENDER': 'M', 'ACCENTS': 'Scottish'},
 253: {'AGEg': '<20', 'GENDER': 'F', 'ACCENTS': 'Welsh'},
 256: {'AGEg': '20-28', 'GENDER': 'M', 'ACCENTS': 'English'},
 261: {'AGEg': '20-28', 'GENDER': 'F', 'ACCENTS': 'NorthernIrish'},
 264: {'AGEg': '20-28', 'GENDER': 'F', 'ACCENTS': 'Scottish'},
 268: {'AGEg': '20-28', 'GENDER': 'F', 'ACCENTS': 'English'},
 274: {'AGEg': '<20', 'GENDER': 'M', 'ACCENTS': 'English'},
 275: {'AGEg': '20-28', 'GENDER': 'M', 'ACCENTS': 'Scottish'},
 281: {'AGEg': '28<', 'GENDER': 'M', 'ACCENTS': 'Scottish'},
 288: {'AGEg': '<20', 'GENDER': 'F', 'ACCENTS': 'Irish'},
 292: {'AGEg': '20-28

In [None]:
gender_classfier.id_map

{'F': 0, 'M': 1}

In [None]:
age_classfier.id_map

{'20-28': 0, '28<': 1, '<20': 2}

In [None]:
accent_classfier.id_map

{'American': 0,
 'Australian': 1,
 'Canadian': 2,
 'English': 3,
 'Indian': 4,
 'Irish': 5,
 'NewZealand': 6,
 'NorthernIrish': 7,
 'Scottish': 8,
 'SouthAfrican': 9,
 'Welsh': 10}

In [None]:
import copy

DATA_FOL = "/home/desild/work/academic/sem3/TrustworthyML-assignment/tacotron2/vctk/inverted_samples"

# --- 1. Load every saved result --------------------------------------------
# Directory layout: DATA_FOL/<init_types>/<speaker_type>/<inst_id>/<method file>
full_data = []
for init_types in os.listdir(DATA_FOL):
    for speaker_type in os.listdir(os.path.join(DATA_FOL, init_types)):
        for inst_id in os.listdir(os.path.join(DATA_FOL, init_types, speaker_type)):
            inst_dir = os.path.join(DATA_FOL, init_types, speaker_type, inst_id)
            for method in os.listdir(inst_dir):
                path = os.path.join(inst_dir, method)
                load_data = torch.load(path)

                if "inverted_sample_gan" in method and "best_audio" in load_data.keys():
                    # These files keep the waveform under "best_audio" and
                    # use "best_x" for something else; rename so that
                    # "best_x" is the waveform for every record and the
                    # former "best_x" is kept as "best_z".
                    payload = copy.deepcopy(load_data)
                    payload["best_x"] = load_data["best_audio"]
                    payload["best_z"] = load_data["best_x"]
                    del payload["best_audio"]
                else:
                    payload = load_data

                full_data.append({
                    "init_types": init_types,
                    "speaker_type": int(speaker_type),
                    "inst_id": inst_id,
                    # e.g. "<prefix>_<method>.<ext>" -> "<method>"
                    "method": method.split("_")[-1].split(".")[0],
                    "path": path,
                    **payload
                })

# --- 2. Classify every sample ----------------------------------------------
# (result-key suffix, classifier, column of speaker_data with the true label)
_attribute_heads = (
    ("gender", gender_classfier, "GENDER"),
    ("age", age_classfier, "AGEg"),
    ("accent", accent_classfier, "ACCENTS"),
)
_half_window = wlen // 2

# Pure inference: no_grad avoids building an autograd graph for every sample.
with torch.no_grad():
    for dinstance in tqdm(full_data, desc="entity"):
        best_x = dinstance["best_x"]
        n_samples = best_x.shape[-1]

        # Longer signals are cropped to a window around their centre;
        # anything else is passed through unchanged.
        if n_samples > wlen:
            center = n_samples // 2
            x = best_x[:, center - _half_window:center + _half_window]
        else:
            x = best_x

        speaker_info = speaker_data[id2speaker[dinstance["speaker_type"]]]

        for _name, _classifier, _column in _attribute_heads:
            scores = _classifier(x).detach().cpu().numpy()
            dinstance[f"pred_prob_{_name}"] = scores
            dinstance[f"pred_{_name}"] = np.argmax(scores, axis=1)[0]
            dinstance[f"true_{_name}"] = _classifier.id_map[speaker_info[_column]]

entity: 100%|██████████| 3150/3150 [00:17<00:00, 176.01it/s]


In [None]:
# One row per entry of `full_data`; the dict keys become the columns.
full_data_df = pd.DataFrame(full_data)

In [None]:
# Keep the bookkeeping, label and timing columns. The explicit .copy()
# makes sel_df an independent frame, so columns can be added to it later
# without pandas treating the assignment as a write into a slice of
# full_data_df.
sel_df = full_data_df[["init_types", "speaker_type", "inst_id", "method", "true_gender", "pred_gender",   "true_age", "pred_age",  "true_accent", "pred_accent","time_taken"]].copy()
sel_df

Unnamed: 0,init_types,speaker_type,inst_id,method,true_gender,pred_gender,true_age,pred_age,true_accent,pred_accent,time_taken
0,zeros,8,1,gan,1,1,0,1,3,0,13.171387
1,zeros,8,1,std,1,0,0,1,3,7,1.324532
2,zeros,8,1,sliding,1,0,0,1,3,0,17.878018
3,zeros,8,1,gan-std-transfer,1,0,0,1,3,0,45.970984
4,zeros,8,1,gan-normaud-transfer,1,0,0,1,3,0,43.903828
...,...,...,...,...,...,...,...,...,...,...,...
3145,white_noise_tanh,7,2,gan-normaud,0,0,2,0,10,4,10.064466
3146,white_noise_tanh,7,2,gan-normaud-750epochs,0,0,2,0,10,9,10.719225
3147,white_noise_tanh,7,2,gan-std,0,0,2,0,10,2,17.994435
3148,white_noise_tanh,7,2,gan-transfer,0,0,2,1,10,2,7.602250


In [None]:
# 1 where the prediction matches the true label, else 0. assign() builds a
# new frame instead of writing columns into what may be a slice of another
# frame (the pattern behind pandas' SettingWithCopyWarning).
sel_df = sel_df.assign(
    correct_gender=(sel_df["true_gender"] == sel_df["pred_gender"]).astype(int),
    correct_age=(sel_df["true_age"] == sel_df["pred_age"]).astype(int),
    correct_accent=(sel_df["true_accent"] == sel_df["pred_accent"]).astype(int),
)

In [None]:
# Mean and standard deviation of every metric per (init_types, method) group.
sel_df.groupby(["init_types", "method"]).agg(
    {
        _metric: ["mean", "std"]
        for _metric in ("correct_gender", "correct_age", "correct_accent", "time_taken")
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,correct_gender,correct_gender,correct_age,correct_age,correct_accent,correct_accent,time_taken,time_taken
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
init_types,method,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
laplace,gan,0.609524,0.490197,0.219048,0.415585,0.066667,0.25064,17.044098,10.779227
laplace,gan-750epochs,0.495238,0.502375,0.4,0.492248,0.07619,0.266575,15.490664,9.106895
laplace,gan-normaud,0.47619,0.501828,0.266667,0.444338,0.095238,0.294951,19.053647,15.467319
laplace,gan-normaud-750epochs,0.52381,0.501828,0.390476,0.490197,0.085714,0.281284,15.757564,9.122201
laplace,gan-normaud-transfer,0.485714,0.502193,0.142857,0.351605,0.066667,0.25064,21.890042,17.536383
laplace,gan-std,0.657143,0.476941,0.266667,0.444338,0.114286,0.319684,26.08191,9.880595
laplace,gan-std-transfer,0.533333,0.50128,0.161905,0.37013,0.066667,0.25064,29.085286,15.965088
laplace,gan-transfer,0.495238,0.502375,0.161905,0.37013,0.066667,0.25064,21.570164,15.859191
laplace,sliding,0.52381,0.501828,0.161905,0.37013,0.114286,0.319684,26.497959,2.910481
laplace,std,0.571429,0.497245,0.209524,0.408921,0.07619,0.266575,8.988685,1.276582
