In [1]:
import torch

In [7]:
import matplotlib.pyplot as plt
import librosa
from torch import nn as nn
from torch.functional import F
import pandas as pd
import os
import numpy as np

In [10]:
def pad_layer(inp, layer, pad_type='reflect'):
    """Pad the last dimension of ``inp`` and then apply ``layer`` to it.

    The amount of padding is derived from ``layer.kernel_size[0]`` (k):
    ``k // 2`` elements on the left and, on the right, ``k // 2`` for odd
    kernels or ``k // 2 - 1`` for even kernels, i.e. ``k - 1`` in total.
    For a stride-1, undilated convolution this keeps the length unchanged
    (assumes ``layer`` is ``nn.Conv1d``-like).

    Args:
        inp: input tensor; only its last dimension is padded.
        layer: module exposing ``kernel_size`` as an indexable sequence.
        pad_type: padding mode forwarded to ``F.pad`` (default ``'reflect'``).

    Returns:
        The result of ``layer`` applied to the padded input.
    """
    k = layer.kernel_size[0]
    left = k // 2
    # Even kernels take one element fewer on the right so the total is k - 1.
    right = left if k % 2 else left - 1
    padded = F.pad(inp, pad=(left, right), mode=pad_type)
    return layer(padded)

In [11]:
def pad_layer_2d(inp, layer, pad_type='reflect'):
    """Pad the last two dimensions of ``inp`` and then apply ``layer``.

    Each spatial dimension is padded according to the matching entry of
    ``layer.kernel_size`` (k): ``k // 2`` before and ``k // 2`` (odd k) or
    ``k // 2 - 1`` (even k) after, i.e. ``k - 1`` in total. For a stride-1,
    undilated ``nn.Conv2d`` this keeps both spatial sizes unchanged.

    ``kernel_size`` is read as ``(height, width)``, the ``nn.Conv2d``
    convention. ``F.pad`` consumes its pairs starting from the LAST dimension,
    so the width pair must come first; the previous implementation emitted the
    height pair first, which swapped the padding for non-square kernels.
    Square kernels behave exactly as before.

    Args:
        inp: input tensor; its last two dimensions are padded.
        layer: module exposing a two-element ``kernel_size``.
        pad_type: padding mode forwarded to ``F.pad`` (default ``'reflect'``).

    Returns:
        The result of ``layer`` applied to the padded input.
    """
    k_h, k_w = layer.kernel_size

    def _split(k):
        # (before, after) padding for one dimension; even kernels take one
        # element fewer after so the total is always k - 1.
        before = k // 2
        after = before if k % 2 else before - 1
        return [before, after]

    # F.pad order: (width_left, width_right, height_top, height_bottom).
    pad = tuple(_split(k_w) + _split(k_h))
    padded = F.pad(inp, pad=pad, mode=pad_type)
    return layer(padded)

In [15]:
class SpeakerEncoder(nn.Module):
    """1-D convolutional encoder that maps a sequence to a fixed-size vector.

    Pipeline (see ``forward``): convolution bank -> input convolution ->
    residual convolution blocks (optionally sub-sampling along time) ->
    global average pooling over time -> residual dense blocks -> linear
    output layer.

    Args:
        c_in: number of input channels.
        c_h: number of hidden channels / hidden units.
        c_out: size of the output vector.
        kernel_size: kernel size of the input and block convolutions.
        bank_size: largest kernel size in the convolution bank.
        bank_scale: step between (and smallest of) the bank kernel sizes.
        c_bank: output channels of each bank convolution.
        n_conv_blocks: number of residual convolution blocks.
        n_dense_blocks: number of residual dense blocks.
        subsample: per-conv-block stride; needs at least ``n_conv_blocks``
            entries.
        act: activation. Either a callable (e.g. ``nn.ReLU()`` or
            ``F.relu``) or one of the names ``'relu'`` / ``'lrelu'``.
        dropout_rate: dropout probability used after every activation inside
            the blocks.

    Raises:
        ValueError: if ``subsample`` is shorter than ``n_conv_blocks`` or
            ``act`` is neither callable nor a known activation name.
    """

    def __init__(self, c_in, c_h, c_out, kernel_size,
                 bank_size, bank_scale, c_bank,
                 n_conv_blocks, n_dense_blocks,
                 subsample, act, dropout_rate):
        super().__init__()
        if len(subsample) < n_conv_blocks:
            # Otherwise the zip() below silently builds too few strided
            # convolutions and conv_blocks() fails later with IndexError.
            raise ValueError(
                "subsample needs at least n_conv_blocks entries "
                f"(got {len(subsample)} for n_conv_blocks={n_conv_blocks})")
        self.c_in = c_in
        self.c_h = c_h
        self.c_out = c_out
        self.kernel_size = kernel_size
        self.n_conv_blocks = n_conv_blocks
        self.n_dense_blocks = n_dense_blocks
        self.subsample = subsample
        # Previously never stored, so every self.act(...) call failed.
        self.act = self._resolve_activation(act)

        self.conv_bank = nn.ModuleList([
            nn.Conv1d(c_in, c_bank, kernel_size=k)
            for k in range(bank_scale, bank_size + 1, bank_scale)])
        # One c_bank-channel output per bank convolution plus the c_in input
        # channels (was `bank_size//bank_size`, i.e. always a single bank).
        in_channels = c_bank * len(self.conv_bank) + c_in
        self.in_conv_layer = nn.Conv1d(in_channels, c_h, kernel_size)
        self.first_conv_layers = nn.ModuleList([
            nn.Conv1d(c_h, c_h, kernel_size=kernel_size)
            for _ in range(n_conv_blocks)])
        self.second_conv_layers = nn.ModuleList([
            nn.Conv1d(c_h, c_h, kernel_size=kernel_size, stride=sub)
            for sub, _ in zip(subsample, range(n_conv_blocks))])
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.first_dense_layers = nn.ModuleList([
            nn.Linear(c_h, c_h) for _ in range(n_dense_blocks)])
        self.second_dense_layers = nn.ModuleList([
            nn.Linear(c_h, c_h) for _ in range(n_dense_blocks)])
        self.output_layer = nn.Linear(c_h, c_out)
        self.dropout = nn.Dropout(p=dropout_rate)

    @staticmethod
    def _resolve_activation(act):
        """Return ``act`` if callable, else build the module named by it."""
        if callable(act):
            return act
        named = {'relu': nn.ReLU, 'lrelu': nn.LeakyReLU}
        try:
            return named[act]()
        except (KeyError, TypeError):
            raise ValueError(
                f"act must be callable or one of {sorted(named)}, got {act!r}"
            ) from None

    def conv_blocks(self, inp):
        """Apply the residual convolution blocks to ``inp``.

        Each block runs conv -> act -> dropout twice; the second convolution
        uses stride ``subsample[l]``. When that stride is > 1 the skip path is
        average-pooled with the same factor (``ceil_mode=True``) before the
        residual add.
        """
        out = inp
        for l in range(self.n_conv_blocks):
            y = pad_layer(out, self.first_conv_layers[l])
            y = self.act(y)
            y = self.dropout(y)
            y = pad_layer(y, self.second_conv_layers[l])
            y = self.act(y)
            y = self.dropout(y)
            if self.subsample[l] > 1:
                out = F.avg_pool1d(out, kernel_size=self.subsample[l], ceil_mode=True)
            out = y + out
        return out

    def dense_blocks(self, inp):
        """Apply the residual dense blocks to ``inp``.

        Each block runs linear -> act -> dropout twice and adds the result to
        its own input. With ``n_dense_blocks == 0`` the input is returned
        unchanged.
        """
        out = inp
        for l in range(self.n_dense_blocks):
            y = self.first_dense_layers[l](out)
            y = self.act(y)
            y = self.dropout(y)
            y = self.second_dense_layers[l](y)
            y = self.act(y)
            y = self.dropout(y)
            # Residual add belongs to every block, not just the last one.
            out = y + out
        return out

    def forward(self, x):
        """Encode ``x`` of shape (batch, c_in, time) into (batch, c_out)."""
        # NOTE(review): `conv_bank` (the function) is not defined in the cells
        # visible here; it must be defined elsewhere before forward() is
        # called. Given `in_channels` above, it presumably concatenates the
        # padded bank outputs with `x` along the channel axis -- confirm.
        out = conv_bank(x, self.conv_bank, act=self.act)
        out = pad_layer(out, self.in_conv_layer)
        out = self.act(out)
        out = self.conv_blocks(out)
        # Global average over time, then drop the singleton time axis.
        out = self.pooling(out).squeeze(2)
        out = self.dense_blocks(out)
        out = self.output_layer(out)
        return out