In [1]:
import torch
import torch.functional as F
import torch.nn as nn

In [2]:
import os
import pandas as pd
import numpy as np
import librosa
import wave
import matplotlib.pyplot as plt
from IPython.display import Audio

In [3]:
from hyperparameters import HyperParameters as hp

In [4]:
class Encoder(nn.Module):
    def __init__(self, encoder):
        self.encoder = encoder
    
    def load(self, target_network):
        self.encoder.load_state_dict(target_network.state_dict())

    def __call__(self, x):
        return self.encoder(x)

In [5]:
class Prenet(nn.Module):
    def __init__(self, c_in, c_h, c_out, 
            kernel_size, n_conv_blocks, 
            subsample, act, dropout_rate):
        super(Prenet, self).__init__()
        self.act = get_act(act)
        self.subsample = subsample
        self.n_conv_blocks = n_conv_blocks
        self.in_conv_layer = nn.Conv2d(1, c_h, kernel_size=kernel_size)
        self.first_conv_layers = nn.ModuleList([nn.Conv2d(c_h, c_h, kernel_size=kernel_size) for _ \
                in range(n_conv_blocks)])
        self.second_conv_layers = nn.ModuleList([nn.Conv2d(c_h, c_h, kernel_size=kernel_size, stride=sub) 
            for sub, _ in zip(subsample, range(n_conv_blocks))])
        output_size = c_in
        for l, sub in zip(range(n_conv_blocks), self.subsample):
            output_size = ceil(output_size / sub)
        self.out_conv_layer = nn.Conv1d(c_h * output_size, c_out, kernel_size=1)
        self.dropout_layer = nn.Dropout(p=dropout_rate)
        self.norm_layer = nn.InstanceNorm2d(c_h, affine=False)

    def forward(self, x):
        # reshape x to 4D
        x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2))
        out = pad_layer_2d(x, self.in_conv_layer)
        out = self.act(out)
        out = self.norm_layer(out)
        for l in range(self.n_conv_blocks):
            y = pad_layer_2d(out, self.first_conv_layers[l])
            y = self.act(y)
            y = self.norm_layer(y)
            y = self.dropout_layer(y)
            y = pad_layer_2d(y, self.second_conv_layers[l])
            y = self.act(y)
            y = self.norm_layer(y)
            y = self.dropout_layer(y)
            if self.subsample[l] > 1:
                out = F.avg_pool2d(out, kernel_size=self.subsample[l], ceil_mode=True)
            out = y + out
        out = out.contiguous().view(out.size(0), out.size(1) * out.size(2), out.size(3))
        out = pad_layer(out, self.out_conv_layer)
        out = self.act(out)
        return out

In [6]:
speakers = hp.speakers
n_speakers = hp.num_speakers

In [7]:
class ResidualBlock(nn.Module):
    def __init__(self):
        super(ResidualBlock, self).__init__()
        self.block = nn.ModuleList(
        nn.Conv2d(dim_in, dim_out, kernel_size=3, stride=1, padding=1, bias=False),
        nn.InstanceNorm2d(dim_out, affine=True, track_running_stats=True),
        nn.ReLU(inplace=True),
        nn.InstanceNorm2d(dim_out, affine=True, track_running_stats=True))
    
    def forward(self, x):
        return x + block(x)

In [8]:
class Generator(nn.Module):
    def __init__():
        super(Generator, self)
        layers = []

In [8]:
class Conv_Lrelu(nn.Module):
    def __init__(self, in_features, out_features):
        super(Conv_Lrelu, self).__init__()
        self.conv = nn.Conv2d(in_channels=in_features,
                             out_channels=out_features,
                             stride=2,
                             kernel_size=4,
                             padding =1)
        self.lrelu = nn.LeakyReLU(0.01)
    
    def forward(x):
        out = self.conv(x)
        out = self.lrelu(x)
        
        return out

In [9]:
class Discriminator(nn.Module):
    def __init__(self, input_size=(36, 256), conv_dims = 64, num_speakers=hp.num_speakers, n_filters = 5):
        super(Discriminator, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=input_size,
                              out_channels=conv_dims,
                              stride=2,
                              kernel_size=4)
        self.lrelu = nn.LeakyReLU(0.01)
        self.conv_block = nn.ModuleList(
            [Conv_Lrelu(128*pow(2,i), 128*pow(2,i+1)) for i in range(n_filters)])
        kernel_size_0 = int(input_size[0] / np.power(2, n_filters)) # 1
        kernel_size_1 = int(input_size[1] / np.power(2, n_filters)) # 8
        
        self.conv_dis = nn.Conv2d(curr_dim, 1, kernel_size=(kernel_size_0, kernel_size_1), stride=1, padding=0, bias=False) # padding should be 0
        self.conv_clf_spks = nn.Conv2d(curr_dim, num_speakers, kernel_size=(kernel_size_0, kernel_size_1), stride=1, padding=0, bias=False)  # for num_speaker
        
    def forward(self, x):
        h = self.conv1(x)
        h = self.lrelu(h)
        h = self.conv_block(h)
        
        out_src = self.conv_dis(h)
        out_cls_spks = self.conv_clf_spks(h)
        return out_src, out_cls_spks.view(out_cls_spks.size(0), out_cls_spks.size(1))

In [10]:
disc = Discriminator()

TypeError: unsupported operand type(s) for %: 'tuple' and 'int'

In [29]:
librosa.feature.melspectrogram()

ParameterError: Audio data must be of type numpy.ndarray

In [30]:
disc

Discriminator(
  (conv1): Conv2d(234, 64, kernel_size=(3, 3), stride=(2, 2))
  (lrelu): LeakyReLU(negative_slope=0.01)
  (conv_block): ModuleList(
    (0): Conv_Lrelu(
      (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (lrelu): LeakyReLU(negative_slope=0.01)
    )
    (1): Conv_Lrelu(
      (conv): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (lrelu): LeakyReLU(negative_slope=0.01)
    )
    (2): Conv_Lrelu(
      (conv): Conv2d(512, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (lrelu): LeakyReLU(negative_slope=0.01)
    )
    (3): Conv_Lrelu(
      (conv): Conv2d(1024, 2048, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (lrelu): LeakyReLU(negative_slope=0.01)
    )
    (4): Conv_Lrelu(
      (conv): Conv2d(2048, 4096, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (lrelu): LeakyReLU(negative_slope=0.01)
    )
  )
)

In [16]:
pow(2,4)

16