In [1]:
import os
import numpy as np
import argparse
import torch
import time
import librosa
import pickle

import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
import pdb

import preprocess
from trainingDataset import trainingDataset, trainingDataset_Paired
from model_VC2 import Generator, Discriminator

In [2]:
# Models
class GLU(nn.Module):
    def __init__(self):
        super(GLU, self).__init__()
        # Custom Implementation because the Voice Conversion Cycle GAN
        # paper assumes GLU won't reduce the dimension of tensor by 2.

    def forward(self, input):
        return input * torch.sigmoid(input)


class up_2Dsample(nn.Module):
    def __init__(self, upscale_factor=2):
        super(up_2Dsample, self).__init__()
        self.scale_factor = upscale_factor

    def forward(self, input):
        h = input.shape[2]
        w = input.shape[3]
        new_size = [h * self.scale_factor, w * self.scale_factor]
        return F.interpolate(input,new_size)
       

class PixelShuffle(nn.Module):
    def __init__(self, upscale_factor=2):
        super(PixelShuffle, self).__init__()
        # Custom Implementation because PyTorch PixelShuffle requires,
        # 4D input. Whereas, in this case we have have 3D array
        self.upscale_factor = upscale_factor

    def forward(self, input):
        n = input.shape[0]
        c_out = input.shape[1] // self.upscale_factor
        w_new = input.shape[2] * self.upscale_factor
        return input.view(n, c_out, w_new)


class ResidualLayer(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(ResidualLayer, self).__init__()
        self.conv1d_layer = nn.Sequential(nn.Conv1d(in_channels=in_channels,
                                                    out_channels=out_channels,
                                                    kernel_size=kernel_size,
                                                    stride=1,
                                                    padding=padding),
                                          nn.InstanceNorm1d(num_features=out_channels,
                                                            affine=True))

        self.conv_layer_gates = nn.Sequential(nn.Conv1d(in_channels=in_channels,
                                                        out_channels=out_channels,
                                                        kernel_size=kernel_size,
                                                        stride=1,
                                                        padding=padding),
                                              nn.InstanceNorm1d(num_features=out_channels,
                                                                affine=True))

        self.conv1d_out_layer = nn.Sequential(nn.Conv1d(in_channels=out_channels,
                                                        out_channels=in_channels,
                                                        kernel_size=kernel_size,
                                                        stride=1,
                                                        padding=padding),
                                              nn.InstanceNorm1d(num_features=in_channels,
                                                                affine=True))

    def forward(self, input):
        #print("input size: ", input.size())
        h1_norm = self.conv1d_layer(input)
        h1_gates_norm = self.conv_layer_gates(input)

        # GLU
        h1_glu = h1_norm * torch.sigmoid(h1_gates_norm)

        h2_norm = self.conv1d_out_layer(h1_glu)
        return input + h2_norm # (B, C, T)


class downSample_Generator(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(downSample_Generator, self).__init__()

        self.convLayer = nn.Sequential(nn.Conv2d(in_channels=in_channels,
                                                 out_channels=out_channels,
                                                 kernel_size=kernel_size,
                                                 stride=stride,
                                                 padding=padding),
                                       nn.InstanceNorm2d(num_features=out_channels,
                                                         affine=True))
        self.convLayer_gates = nn.Sequential(nn.Conv2d(in_channels=in_channels,
                                                       out_channels=out_channels,
                                                       kernel_size=kernel_size,
                                                       stride=stride,
                                                       padding=padding),
                                             nn.InstanceNorm2d(num_features=out_channels,
                                                               affine=True))

    def forward(self, input):
        return self.convLayer(input) * torch.sigmoid(self.convLayer_gates(input))


class upSample_Generator(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(upSample_Generator, self).__init__()

        self.convLayer = nn.Sequential(nn.Conv2d(in_channels=in_channels,
                                                 out_channels=out_channels,
                                                 kernel_size=kernel_size,
                                                 stride=stride,
                                                 padding=padding),
                                       #PixelShuffle(upscale_factor=2),
                                       up_2Dsample(upscale_factor=2),
                                       nn.InstanceNorm2d(num_features=out_channels,
                                                         affine=True))
        self.convLayer_gates = nn.Sequential(nn.Conv2d(in_channels=in_channels,
                                                       out_channels=out_channels,
                                                       kernel_size=kernel_size,
                                                       stride=stride,
                                                       padding=padding),
                                             #PixelShuffle(upscale_factor=2),
                                             up_2Dsample(upscale_factor=2),
                                             nn.InstanceNorm2d(num_features=out_channels,
                                                               affine=True))
    def forward(self, input):        
        return self.convLayer(input) * torch.sigmoid(self.convLayer_gates(input))


class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=128,
                               kernel_size=[5,15],
                               stride=1,
                               padding=[2,7])

        self.conv1_gates = nn.Conv2d(in_channels=1,
                               out_channels=128,
                               kernel_size=[5,15],
                               stride=1,
                               padding=[2,7])

        # Downsample Layer
        self.downSample1 = downSample_Generator(in_channels=128,
                                                out_channels=256,
                                                kernel_size=5,
                                                stride=2,
                                                padding=2)

        self.downSample2 = downSample_Generator(in_channels=256,
                                                out_channels=512,
                                                kernel_size=5,
                                                stride=2,
                                                padding=2)
        #reshape
        self.conv2 = nn.Conv1d(in_channels=3072,
                               out_channels=512,
                               kernel_size=1,
                               stride=1)

        # Residual Blocks
        self.residualLayer1 = ResidualLayer(in_channels=512,
                                            out_channels=1024,
                                            kernel_size=3,
                                            stride=1,
                                            padding=1)
        self.residualLayer2 = ResidualLayer(in_channels=512,
                                            out_channels=1024,
                                            kernel_size=3,
                                            stride=1,
                                            padding=1)
        self.residualLayer3 = ResidualLayer(in_channels=512,
                                            out_channels=1024,
                                            kernel_size=3,
                                            stride=1,
                                            padding=1)
    def forward(self, input):
        # GLU
        input = input.unsqueeze(1)
        conv1 = self.conv1(input) * torch.sigmoid(self.conv1_gates(input))
        # print("shape of conv1, ", conv1.size())
        downsample1 = self.downSample1(conv1)
        # print("shape of downsample1, ", downsample1.size())
        self.downsample2_forshape = self.downSample2(downsample1)
        downsample3 = self.downsample2_forshape.view([self.downsample2_forshape.shape[0],-1,self.downsample2_forshape.shape[3]])
        downsample3 = self.conv2(downsample3)
        # print("shape of downsample3, ", downsample3.size())
        residual_layer_1 = self.residualLayer1(downsample3)
        residual_layer_2 = self.residualLayer2(residual_layer_1)
        # print("shape of residual_layer_2, ", residual_layer_2.size())
        residual_layer_3 = self.residualLayer3(residual_layer_2)
        # print("shape of residual_layer_3, ", residual_layer_3.size())
        return residual_layer_3

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.residualLayer4 = ResidualLayer(in_channels=512,
                                            out_channels=1024,
                                            kernel_size=3,
                                            stride=1,
                                            padding=1)
        self.residualLayer5 = ResidualLayer(in_channels=512,
                                            out_channels=1024,
                                            kernel_size=3,
                                            stride=1,
                                            padding=1)
        self.residualLayer6 = ResidualLayer(in_channels=512,
                                            out_channels=1024,
                                            kernel_size=3,
                                            stride=1,
                                            padding=1)
        #reshape
        self.conv3 = nn.Conv1d(in_channels=512,
                               out_channels=3072,
                               kernel_size=1,
                               stride=1)


        # UpSample Layer
        self.upSample1 = upSample_Generator(in_channels=512,
                                            out_channels=1024,
                                            kernel_size=5,
                                            stride=1,
                                            padding=2)
        
        self.upSample2 = upSample_Generator(in_channels=1024,
                                            out_channels=512,
                                            kernel_size=5,
                                            stride=1,
                                            padding=2)

        self.lastConvLayer = nn.Conv2d(in_channels=512,
                                       out_channels=1,
                                       kernel_size=[5,15],
                                       stride=1,
                                       padding=[2,7])

    def forward(self, input, shapes):
        # GLU
        residual_layer_4 = self.residualLayer4(input)
        residual_layer_5 = self.residualLayer5(residual_layer_4)
        residual_layer_6 = self.residualLayer6(residual_layer_5)
        residual_layer_6 = self.conv3(residual_layer_6)
        residual_layer_6 = residual_layer_6.view([shapes[0],shapes[1],shapes[2],shapes[3]])
        
        upSample_layer_1 = self.upSample1(residual_layer_6)
        upSample_layer_2 = self.upSample2(upSample_layer_1)
        output = self.lastConvLayer(upSample_layer_2)
        output = output.view([output.shape[0],-1,output.shape[3]])
        return output


class CNN_Discriminator(nn.Module):
    def __init__(self, Cin=1024, Tmax=256): # all inputs are padded to Tmax 
        super(CNN_Discriminator, self).__init__()
        self.Tmax = Tmax
        Conv_dim = 256
        self.conv = nn.Conv1d(in_channels=Cin,
                              out_channels=Conv_dim,
                              kernel_size=3,
                              stride=1)
        self.avgPool = nn.AvgPool1d(kernel_size=Tmax-2)
        self.linear = nn.Linear(Conv_dim, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x): # input tensor in shape (B, Cin, Tin)
        # print("input size at discriminator ,", x.size())
        B, Cin, Tin = x.shape
        x = x.unsqueeze(1) # (B, 1, Cin, Tin)
        # print("input size after unsqueeze ,", x.size())
        Tpad_right = self.Tmax - Tin
        padder = nn.ZeroPad2d((0,Tpad_right,0,0))
        x = padder(x) # (B, 1, Cin, Tmax)
        # print("input size after pad ,", x.size())
        x = x.squeeze() # (B, Cin, Tmax)
        # print("input size before conv ,", x.size())
        x = self.conv(x) # (B, Conv_dim, Tmax-2)
        x = self.avgPool(x) # (B, Conv_dim, 1)
        x = x.squeeze() # (B, Conv_dim)
        x = self.linear(x) # (B, 1)
        x = self.sigmoid(x)
        return x
    
class RNN_Discriminator(nn.Module):
    def __init__(self, Cin=1024): # all inputs are padded to Tmax 
        super(RNN_Discriminator, self).__init__()
        Conv_dim = 256
        self.conv = nn.Conv1d(in_channels=Cin,
                              out_channels=Conv_dim,
                              kernel_size=3,
                              stride=1)
        hidden = 256
        self.gru = nn.GRU(input_size=Conv_dim, 
                          hidden_size=hidden)
        self.linear = nn.Linear(hidden, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x): # input tensor in shape (B, Cin, Tin)
        B, Cin, Tin = x.shape
        x = self.conv(x) # (B, Conv_dim, Tin-2)
        x = x.permute(2, 0, 1) # (Tin-2, B, Conv_dim)
        x, hout = self.gru(x) # hout: (1, B, hidden)
        hout = hout.squeeze() # hout: (B, hidden)
        hout = self.linear(hout) # (B, 1)
        hout = self.sigmoid(hout) # (B, 1)
        return hout



class DownSample_Discriminator(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(DownSample_Discriminator, self).__init__()

        self.convLayer = nn.Sequential(nn.Conv2d(in_channels=in_channels,
                                                 out_channels=out_channels,
                                                 kernel_size=kernel_size,
                                                 stride=stride,
                                                 padding=padding),
                                       nn.InstanceNorm2d(num_features=out_channels,
                                                         affine=True))
        self.convLayerGates = nn.Sequential(nn.Conv2d(in_channels=in_channels,
                                                      out_channels=out_channels,
                                                      kernel_size=kernel_size,
                                                      stride=stride,
                                                      padding=padding),
                                            nn.InstanceNorm2d(num_features=out_channels,
                                                              affine=True))

    def forward(self, input):
        # GLU
        return self.convLayer(input) * torch.sigmoid(self.convLayerGates(input))
        

class RealFake_Discriminator(nn.Module):
    def __init__(self):
        super(RealFake_Discriminator, self).__init__()

        self.convLayer1 = nn.Conv2d(in_channels=1,
                                    out_channels=128,
                                    kernel_size=[3, 3],
                                    stride=[1, 1])
        self.convLayer1_gates = nn.Conv2d(in_channels=1,
                                          out_channels=128,
                                          kernel_size=[3, 3],
                                          stride=[1, 1])

        # Note: Kernel Size have been modified in the PyTorch implementation
        # compared to the actual paper, as to retain dimensionality. Unlike,
        # TensorFlow, PyTorch doesn't have padding='same', hence, kernel sizes
        # were altered to retain the dimensionality after each layer

        # DownSample Layer
        self.downSample1 = DownSample_Discriminator(in_channels=128,
                                                    out_channels=256,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)

        self.downSample2 = DownSample_Discriminator(in_channels=256,
                                                    out_channels=512,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)

        self.downSample3 = DownSample_Discriminator(in_channels=512,
                                                    out_channels=1024,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)

        self.downSample4 = DownSample_Discriminator(in_channels=1024,
                                                    out_channels=1024,
                                                    kernel_size=[1, 5],
                                                    stride=[1, 1],
                                                    padding=[0, 2])

        # Fully Connected Layer
        self.fc = nn.Linear(in_features=1024,
                            out_features=1)

        # output Layer
        self.output_layer = nn.Conv2d(in_channels=1024,
                                      out_channels=1,
                                      kernel_size=[1, 3],
                                      stride=[1, 1],
                                      padding=[0, 1])

    def forward(self, input):
        # input has shape [batch_size, num_features, time]
        # discriminator requires shape [batchSize, 1, num_features, time]
        input = input.unsqueeze(1)
        # GLU
        pad_input = nn.ZeroPad2d((1, 1, 1, 1))
        layer1 = self.convLayer1(
            pad_input(input)) * torch.sigmoid(self.convLayer1_gates(pad_input(input)))

        pad_input = nn.ZeroPad2d((1, 0, 1, 0))
        downSample1 = self.downSample1(pad_input(layer1))

        pad_input = nn.ZeroPad2d((1, 0, 1, 0))
        downSample2 = self.downSample2(pad_input(downSample1))

        pad_input = nn.ZeroPad2d((1, 0, 1, 0))
        downSample3 = self.downSample3(pad_input(downSample2))

        downSample4 = self.downSample4(downSample3)
        downSample4 = self.output_layer(downSample4)

        downSample4 = downSample4.contiguous().permute(0, 2, 3, 1).contiguous()
        # fc = torch.sigmoid(self.fc(downSample3))
        # Taking off sigmoid layer to avoid vanishing gradient problem
        #fc = self.fc(downSample4)
        fc = torch.sigmoid(downSample4)
        return fc

class AB_Discriminator_RNN(nn.Module):
    def __init__(self, Cin=512): # (B, Cin, Tin)
        super(AB_Discriminator_RNN, self).__init__()

        self.convLayer1 = nn.Conv2d(in_channels=1,
                                    out_channels=128,
                                    kernel_size=[3, 3],
                                    stride=[1, 1])
        self.convLayer1_gates = nn.Conv2d(in_channels=1,
                                          out_channels=128,
                                          kernel_size=[3, 3],
                                          stride=[1, 1])

        # Note: Kernel Size have been modified in the PyTorch implementation
        # compared to the actual paper, as to retain dimensionality. Unlike,
        # TensorFlow, PyTorch doesn't have padding='same', hence, kernel sizes
        # were altered to retain the dimensionality after each layer

        # DownSample Layer
        self.downSample1 = DownSample_Discriminator(in_channels=128,
                                                    out_channels=256,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)

        self.downSample2 = DownSample_Discriminator(in_channels=256,
                                                    out_channels=512,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)

        self.downSample3 = DownSample_Discriminator(in_channels=512,
                                                    out_channels=1024,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)

        self.downSample4 = DownSample_Discriminator(in_channels=1024,
                                                    out_channels=1024,
                                                    kernel_size=[1, 5],
                                                    stride=[1, 1],
                                                    padding=[0, 2])

        # Fully Connected Layer
        self.fc = nn.Linear(in_features=1024,
                            out_features=1)

        # output Layer
        self.output_layer = nn.Conv2d(in_channels=1024,
                                      out_channels=1,
                                      kernel_size=[1, 3],
                                      stride=[1, 1],
                                      padding=[0, 1])

    def forward(self, input):
        # input has shape [batch_size, num_features, time]
        # discriminator requires shape [batchSize, 1, num_features, time]
        input = input.unsqueeze(1)
        # GLU
        pad_input = nn.ZeroPad2d((1, 1, 1, 1))
        layer1 = self.convLayer1(
            pad_input(input)) * torch.sigmoid(self.convLayer1_gates(pad_input(input)))

        pad_input = nn.ZeroPad2d((1, 0, 1, 0))
        downSample1 = self.downSample1(pad_input(layer1))

        pad_input = nn.ZeroPad2d((1, 0, 1, 0))
        downSample2 = self.downSample2(pad_input(downSample1))

        pad_input = nn.ZeroPad2d((1, 0, 1, 0))
        downSample3 = self.downSample3(pad_input(downSample2))

        downSample4 = self.downSample4(downSample3)
        downSample4 = self.output_layer(downSample4)

        downSample4 = downSample4.contiguous().permute(0, 2, 3, 1).contiguous()
        # fc = torch.sigmoid(self.fc(downSample3))
        # Taking off sigmoid layer to avoid vanishing gradient problem
        #fc = self.fc(downSample4)
        fc = torch.sigmoid(downSample4)
        return fc

In [3]:
# Helper functions

def adjust_lr_rate(optimizer, name='generator'):
    global generator_lr, generator_lr_decay, discriminator_lr, discriminator_lr_decay
    if name == 'generator':
        generator_lr = max(
            0., generator_lr - generator_lr_decay)
        for param_groups in optimizer.param_groups:
            param_groups['lr'] = generator_lr
    else:
        discriminator_lr = max(
            0., discriminator_lr - discriminator_lr_decay)
        for param_groups in optimizer.param_groups:
            param_groups['lr'] = discriminator_lr

def reset_grad():
    encoder_noCNNRNN_optimizer.zero_grad()
    decoder_2A_noCNNRNN_optimizer.zero_grad()
    decoder_2B_noCNNRNN_optimizer.zero_grad()
    
    realfake_discriminator_B_optimizer.zero_grad()
    realfake_discriminator_A_optimizer.zero_grad()
#     rnn_discriminator_optimizer.zero_grad()

def savePickle(variable, fileName):
    with open(fileName, 'wb') as f:
        pickle.dump(variable, f)

def loadPickleFile(fileName):
    with open(fileName, 'rb') as f:
        return pickle.load(f)

def store_to_file(doc):
    doc = doc + "\n"
    with open(file_name, "a") as myfile:
        myfile.write(doc)

def saveModelCheckPoint(epoch, PATH):
    torch.save({
        'epoch': epoch,
        'generator_cycle_loss_store': generator_cycle_loss_store,
        'generator_identity_loss_store': generator_identity_loss_store,
        'generator_supervise_loss_store':generator_supervise_loss_store,
        'RF_discriminator_loss_store': RF_discriminator_loss_store,
        'embedding_loss_store': embedding_loss_store,
        
        'encoder_noCNNRNN_state_dict': encoder_noCNNRNN.state_dict(),
        'decoder_2A_noCNNRNN_state_dict': decoder_2A_noCNNRNN.state_dict(),
        'decoder_2B_noCNNRNN_state_dict': decoder_2B_noCNNRNN.state_dict(),
#         'rnn_discriminator_state_dict': rnn_discriminator.state_dict(),
        'realfake_discriminator_A_state_dict': realfake_discriminator_A.state_dict(),
        'realfake_discriminator_B_state_dict': realfake_discriminator_B.state_dict(),
        
        'encoder_noCNNRNN_optimizer_state_dict': encoder_noCNNRNN_optimizer.state_dict(),
        'decoder_2A_noCNNRNN_optimizer_state_dict': decoder_2A_noCNNRNN_optimizer.state_dict(),
        'decoder_2B_noCNNRNN_optimizer_state_dict': decoder_2B_noCNNRNN_optimizer.state_dict(),
        'realfake_discriminator_A_optimizer_state_dict': realfake_discriminator_A_optimizer.state_dict(),
        'realfake_discriminator_B_optimizer_state_dict': realfake_discriminator_B_optimizer.state_dict(),
#         'rnn_discriminator_optimizer_state_dict': rnn_discriminator_optimizer.state_dict(),
        
        
    }, PATH)

def loadModel(PATH):
    checkPoint = torch.load(PATH)
    epoch = int(checkPoint['epoch']) + 1
    generator_cycle_loss_store = checkPoint['generator_cycle_loss_store']
    generator_identity_loss_store = checkPoint['generator_identity_loss_store']
    #generator_supervise_loss_store = checkPoint['generator_supervise_loss_store']
    RF_discriminator_loss_store = checkPoint['RF_discriminator_loss_store']
    embedding_loss_store = checkPoint['embedding_loss_store']
    
    encoder_noCNNRNN.load_state_dict(
        state_dict=checkPoint['encoder_noCNNRNN_state_dict'])
    decoder_2A_noCNNRNN.load_state_dict(
        state_dict=checkPoint['decoder_2A_noCNNRNN_state_dict'])
    decoder_2B_noCNNRNN.load_state_dict(
        state_dict=checkPoint['decoder_2B_noCNNRNN_state_dict'])
#     rnn_discriminator.load_state_dict(
#         state_dict=checkPoint['rnn_discriminator_state_dict'])
    realfake_discriminator_A.load_state_dict(
        state_dict=checkPoint['realfake_discriminator_A_state_dict'])
    realfake_discriminator_B.load_state_dict(
        state_dict=checkPoint['realfake_discriminator_B_state_dict'])
    
    encoder_noCNNRNN_optimizer.load_state_dict(
        state_dict=checkPoint['encoder_noCNNRNN_optimizer_state_dict'])
    decoder_2A_noCNNRNN_optimizer.load_state_dict(
        state_dict=checkPoint['decoder_2A_noCNNRNN_optimizer_state_dict'])
    decoder_2B_noCNNRNN_optimizer.load_state_dict(
        state_dict=checkPoint['decoder_2B_noCNNRNN_optimizer_state_dict'])
    realfake_discriminator_A_optimizer.load_state_dict(
        state_dict=checkPoint['realfake_discriminator_A_optimizer_state_dict'])
    realfake_discriminator_B_optimizer.load_state_dict(
        state_dict=checkPoint['realfake_discriminator_B_optimizer_state_dict'])
#     rnn_discriminator_optimizer.load_state_dict(
#         state_dict=checkPoint['rnn_discriminator_optimizer_state_dict'])
    
    return epoch

In [4]:
# set up
logf0s_normalization = "./cache/logf0s_normalization.npz"
mcep_normalization = "./cache/mcep_normalization.npz"
coded_sps_A_norm = "./cache/coded_sps_A_norm.pickle"
coded_sps_B_norm = "./cache/coded_sps_B_norm.pickle" 
resume_training_at = "./cache/model_checkpoint/_CycleGAN_CheckPoint"
validation_A_dir = "./data/evaluation_all/SF1/" 
output_A_dir = "./data/vcc2016_training/converted_sound/SF1"
validation_B_dir = "./data/evaluation_all/TF2/" 
output_B_dir = "./data/vcc2016_training/converted_sound/TF2/"
# =================================================
mini_batch_size = 1
dataset_A = loadPickleFile(coded_sps_A_norm)
dataset_B = loadPickleFile(coded_sps_B_norm)
device = torch.device(
    'cuda' if torch.cuda.is_available() else 'cpu')

# Speech Parameters
logf0s_normalization = np.load(logf0s_normalization)
log_f0s_mean_A = logf0s_normalization['mean_A']
log_f0s_std_A = logf0s_normalization['std_A']
log_f0s_mean_B = logf0s_normalization['mean_B']
log_f0s_std_B = logf0s_normalization['std_B']

mcep_normalization = np.load(mcep_normalization)
coded_sps_A_mean = mcep_normalization['mean_A']
coded_sps_A_std = mcep_normalization['std_A']
coded_sps_B_mean = mcep_normalization['mean_B']
coded_sps_B_std = mcep_normalization['std_B']

# Encoder and Decoder
encoder_noCNNRNN = Encoder().to(device)
decoder_2A_noCNNRNN = Decoder().to(device)
decoder_2B_noCNNRNN = Decoder().to(device)

# Discriminator
#rnn_discriminator = AB_Discriminator_RNN(Cin=512).to(device)
realfake_discriminator_A = RealFake_Discriminator().to(device)
realfake_discriminator_B = RealFake_Discriminator().to(device)

# Loss Functions
criterion_mse = torch.nn.MSELoss()

# Initial learning rates
generator_lr = 0.0002
discriminator_lr = 0.0001

# Learning rate decay
generator_lr_decay = generator_lr / 200000
discriminator_lr_decay = discriminator_lr / 200000

# Optimizers
def get_optimizer(one_module, learning_rate):
    return torch.optim.Adam(list(one_module.parameters()), lr=learning_rate, betas=(0.5, 0.999))

encoder_noCNNRNN_optimizer = get_optimizer(encoder_noCNNRNN, generator_lr)

decoder_2A_noCNNRNN_optimizer = get_optimizer(decoder_2A_noCNNRNN, generator_lr)
decoder_2B_noCNNRNN_optimizer = get_optimizer(decoder_2B_noCNNRNN, generator_lr)

generators_optimizers = [
    encoder_noCNNRNN_optimizer,
    decoder_2A_noCNNRNN_optimizer,
    decoder_2B_noCNNRNN_optimizer
]
realfake_discriminator_A_optimizer = get_optimizer(realfake_discriminator_A, discriminator_lr)
realfake_discriminator_B_optimizer = get_optimizer(realfake_discriminator_B, discriminator_lr)
# rnn_discriminator_optimizer = get_optimizer(rnn_discriminator, discriminator_lr)

# Storing Discriminatior and Generator Loss
generator_cycle_loss_store = []
generator_identity_loss_store = []
generator_supervise_loss_store = []
RF_discriminator_loss_store = []
embedding_loss_store = []

start_epoch = 0

# To Load save previously saved models
# load_path = "/home/frank/Documents/experiments/model3_supervise/0/checkpoint"
# start_epoch = loadModel(load_path)

In [5]:
def generators_reset_grad():
    for opt in generators_optimizers:
        opt.zero_grad()

def generators_update():
    for opt in generators_optimizers:
        opt.step()

def encoders_forward(voice_noCNNRNN):
    embedding_noCNNRNN = encoder_noCNNRNN(voice_noCNNRNN)
    return embedding_noCNNRNN

def decoders_forward(embedding_noCNNRNN, isToA=True):
    decoder_noCNNRNN = decoder_2A_noCNNRNN if isToA else decoder_2B_noCNNRNN
    fake_noCNNRNN = decoder_noCNNRNN(embedding_noCNNRNN, encoder_noCNNRNN.downsample2_forshape.shape)
    return fake_noCNNRNN

def AB_discriminator_zero_grad():
    rnn_discriminator_optimizer.zero_grad()

def AB_discriminator_update():
    rnn_discriminator_optimizer.step()

def AB_discriminator_forward_loss(emb_noCNNRNN, isA=True, gradients_for_discriminator=True):    
    true_pred = 0.0 if isA else 1.0
    target_rnn_00 = true_pred if gradients_for_discriminator else 0.5
    
    pred_noCNNRNN_rnn_discriminator = rnn_discriminator(emb_noCNNRNN)
    
    d_loss = torch.mean((target_rnn_00 - pred_noCNNRNN_rnn_discriminator) ** 2)
    return d_loss

def realfake_discriminator_zero_grad(isA=True):
    if isA:
        realfake_discriminator_A_optimizer.zero_grad()
    else:
        realfake_discriminator_B_optimizer.zero_grad()

def realfake_discriminator_update(isA=True):
    if isA:
        realfake_discriminator_A_optimizer.step()
    else:
        realfake_discriminator_B_optimizer.step()
    
def realfake_discriminator_forward_loss(fake_noCNNRNN, real, isA=True, gradients_for_discriminator=True):
    realfake_discriminator_zero_grad(isA=isA)
    realfake_discriminator = realfake_discriminator_A if isA else realfake_discriminator_B
    
    pred_noCNNRNN = realfake_discriminator(fake_noCNNRNN)
    d_loss = torch.mean((0.0 - pred_noCNNRNN) ** 2)
    
    # if we are running this func for generator gradients, skip real input
    # because gradient doesn't flow to generators
    if gradients_for_discriminator: 
        pred_real = realfake_discriminator(real)
        d_loss_real = torch.mean((1.0 - pred_real) ** 2)
        d_loss = (d_loss + d_loss_real) / 2.0
    
    return d_loss

def compute_identity_loss(fake, real):
    B1, C1, T1 = fake.shape
    B2, C2, T2 = real.shape
    T = min(T1, T2)
    fake_common = fake[:,:,:T]
    real_common = real[:,:,:T]
    identityLoss_noCNNRNN = torch.mean(torch.abs(fake_common - real_common))
    return identityLoss_noCNNRNN

def compute_embedding_identity_loss(emb1, emb2):
    B1, C1, T1 = emb1.shape
    B2, C2, T2 = emb2.shape
    T = min(T1, T2)
    emb1_common = emb1[:,:,:T]
    emb2_common = emb2[:,:,:T]
    emb1_common = emb1_common / torch.norm(emb1_common, p=2)
    emb2_common = emb2_common / torch.norm(emb2_common, p=2)
    emb_loss = -torch.sum(emb1_common * emb2_common) # negative cosine similarity as the embedding loss
    return emb_loss 

def compute_cycle_loss(fake, real):
    B1, C1, T1 = fake.shape
    B2, C2, T2 = real.shape
    T = min(T1, T2)
    fake_common = fake[:,:,:T]
    real_common = real[:,:,:T]
    cycleLoss_noCNNRNN = torch.mean(torch.abs(fake_common - real_common))
    return cycleLoss_noCNNRNN

def generate(voice, isToA=True):
    embedding = encoder_noCNNRNN(voice)    
    decoder = decoder_2A_noCNNRNN if isToA else decoder_2B_noCNNRNN
    fake = decoder(embedding, encoder_noCNNRNN.downsample2_forshape.shape)
    return fake
    

def validation_for_A_dir(save_dir):
    num_mcep = 24
    sampling_rate = 16000
    frame_period = 5.0
    n_frames = 128

    print("Generating Validation Data B from A...")
    for file in os.listdir(validation_A_dir):
        filePath = os.path.join(validation_A_dir, file)
        wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
        wav = preprocess.wav_padding(wav=wav,
                                     sr=sampling_rate,
                                     frame_period=frame_period,
                                     multiple=4)
        f0, timeaxis, sp, ap = preprocess.world_decompose(
            wav=wav, fs=sampling_rate, frame_period=frame_period)
        f0_converted = preprocess.pitch_conversion(f0=f0,
                                                   mean_log_src=log_f0s_mean_A,
                                                   std_log_src=log_f0s_std_A,
                                                   mean_log_target=log_f0s_mean_B,
                                                   std_log_target=log_f0s_std_B)
        coded_sp = preprocess.world_encode_spectral_envelop(
            sp=sp, fs=sampling_rate, dim=num_mcep)
        coded_sp_transposed = coded_sp.T
        coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
        coded_sp_norm = np.array([coded_sp_norm])

        if torch.cuda.is_available():
            coded_sp_norm = torch.from_numpy(coded_sp_norm).cuda().float()
        else:
            coded_sp_norm = torch.from_numpy(coded_sp_norm).float()

        coded_sp_converted_norm = generate(coded_sp_norm, isToA=False)
        coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach().numpy()
        coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
        coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = preprocess.world_decode_spectral_envelop(
            coded_sp=coded_sp_converted, fs=sampling_rate)
        wav_transformed = preprocess.world_speech_synthesis(f0=f0_converted,
                                                            decoded_sp=decoded_sp_converted,
                                                            ap=ap,
                                                            fs=sampling_rate,
                                                            frame_period=frame_period)
        librosa.output.write_wav(path=os.path.join(save_dir, os.path.basename(file)),
                                 y=wav_transformed,
                                 sr=sampling_rate)    

def val_file_to_sp(filepath, isA=True):
    num_mcep = 24
    sampling_rate = 16000
    frame_period = 5.0
    n_frames = 128
    
    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
    wav = preprocess.wav_padding(wav=wav,
                                 sr=sampling_rate,
                                 frame_period=frame_period,
                                 multiple=4)
    f0, timeaxis, sp, ap = preprocess.world_decompose(
        wav=wav, fs=sampling_rate, frame_period=frame_period)
    
    coded_sp = preprocess.world_encode_spectral_envelop(
        sp=sp, fs=sampling_rate, dim=num_mcep)
    coded_sp_transposed = coded_sp.T
    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std if isA else \
                    (coded_sp_transposed - coded_sps_B_mean) / coded_sps_B_std
    coded_sp_norm = np.array([coded_sp_norm])
    coded_sp_norm = torch.from_numpy(coded_sp_norm).float()
    
    return coded_sp_norm, f0, ap

def val_convert_f0(f0, isAtoB=True):
    if isAtoB:
        return preprocess.pitch_conversion(f0=f0,
                                           mean_log_src=log_f0s_mean_A,
                                           std_log_src=log_f0s_std_A,
                                           mean_log_target=log_f0s_mean_B,
                                           std_log_target=log_f0s_std_B)
    else:
        return preprocess.pitch_conversion(f0=f0,
                                           mean_log_src=log_f0s_mean_B,
                                           std_log_src=log_f0s_std_B,
                                           mean_log_target=log_f0s_mean_A,
                                           std_log_target=log_f0s_std_A)

def val_sp_to_file(output_file, coded_sp_converted_norm, f0_converted, ap, isA=True):
    num_mcep = 24
    sampling_rate = 16000
    frame_period = 5.0
    n_frames = 128
    
    coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean if isA else \
                         coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
    coded_sp_converted = coded_sp_converted.T
    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
    decoded_sp_converted = preprocess.world_decode_spectral_envelop(
        coded_sp=coded_sp_converted, fs=sampling_rate)
    wav_transformed = preprocess.world_speech_synthesis(f0=f0_converted,
                                                        decoded_sp=decoded_sp_converted,
                                                        ap=ap,
                                                        fs=sampling_rate,
                                                        frame_period=frame_period)
    librosa.output.write_wav(path=output_file,
                             y=wav_transformed,
                             sr=sampling_rate)

def validation_for_B_dir(save_dir):

    print("Generating Validation Data A from B...")
    for file in os.listdir(validation_B_dir):
        filePath = os.path.join(validation_B_dir, file)
        coded_sp_norm, f0, ap = val_file_to_sp(filepath, isA=False)
        
        coded_sp_converted_norm = generate(coded_sp_norm.cuda(), isToA=True).cpu().detach().numpy()
        f0 = val_convert_f0(f0, isAtoB=False)

        output_file = os.path.join(save_dir, os.path.basename(file))
        val_sp_to_file(output_file, coded_sp_converted_norm, f0, ap, isA=True)

In [6]:
MODEL_NAME = "model3_supervise_converge"
MODEL_DIR = "./experiments/" + MODEL_NAME
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
file_name = MODEL_DIR + '/log.txt'

start_decay = 10
num_epochs = 5000

In [None]:
for epoch in range(start_epoch, num_epochs):
    start_time_epoch = time.time()

    # Constants
    cycle_loss_lambda = 10
    identity_loss_lambda = 5
    #if epoch>20:#
        #cycle_loss_lambda = 15#
        #identity_loss_lambda = 0#

    # Preparing Dataset
    n_samples = len(dataset_A)

    dataset = trainingDataset_Paired(datasetA=dataset_A,
                              datasetB=dataset_B, max_frames=400)
    train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                               batch_size=mini_batch_size,
                                               shuffle=True,
                                               drop_last=False)

    for i, (real_A, real_B) in enumerate(train_loader):

        num_iterations = (n_samples // mini_batch_size) * epoch + i
        # print("iteration no: ", num_iterations, epoch)

        if num_iterations > 10000:
            identity_loss_lambda = 0
        if num_iterations > start_decay:
            for generator_optimizer in generators_optimizers:
                adjust_lr_rate(
                    generator_optimizer, name='generator')
                adjust_lr_rate(
                    generator_optimizer, name='discriminator')
        
        real_A = real_A.to(device).float()
        real_B = real_B.to(device).float()
        
        for (real1, real2, real1_is_A) in [(real_A, real_B, True), (real_B, real_A, False)]:
            real2_is_A = not real1_is_A
            
            reset_grad();
            
            # ----------------------------------------------------------------
            # First compute embeddings for A and B
            # real_A and real_B are saying the same sentence
            # compute embedding identity loss
            embedding2 = encoders_forward(real2) # must execute 2 before 1 because we want decoder shape to be right
            embedding1 = encoders_forward(real1)
            embedding_identity_loss = compute_embedding_identity_loss(embedding1, embedding2)
            
            # ----------------------------------------------------------------
            # full forward pass and compute loss for improving generators only
            # then backward and update generators
            
#             # adversarial A/B loss
#             adversarial_AB_loss = AB_discriminator_forward_loss(embeddings1, 
#                                                                 isA=real1_is_A, 
#                                                                 gradients_for_discriminator=False)
            
            # identity loss
            fake11 = decoders_forward(embedding1, isToA=real1_is_A)
            identity_loss = compute_identity_loss(fake11, real1)
            
            # adversarial real/fake loss
            fake12 = decoders_forward(embedding1, isToA=real2_is_A)
            adversarial_RealFake_loss = realfake_discriminator_forward_loss(fake12, real2, isA=real2_is_A,
                                                                            gradients_for_discriminator=False)
            
            # supervise loss
            supervise_loss = compute_cycle_loss(fake12, real2)
            
            # cycle consistency loss
            embedding12 = encoders_forward(fake12)
            fake121 = decoders_forward(embedding12, isToA=real1_is_A)
            cycle_loss = compute_cycle_loss(fake121, real1)
            
            # compute total generator loss
            total_generator_loss = embedding_identity_loss \
                                 + 0.1 * identity_loss \
                                 + supervise_loss \
                                 + adversarial_RealFake_loss \
                                 + 0.1 * cycle_loss
            generator_cycle_loss_store.append(cycle_loss.item())
            generator_identity_loss_store.append(identity_loss.item())
            generator_supervise_loss_store.append(supervise_loss.item())
            embedding_loss_store.append(embedding_identity_loss.item())
            
            # backward and update generators only
            total_generator_loss.backward()
            generators_update()
            
            # --------------------------------------------
            # detach embeddings and fakes12 tensors from autograd
            # then forward and backward the discriminators only on detached embeddings and fakes12 tensors
            # gradients won't flow back to encoders or decoders
            
#             AB_discriminator_zero_grad()
# #             embeddings1 = (embeddings1[0].detach(), embeddings1[1].detach(), embeddings1[2].detach())
#             embeddings1 = (None, None, embeddings1[2].detach())
#             d_AB_loss = AB_discriminator_forward_loss(embeddings1, 
#                                                       isA=real1_is_A, 
#                                                       gradients_for_discriminator=True)
#             d_AB_loss.backward()
#             AB_discriminator_update()
            
            realfake_discriminator_zero_grad(isA=real2_is_A)
            fake12 = fake12.detach()
            d_RealFake_loss = realfake_discriminator_forward_loss(fake12, real2, isA=real2_is_A,
                                                                  gradients_for_discriminator=True)
            if d_RealFake_loss.item() > 0.1: # only update RF discriminator if generator is good enough
                d_RealFake_loss.backward()
                realfake_discriminator_update(isA=real2_is_A)

            RF_discriminator_loss_store.append(d_RealFake_loss.item())
    
    # end of epoch
    if epoch % 10 == 0:
        save_dir = "/home/frank/Documents/experiments/"+MODEL_NAME+"/"+str(epoch)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        validation_for_A_dir(save_dir)
        store_to_file_str = \
            "Epoch: {}\t".format(epoch) + \
            "Loss/ Cycle: {:.4f}\t".format(generator_cycle_loss_store[-1]) + \
            "Identity: {:.4f}\t".format(generator_identity_loss_store[-1]) + \
            "Supervise: {:.4f}\t".format(generator_supervise_loss_store[-1]) + \
            "Emb_Identity: {:.4f}\t".format(embedding_loss_store[-1]) + \
            "RF_d: {:.4f}".format(RF_discriminator_loss_store[-1])
        print(store_to_file_str)
        store_to_file(store_to_file_str)

        # Save the Entire model
        saveModelCheckPoint(epoch, save_dir+"/checkpoint")

        

Generating Validation Data B from A...


  std_log_src * std_log_target + mean_log_target)


Epoch: 0	Loss/ Cycle: 0.7600	Identity: 0.7625	Supervise: 0.7668	Emb_Identity: -0.9708	RF_d: 0.0486
Generating Validation Data B from A...
Epoch: 10	Loss/ Cycle: 0.8069	Identity: 0.8126	Supervise: 0.8027	Emb_Identity: -0.9994	RF_d: 0.0339
Generating Validation Data B from A...
Epoch: 20	Loss/ Cycle: 0.7620	Identity: 0.8374	Supervise: 0.8225	Emb_Identity: -0.9996	RF_d: 0.0290
Generating Validation Data B from A...
Epoch: 30	Loss/ Cycle: 0.7959	Identity: 0.7962	Supervise: 0.7830	Emb_Identity: -0.9996	RF_d: 0.0246
Generating Validation Data B from A...
Epoch: 40	Loss/ Cycle: 0.8341	Identity: 0.7795	Supervise: 0.7707	Emb_Identity: -0.9957	RF_d: 0.0238
Generating Validation Data B from A...
Epoch: 50	Loss/ Cycle: 0.7290	Identity: 0.7840	Supervise: 0.7879	Emb_Identity: -0.9994	RF_d: 0.0262
Generating Validation Data B from A...
Epoch: 60	Loss/ Cycle: 0.6778	Identity: 0.8151	Supervise: 0.8619	Emb_Identity: -0.9995	RF_d: 0.0193
Generating Validation Data B from A...
Epoch: 70	Loss/ Cycle: 0.629

In [None]:
save_dir = "/home/frank/Documents/experiments/"+MODEL_NAME+"/150test/B2A"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
validation_for_B_dir(save_dir)

In [None]:
save_dir = "/home/frank/Documents/experiments/"+MODEL_NAME+"/doubletest"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
file = "200007.wav"

filepath = os.path.join(validation_A_dir, file)
real_A, f0_A, ap_A = val_file_to_sp(filepath, isA=True)
f0_AB = val_convert_f0(f0_A, isAtoB=True)

filepath = os.path.join(validation_B_dir, file)
real_B, f0_B, ap_B = val_file_to_sp(filepath, isA=False)
f0_BA = val_convert_f0(f0_B, isAtoB=False)

#print(real_A - real_B)
# Now we have real_A, real_B, f0_A, f0_B, f0_AB, f0_BA

emb_A = encoders_forward(real_A.cuda())
fakeAA = decoders_forward(emb_A, isToA=True)
fakeAB = decoders_forward(emb_A, isToA=False)
emb_B = encoders_forward(real_B.cuda())
fakeBB = decoders_forward(emb_B, isToA=False)
fakeBA = decoders_forward(emb_B, isToA=True)
output_file = os.path.join(save_dir, "07_AA.wav")
sps = fakeAA.cpu().detach().numpy()
val_sp_to_file(output_file, sps, f0_A, ap_A, isA=True)
val_sp_to_file(os.path.join(save_dir, "07_AB.wav"),  fakeAB.cpu().detach().numpy(), f0_AB, ap_A, isA=False)
val_sp_to_file(os.path.join(save_dir, "07_BB.wav"),  fakeBB.cpu().detach().numpy(), f0_B, ap_B, isA=False)
val_sp_to_file(os.path.join(save_dir, "07_BA.wav"),  fakeBA.cpu().detach().numpy(), f0_BA, ap_B, isA=True)

val_sp_to_file(os.path.join(save_dir, "07_realA.wav"),  real_A.numpy(), f0_A, ap_A, isA=True)
val_sp_to_file(os.path.join(save_dir, "07_realB.wav"),  real_B.numpy(), f0_B, ap_B, isA=False)

In [None]:
validation_A_dir