In [None]:
import os
import numpy as np
import argparse
import torch
import time
import librosa
import pickle

import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
import pdb

import preprocess
from trainingDataset import trainingDataset
from model_VC2 import Generator, Discriminator
import torch.utils.tensorboard

In [None]:
# Show the installed PyTorch version in the cell output (bare last expression).
torch.__version__

In [None]:
# Models
class GLU(nn.Module):
    """Self-gated activation that keeps the tensor shape unchanged.

    Custom implementation: the Voice Conversion CycleGAN paper assumes the
    gated unit does not halve a dimension of the tensor, so the input is
    simply multiplied element-wise by its own sigmoid.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input * sigmoid(input)`` (same shape as ``input``)."""
        gate = torch.sigmoid(input)
        return input * gate


class up_2Dsample(nn.Module):
    """Enlarge dimensions 2 and 3 of a tensor by a fixed integer factor.

    The resize is delegated to ``F.interpolate`` using its default mode; only
    the explicit target size is supplied.
    """

    def __init__(self, upscale_factor=2):
        super().__init__()
        self.scale_factor = upscale_factor

    def forward(self, input):
        """Return ``input`` resized to ``(H * factor, W * factor)``."""
        height, width = input.shape[2], input.shape[3]
        target_size = [extent * self.scale_factor for extent in (height, width)]
        return F.interpolate(input, target_size)
       

class PixelShuffle(nn.Module):
    """Reinterpret a 3-D tensor ``(N, C, W)`` as ``(N, C // r, W * r)``.

    Custom implementation because PyTorch's own PixelShuffle requires 4-D
    input, whereas the tensors here are 3-D. The rearrangement is a plain
    ``view`` of the existing memory.
    """

    def __init__(self, upscale_factor=2):
        super().__init__()
        self.upscale_factor = upscale_factor

    def forward(self, input):
        """Return ``input`` viewed with fewer channels and a longer last axis."""
        factor = self.upscale_factor
        batch, channels, width = input.shape[0], input.shape[1], input.shape[2]
        return input.view(batch, channels // factor, width * factor)


class ResidualLayer(nn.Module):
    """1-D residual block with a gated linear unit in the middle.

    Two parallel ``Conv1d + InstanceNorm1d`` branches map ``in_channels`` to
    ``out_channels``; one branch is gated by the sigmoid of the other. A third
    ``Conv1d + InstanceNorm1d`` maps the gated result back to ``in_channels``
    and the block input is added to it.

    Args:
        in_channels: channels of the block input (and of its output).
        out_channels: channels used inside the gated unit.
        kernel_size: kernel size shared by all three convolutions.
        stride: accepted for signature compatibility but not used; every
            convolution runs with stride 1.
        padding: padding shared by all three convolutions.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()

        def conv_norm(source_channels, target_channels):
            # Stride is fixed to 1 so the residual addition keeps matching lengths.
            return nn.Sequential(
                nn.Conv1d(in_channels=source_channels,
                          out_channels=target_channels,
                          kernel_size=kernel_size,
                          stride=1,
                          padding=padding),
                nn.InstanceNorm1d(num_features=target_channels, affine=True))

        # Creation order is kept (values, gates, output projection) so that
        # parameter initialisation consumes the RNG in the same sequence.
        self.conv1d_layer = conv_norm(in_channels, out_channels)
        self.conv_layer_gates = conv_norm(in_channels, out_channels)
        self.conv1d_out_layer = conv_norm(out_channels, in_channels)

    def forward(self, input):
        """Return ``input + out_proj(values(input) * sigmoid(gates(input)))``."""
        values = self.conv1d_layer(input)
        gates = self.conv_layer_gates(input)

        # Gated linear unit.
        gated = values * torch.sigmoid(gates)

        projected = self.conv1d_out_layer(gated)
        return input + projected


class downSample_Generator(nn.Module):
    """Gated 2-D down-sampling block used by the encoder.

    Two parallel ``Conv2d + InstanceNorm2d`` branches are applied to the same
    input; the first is multiplied element-wise by the sigmoid of the second
    (a gated linear unit).

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels produced by each branch.
        kernel_size: convolution kernel size for both branches.
        stride: convolution stride for both branches.
        padding: convolution padding for both branches.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(downSample_Generator, self).__init__()

        self.convLayer = nn.Sequential(nn.Conv2d(in_channels=in_channels,
                                                 out_channels=out_channels,
                                                 kernel_size=kernel_size,
                                                 stride=stride,
                                                 padding=padding),
                                       nn.InstanceNorm2d(num_features=out_channels,
                                                         affine=True))
        self.convLayer_gates = nn.Sequential(nn.Conv2d(in_channels=in_channels,
                                                       out_channels=out_channels,
                                                       kernel_size=kernel_size,
                                                       stride=stride,
                                                       padding=padding),
                                             nn.InstanceNorm2d(num_features=out_channels,
                                                               affine=True))

    def forward(self, input):
        """Return ``convLayer(input) * sigmoid(convLayer_gates(input))``.

        Each branch is evaluated exactly once. The previous version evaluated
        both branches, discarded the results and evaluated them again, which
        doubled the compute and the activations kept for backpropagation.
        """
        values = self.convLayer(input)
        gates = self.convLayer_gates(input)
        return values * torch.sigmoid(gates)


class upSample_Generator(nn.Module):
    """Gated 2-D up-sampling block used by the decoder.

    Each of the two parallel branches is ``Conv2d -> up_2Dsample(x2) ->
    InstanceNorm2d``. The output is the first branch multiplied element-wise
    by the sigmoid of the second branch.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels produced by each branch.
        kernel_size: convolution kernel size for both branches.
        stride: convolution stride for both branches.
        padding: convolution padding for both branches.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()

        def make_branch():
            # Conv, then spatial x2 resize, then affine instance norm.
            return nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=kernel_size,
                          stride=stride,
                          padding=padding),
                up_2Dsample(upscale_factor=2),
                nn.InstanceNorm2d(num_features=out_channels, affine=True))

        # Value branch first, gate branch second (same order as before).
        self.convLayer = make_branch()
        self.convLayer_gates = make_branch()

    def forward(self, input):
        """Return ``convLayer(input) * sigmoid(convLayer_gates(input))``."""
        values = self.convLayer(input)
        gates = torch.sigmoid(self.convLayer_gates(input))
        return values * gates


class Encoder(nn.Module):
    """Encode a 3-D input into a 512-channel 1-D embedding.

    Pipeline: gated 2-D convolution (1 -> 128 channels), two stride-2 gated
    down-sampling blocks (128 -> 256 -> 512), a reshape that merges the channel
    and height axes, a 1x1 ``Conv1d`` (3072 -> 512) and three residual blocks.

    ``conv2`` expects 3072 = 512 * 6 merged channels, so the height axis must
    be 6 after the two down-sampling stages (presumably 24 input features -
    TODO confirm against the preprocessing).

    Side effect: ``forward`` stores the 4-D output of ``downSample2`` on
    ``self.downsample2_forshape``; other code in this notebook reads its
    ``.shape`` to undo the reshape in the decoder.
    """

    def __init__(self):
        super().__init__()

        # Input gated convolution: value branch and gate branch share settings.
        stem_settings = dict(in_channels=1,
                             out_channels=128,
                             kernel_size=[5, 15],
                             stride=1,
                             padding=[2, 7])
        self.conv1 = nn.Conv2d(**stem_settings)
        self.conv1_gates = nn.Conv2d(**stem_settings)

        # Two stride-2 down-sampling stages.
        self.downSample1 = downSample_Generator(in_channels=128,
                                                out_channels=256,
                                                kernel_size=5,
                                                stride=2,
                                                padding=2)
        self.downSample2 = downSample_Generator(in_channels=256,
                                                out_channels=512,
                                                kernel_size=5,
                                                stride=2,
                                                padding=2)

        # 1x1 convolution applied after channels and height are merged.
        self.conv2 = nn.Conv1d(in_channels=3072,
                               out_channels=512,
                               kernel_size=1,
                               stride=1)

        # Three identical residual blocks.
        residual_settings = dict(in_channels=512,
                                 out_channels=1024,
                                 kernel_size=3,
                                 stride=1,
                                 padding=1)
        self.residualLayer1 = ResidualLayer(**residual_settings)
        self.residualLayer2 = ResidualLayer(**residual_settings)
        self.residualLayer3 = ResidualLayer(**residual_settings)

    def forward(self, input):
        """Return the embedding produced by the third residual block.

        ``input`` gets a singleton channel axis inserted at position 1 before
        the first convolution.
        """
        features = input.unsqueeze(1)

        # Gated linear unit on the raw input.
        gated = self.conv1(features) * torch.sigmoid(self.conv1_gates(features))

        # Keep the 4-D tensor around so its shape can be read back later.
        self.downsample2_forshape = self.downSample2(self.downSample1(gated))

        batch = self.downsample2_forshape.shape[0]
        frames = self.downsample2_forshape.shape[3]
        merged = self.downsample2_forshape.view([batch, -1, frames])
        hidden = self.conv2(merged)

        for block in (self.residualLayer1, self.residualLayer2, self.residualLayer3):
            hidden = block(hidden)
        return hidden

class Decoder(nn.Module):
    """Decode a 512-channel 1-D embedding back to a 3-D output.

    Pipeline: three residual blocks, a 1x1 ``Conv1d`` (512 -> 3072), a reshape
    to the 4-D shape passed in by the caller, two gated up-sampling blocks
    (512 -> 1024 -> 512), a final ``Conv2d`` to a single channel and a reshape
    that merges the channel and height axes of that result.
    """

    def __init__(self):
        super().__init__()

        # Three identical residual blocks.
        residual_settings = dict(in_channels=512,
                                 out_channels=1024,
                                 kernel_size=3,
                                 stride=1,
                                 padding=1)
        self.residualLayer4 = ResidualLayer(**residual_settings)
        self.residualLayer5 = ResidualLayer(**residual_settings)
        self.residualLayer6 = ResidualLayer(**residual_settings)

        # 1x1 convolution applied before the tensor is reshaped to 4-D.
        self.conv3 = nn.Conv1d(in_channels=512,
                               out_channels=3072,
                               kernel_size=1,
                               stride=1)

        # Two gated up-sampling stages.
        self.upSample1 = upSample_Generator(in_channels=512,
                                            out_channels=1024,
                                            kernel_size=5,
                                            stride=1,
                                            padding=2)
        self.upSample2 = upSample_Generator(in_channels=1024,
                                            out_channels=512,
                                            kernel_size=5,
                                            stride=1,
                                            padding=2)

        self.lastConvLayer = nn.Conv2d(in_channels=512,
                                       out_channels=1,
                                       kernel_size=[5, 15],
                                       stride=1,
                                       padding=[2, 7])

    def forward(self, input, shapes):
        """Return the decoded tensor.

        Args:
            input: embedding fed to the first residual block.
            shapes: indexable with at least four entries; the output of
                ``conv3`` is viewed as ``[shapes[0], ..., shapes[3]]``. Callers
                in this notebook pass ``encoder.downsample2_forshape.shape``.
        """
        hidden = input
        for block in (self.residualLayer4, self.residualLayer5, self.residualLayer6):
            hidden = block(hidden)

        expanded = self.conv3(hidden)
        feature_map = expanded.view([shapes[0], shapes[1], shapes[2], shapes[3]])

        upsampled = self.upSample2(self.upSample1(feature_map))
        output = self.lastConvLayer(upsampled)
        # Merge the single output channel with the height axis.
        return output.view([output.shape[0], -1, output.shape[3]])


class CNN_Discriminator(nn.Module):
    """Convolutional discriminator over a ``(B, Cin, Tin)`` embedding.

    The time axis is zero-padded on the right to ``Tmax``, passed through
    three ``Conv1d`` layers, average-pooled over the whole remaining time
    axis, and mapped to one sigmoid score per example.

    Args:
        Cin: number of input channels.
        Tmax: length every input is padded to; also fixes the pooling window
            (``Tmax - 2``, the length left after the first unpadded
            kernel-3 convolution).
    """

    def __init__(self, Cin=1024, Tmax=256): # all inputs are padded to Tmax
        super(CNN_Discriminator, self).__init__()
        self.Tmax = Tmax
        Conv_dim = 256
        self.conv = nn.Conv1d(in_channels=Cin,
                              out_channels=Conv_dim,
                              kernel_size=3,
                              stride=1)
        self.conv_2 = nn.Conv1d(in_channels=Conv_dim,
                            out_channels=Conv_dim,
                            kernel_size=3,
                            stride=1,
                            padding=1)
        self.conv_3 = nn.Conv1d(in_channels=Conv_dim,
                            out_channels=Conv_dim,
                            kernel_size=3,
                            stride=1,
                            padding=1)
        self.avgPool = nn.AvgPool1d(kernel_size=Tmax-2)
        self.linear = nn.Linear(Conv_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x): # input tensor in shape (B, Cin, Tin)
        """Return sigmoid scores of shape ``(B, 1)``.

        The batch axis is always preserved. The previous version used bare
        ``squeeze()`` calls, which also removed the batch axis when ``B == 1``
        and produced a ``(1,)`` output instead of ``(1, 1)``.
        """
        _, _, Tin = x.shape  # also rejects inputs that are not 3-D
        # Right-pad the time axis with zeros up to Tmax: (B, Cin, Tmax).
        x = F.pad(x, (0, self.Tmax - Tin))
        x = self.conv(x) # (B, Conv_dim, Tmax-2)
        x = self.conv_2(x)
        x = self.conv_3(x)
        x = self.avgPool(x) # (B, Conv_dim, 1)
        x = x.squeeze(-1) # (B, Conv_dim); only the pooled axis is dropped
        x = self.linear(x) # (B, 1)
        x = self.sigmoid(x)
        return x
    
class RNN_Discriminator(nn.Module):
    """Recurrent discriminator over a ``(B, Cin, Tin)`` embedding.

    A kernel-3 ``Conv1d`` reduces the channels to 256, a 3-layer GRU consumes
    the resulting sequence, and the final hidden state of the top GRU layer is
    mapped to one sigmoid score per example.

    Args:
        Cin: number of input channels.
    """

    def __init__(self, Cin=1024):
        super(RNN_Discriminator, self).__init__()
        Conv_dim = 256
        self.conv = nn.Conv1d(in_channels=Cin,
                              out_channels=Conv_dim,
                              kernel_size=3,
                              stride=1)
        hidden = 256
        self.gru = nn.GRU(input_size=Conv_dim,
                          hidden_size=hidden,
                          num_layers=3)
        self.linear = nn.Linear(hidden, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x): # input tensor in shape (B, Cin, Tin)
        """Return sigmoid scores of shape ``(B, 1)``.

        The GRU has three layers, so its final hidden state is
        ``(3, B, hidden)``. The previous ``squeeze()`` left that tensor 3-D
        (or dropped the batch axis when ``B == 1``), so the module returned
        ``(3, B, 1)`` scores. Only the top layer's state is used now, which
        matches the ``(B, 1)`` shape the original comments describe.
        """
        _batch, _channels, _frames = x.shape  # rejects inputs that are not 3-D
        x = self.conv(x) # (B, Conv_dim, Tin-2)
        x = x.permute(2, 0, 1) # (Tin-2, B, Conv_dim)
        _, hout = self.gru(x) # hout: (num_layers, B, hidden)
        hout = hout[-1] # top layer's final state: (B, hidden)
        hout = self.linear(hout) # (B, 1)
        hout = self.sigmoid(hout) # (B, 1)
        return hout



class DownSample_Discriminator(nn.Module):
    """Gated 2-D convolution block used by the real/fake discriminator.

    Two parallel ``Conv2d + InstanceNorm2d`` branches see the same input; the
    output is the first branch multiplied element-wise by the sigmoid of the
    second (a gated linear unit).

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels produced by each branch.
        kernel_size: convolution kernel size for both branches.
        stride: convolution stride for both branches.
        padding: convolution padding for both branches.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()

        def make_branch():
            return nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=kernel_size,
                          stride=stride,
                          padding=padding),
                nn.InstanceNorm2d(num_features=out_channels, affine=True))

        # Value branch first, gate branch second (same order as before).
        self.convLayer = make_branch()
        self.convLayerGates = make_branch()

    def forward(self, input):
        """Return ``convLayer(input) * sigmoid(convLayerGates(input))``."""
        values = self.convLayer(input)
        gates = torch.sigmoid(self.convLayerGates(input))
        return values * gates
        

class RealFake_Discriminator(nn.Module):
    """2-D convolutional discriminator that scores patches as real or fake.

    Pipeline: gated 3x3 convolution (1 -> 128 channels), three stride-2 gated
    down-sampling blocks (128 -> 256 -> 512 -> 1024), one stride-1 gated block
    with a ``[1, 5]`` kernel, a ``[1, 3]`` convolution to a single channel and
    a sigmoid. The result is returned channels-last.

    ``fc`` is registered as a submodule but is not used by ``forward``; it is
    kept so the module's parameters and state dict stay the same.
    """

    def __init__(self):
        super().__init__()

        # Input gated convolution: value branch and gate branch share settings.
        stem_settings = dict(in_channels=1,
                             out_channels=128,
                             kernel_size=[3, 3],
                             stride=[1, 1])
        self.convLayer1 = nn.Conv2d(**stem_settings)
        self.convLayer1_gates = nn.Conv2d(**stem_settings)

        # Per the original author's note: kernel sizes were altered relative to
        # the paper so that dimensionality is retained after each layer without
        # TensorFlow-style padding='same'.
        self.downSample1 = DownSample_Discriminator(in_channels=128,
                                                    out_channels=256,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)
        self.downSample2 = DownSample_Discriminator(in_channels=256,
                                                    out_channels=512,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)
        self.downSample3 = DownSample_Discriminator(in_channels=512,
                                                    out_channels=1024,
                                                    kernel_size=[3, 3],
                                                    stride=[2, 2],
                                                    padding=0)
        self.downSample4 = DownSample_Discriminator(in_channels=1024,
                                                    out_channels=1024,
                                                    kernel_size=[1, 5],
                                                    stride=[1, 1],
                                                    padding=[0, 2])

        # Fully connected layer (currently unused by forward).
        self.fc = nn.Linear(in_features=1024,
                            out_features=1)

        # Final projection to a single score channel.
        self.output_layer = nn.Conv2d(in_channels=1024,
                                      out_channels=1,
                                      kernel_size=[1, 3],
                                      stride=[1, 1],
                                      padding=[0, 1])

    def forward(self, input):
        """Return sigmoid scores with the channel axis moved last.

        ``input`` has shape ``[batch_size, num_features, time]``; a singleton
        channel axis is inserted so the 2-D convolutions can consume it.
        """
        features = input.unsqueeze(1)

        # Gated linear unit on the input, padded by one cell on every side.
        padded = nn.ZeroPad2d((1, 1, 1, 1))(features)
        hidden = self.convLayer1(padded) * torch.sigmoid(self.convLayer1_gates(padded))

        # Each stride-2 stage is preceded by one zero cell on the left and top.
        pad_left_top = nn.ZeroPad2d((1, 0, 1, 0))
        for stage in (self.downSample1, self.downSample2, self.downSample3):
            hidden = stage(pad_left_top(hidden))

        hidden = self.downSample4(hidden)
        scores = self.output_layer(hidden)

        scores = scores.contiguous().permute(0, 2, 3, 1).contiguous()
        return torch.sigmoid(scores)

In [None]:
# Helper functions

def adjust_lr_rate(optimizer, name='generator'):
    """Apply one linear decay step to a global learning rate and push it to ``optimizer``.

    Args:
        optimizer: object exposing ``param_groups`` (a list of dicts with an
            ``'lr'`` entry); every group receives the new rate.
        name: ``'generator'`` decays the global ``generator_lr`` by
            ``generator_lr_decay``; any other value decays ``discriminator_lr``
            by ``discriminator_lr_decay``.

    The selected global is updated in place and never drops below 0.
    """
    global generator_lr, generator_lr_decay, discriminator_lr, discriminator_lr_decay
    if name == 'generator':
        generator_lr = max(0., generator_lr - generator_lr_decay)
        updated_lr = generator_lr
    else:
        discriminator_lr = max(0., discriminator_lr - discriminator_lr_decay)
        updated_lr = discriminator_lr

    for group in optimizer.param_groups:
        group['lr'] = updated_lr

def reset_grad():
    """Zero the gradients of every optimizer defined in this notebook.

    Covers the three encoder optimizers, the six decoder optimizers, the two
    real/fake discriminator optimizers and the CNN/RNN discriminator
    optimizers, in that order.
    """
    every_optimizer = (
        # encoders
        encoder_noCNN_optimizer,
        encoder_noRNN_optimizer,
        encoder_noCNNRNN_optimizer,
        # decoders
        decoder_2A_noCNN_optimizer,
        decoder_2A_noRNN_optimizer,
        decoder_2A_noCNNRNN_optimizer,
        decoder_2B_noCNN_optimizer,
        decoder_2B_noRNN_optimizer,
        decoder_2B_noCNNRNN_optimizer,
        # discriminators
        A_realfake_discriminator_optimizer,
        B_realfake_discriminator_optimizer,
        cnn_discriminator_optimizer,
        rnn_discriminator_optimizer,
    )
    for optimizer in every_optimizer:
        optimizer.zero_grad()

def savePickle(variable, fileName):
    """Pickle ``variable`` into the file at ``fileName``, replacing any existing content."""
    with open(fileName, mode='wb') as handle:
        pickle.Pickler(handle).dump(variable)

def loadPickleFile(fileName):
    """Return the object unpickled from the file at ``fileName``.

    Unpickling can execute arbitrary code stored in the file, so this should
    only be pointed at files from a trusted source.
    """
    with open(fileName, mode='rb') as handle:
        return pickle.Unpickler(handle).load()

def store_to_file(doc):
    """Append ``doc`` followed by a newline to the log file named by the global ``file_name``."""
    log_line = doc + "\n"
    with open(file_name, "a") as log_handle:
        log_handle.write(log_line)

def saveModelCheckPoint(self, epoch, PATH):
    """Write a training checkpoint dictionary to ``PATH`` with ``torch.save``.

    The checkpoint holds the epoch, the two global loss histories, and the
    ``state_dict()`` of two generators, two discriminators and their two
    optimizers.

    Args:
        self: unused; kept so existing call sites keep working.
        epoch: value stored under the ``'epoch'`` key.
        PATH: destination passed straight to ``torch.save``.

    NOTE(review): ``generator_A2B``, ``generator_B2A``, ``discriminator_A``,
    ``discriminator_B``, ``generator_optimizer`` and ``discriminator_optimizer``
    are not defined in the visible part of this notebook. Unless they are
    defined elsewhere, calling this raises ``NameError`` - confirm before use.
    """
    checkpoint = {
        'epoch': epoch,
        'generator_loss_store': generator_loss_store,
        'discriminator_loss_store': discriminator_loss_store,
        'model_genA2B_state_dict': generator_A2B.state_dict(),
        'model_genB2A_state_dict': generator_B2A.state_dict(),
        'model_discriminatorA': discriminator_A.state_dict(),
        'model_discriminatorB': discriminator_B.state_dict(),
        'generator_optimizer': generator_optimizer.state_dict(),
        'discriminator_optimizer': discriminator_optimizer.state_dict(),
    }
    torch.save(checkpoint, PATH)

# def loadModel(PATH):
#     checkPoint = torch.load(PATH)
#     generator_A2B.load_state_dict(
#         state_dict=checkPoint['model_genA2B_state_dict'])
#     generator_B2A.load_state_dict(
#         state_dict=checkPoint['model_genB2A_state_dict'])
#     discriminator_A.load_state_dict(
#         state_dict=checkPoint['model_discriminatorA'])
#     discriminator_B.load_state_dict(
#         state_dict=checkPoint['model_discriminatorB'])
#     generator_optimizer.load_state_dict(
#         state_dict=checkPoint['generator_optimizer'])
#     discriminator_optimizer.load_state_dict(
#         state_dict=checkPoint['discriminator_optimizer'])
#     epoch = int(checkPoint['epoch']) + 1
#     generator_loss_store = checkPoint['generator_loss_store']
#     discriminator_loss_store = checkPoint['discriminator_loss_store']
#     return epoch

In [None]:
# set up
# File locations: cached normalization statistics, cached (pickled) training
# features for speakers A and B, the checkpoint path, and the evaluation /
# output folders for each speaker.
logf0s_normalization = "./cache/logf0s_normalization.npz"
mcep_normalization = "./cache/mcep_normalization.npz"
coded_sps_A_norm = "./cache/coded_sps_A_norm.pickle"
coded_sps_B_norm = "./cache/coded_sps_B_norm.pickle" 
resume_training_at = "./cache/model_checkpoint/_CycleGAN_CheckPoint"
validation_A_dir = "./data/vcc2016_training/evaluation_all/SF1/" 
output_A_dir = "./data/vcc2016_training/converted_sound/SF1"
validation_B_dir = "./data/vcc2016_training/evaluation_all/TF2/" 
output_B_dir = "./data/vcc2016_training/converted_sound/TF2/"
# NOTE(review): all_dir is not referenced anywhere in the visible part of this notebook.
all_dir = [resume_training_at, validation_A_dir, output_A_dir, validation_B_dir, output_B_dir]
# =================================================
# Training schedule and data.
start_epoch = 0
num_epochs = 5000
mini_batch_size = 8
# loadPickleFile unpickles the file, which can execute arbitrary code: these
# caches must come from a trusted preprocessing run.
dataset_A = loadPickleFile(coded_sps_A_norm)
dataset_B = loadPickleFile(coded_sps_B_norm)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available() else 'cpu')

# Speech Parameters
# From here on logf0s_normalization no longer holds the path string but the
# object returned by np.load; the per-speaker mean/std arrays are read from it.
logf0s_normalization = np.load(logf0s_normalization)
log_f0s_mean_A = logf0s_normalization['mean_A']
log_f0s_std_A = logf0s_normalization['std_A']
log_f0s_mean_B = logf0s_normalization['mean_B']
log_f0s_std_B = logf0s_normalization['std_B']

# Same name reuse as above: mcep_normalization switches from path to loaded archive.
mcep_normalization = np.load(mcep_normalization)
coded_sps_A_mean = mcep_normalization['mean_A']
coded_sps_A_std = mcep_normalization['std_A']
coded_sps_B_mean = mcep_normalization['mean_B']
coded_sps_B_std = mcep_normalization['std_B']

# Encoder and Decoder
# Three encoder variants (noCNN / noRNN / noCNNRNN) and, for each variant, one
# decoder towards speaker A and one towards speaker B. The construction order
# determines the order in which random initial weights are drawn.
encoder_noCNN = Encoder().to(device)
encoder_noRNN = Encoder().to(device)
encoder_noCNNRNN = Encoder().to(device)
decoder_2A_noCNN = Decoder().to(device)
decoder_2A_noRNN = Decoder().to(device)
decoder_2A_noCNNRNN = Decoder().to(device)
decoder_2B_noCNN = Decoder().to(device)
decoder_2B_noRNN = Decoder().to(device)
decoder_2B_noCNNRNN = Decoder().to(device)

# Discriminator
# Cin=512 matches the 512-channel output of the Encoder's residual blocks.
cnn_discriminator = CNN_Discriminator(Cin=512, Tmax=256).to(device)
rnn_discriminator = RNN_Discriminator(Cin=512).to(device)
A_realfake_discriminator = RealFake_Discriminator().to(device)
B_realfake_discriminator = RealFake_Discriminator().to(device)

# Loss Functions
criterion_mse = torch.nn.MSELoss()

# Initial learning rates
generator_lr = 0.0002
discriminator_lr = 0.0001
realfake_discriminator_lr = 0.0001

# Learning rate decay
# Amount subtracted per adjust_lr_rate call: 1/200000 of the initial rate.
# realfake_discriminator_lr has no decay constant of its own here.
generator_lr_decay = generator_lr / 200000
discriminator_lr_decay = discriminator_lr / 200000

# Optimizers
def get_optimizer(one_module, learning_rate):
    """Return an Adam optimizer over all parameters of ``one_module``.

    Args:
        one_module: module whose ``parameters()`` are optimized.
        learning_rate: value passed as Adam's ``lr``.

    Adam's betas are fixed to ``(0.5, 0.999)``.
    """
    params = list(one_module.parameters())
    adam_betas = (0.5, 0.999)
    return torch.optim.Adam(params, lr=learning_rate, betas=adam_betas)

# One Adam optimizer per encoder, all starting at generator_lr.
encoder_noCNN_optimizer = get_optimizer(encoder_noCNN, generator_lr)
encoder_noRNN_optimizer = get_optimizer(encoder_noRNN, generator_lr)
encoder_noCNNRNN_optimizer = get_optimizer(encoder_noCNNRNN, generator_lr)

# One Adam optimizer per decoder, also starting at generator_lr.
decoder_2A_noCNN_optimizer = get_optimizer(decoder_2A_noCNN, generator_lr)
decoder_2A_noRNN_optimizer = get_optimizer(decoder_2A_noRNN, generator_lr)
decoder_2A_noCNNRNN_optimizer = get_optimizer(decoder_2A_noCNNRNN, generator_lr)
decoder_2B_noCNN_optimizer = get_optimizer(decoder_2B_noCNN, generator_lr)
decoder_2B_noRNN_optimizer = get_optimizer(decoder_2B_noRNN, generator_lr)
decoder_2B_noCNNRNN_optimizer = get_optimizer(decoder_2B_noCNNRNN, generator_lr)

# All encoder/decoder optimizers in one list; generators_reset_grad() and
# generators_update() iterate over it.
generators_optimizers = [
    encoder_noCNN_optimizer,
    encoder_noRNN_optimizer,
    encoder_noCNNRNN_optimizer,
    decoder_2A_noCNN_optimizer,
    decoder_2A_noRNN_optimizer,
    decoder_2A_noCNNRNN_optimizer,
    decoder_2B_noCNN_optimizer,
    decoder_2B_noRNN_optimizer,
    decoder_2B_noCNNRNN_optimizer
]


# Discriminator optimizers: the real/fake pair uses realfake_discriminator_lr,
# the CNN/RNN pair uses discriminator_lr.
A_realfake_discriminator_optimizer = get_optimizer(A_realfake_discriminator, realfake_discriminator_lr)
B_realfake_discriminator_optimizer = get_optimizer(B_realfake_discriminator, realfake_discriminator_lr)
cnn_discriminator_optimizer = get_optimizer(cnn_discriminator, discriminator_lr)
rnn_discriminator_optimizer = get_optimizer(rnn_discriminator, discriminator_lr)


# To Load save previously saved models
#modelCheckpoint = model_checkpoint

# Validation set Parameters
# NOTE(review): the four assignments below assign each name to itself and have no effect.
validation_A_dir = validation_A_dir
output_A_dir = output_A_dir
validation_B_dir = validation_B_dir
output_B_dir = output_B_dir

# Storing Discriminatior and Generator Loss
generator_loss_store = []
discriminator_loss_store = []
cnn_discriminator_loss_store = []
rnn_discriminator_loss_store = []

# Log file appended to by store_to_file().
file_name = 'log_store_non_sigmoid.txt'
# start_epoch was already set to 0 earlier in this cell; this repeats it.
start_epoch = 0
# NOTE(review): the string literal below is disabled resume logic, not executed
# code. It refers to restart_training_at (this cell defines resume_training_at)
# and to loadModel, which is commented out in this notebook. As the last
# expression of the cell it is also echoed as the cell's output.
'''
if restart_training_at is not None:
    # Training will resume from previous checkpoint
    start_epoch = loadModel(restart_training_at)
    print("Training resumed")
'''

In [None]:
def generators_reset_grad():
    """Zero the gradients held by every optimizer in ``generators_optimizers``."""
    for generator_optimizer_ in generators_optimizers:
        generator_optimizer_.zero_grad()

def generators_update():
    """Run one ``step()`` on every optimizer in ``generators_optimizers``."""
    for generator_optimizer_ in generators_optimizers:
        generator_optimizer_.step()

def encoders_forward(voices):
    """Run each of the three global encoders on its own input.

    Args:
        voices: exactly three inputs, ordered (noCNN, noRNN, noCNNRNN).

    Returns:
        A 3-tuple of embeddings in the same order, produced by
        ``encoder_noCNN``, ``encoder_noRNN`` and ``encoder_noCNNRNN``.
    """
    voice_noCNN, voice_noRNN, voice_noCNNRNN = voices
    return (
        encoder_noCNN(voice_noCNN),
        encoder_noRNN(voice_noRNN),
        encoder_noCNNRNN(voice_noCNNRNN),
    )

def decoders_forward(embeddings, isToA=True):
    """Decode three embeddings with the decoder set for the requested speaker.

    Args:
        embeddings: exactly three embeddings, ordered (noCNN, noRNN, noCNNRNN).
        isToA: when true the ``decoder_2A_*`` decoders are used, otherwise the
            ``decoder_2B_*`` decoders.

    Returns:
        A 3-tuple of decoder outputs in the same order. Each decoder also
        receives the shape of ``downsample2_forshape`` stored on the matching
        global encoder, i.e. the shape left there by that encoder's most
        recent forward pass.
    """
    embedding_noCNN, embedding_noRNN, embedding_noCNNRNN = embeddings

    if isToA:
        decoders = (decoder_2A_noCNN, decoder_2A_noRNN, decoder_2A_noCNNRNN)
    else:
        decoders = (decoder_2B_noCNN, decoder_2B_noRNN, decoder_2B_noCNNRNN)

    encoders = (encoder_noCNN, encoder_noRNN, encoder_noCNNRNN)
    ordered_embeddings = (embedding_noCNN, embedding_noRNN, embedding_noCNNRNN)

    return tuple(
        decoder(embedding, encoder.downsample2_forshape.shape)
        for decoder, embedding, encoder in zip(decoders, ordered_embeddings, encoders)
    )

def AB_discriminator_zero_grad():
    """Zero the gradients of the CNN and RNN discriminator optimizers."""
    for optimizer in (cnn_discriminator_optimizer, rnn_discriminator_optimizer):
        optimizer.zero_grad()

def AB_discriminator_update():
    """Run one ``step()`` on the CNN and RNN discriminator optimizers."""
    for optimizer in (cnn_discriminator_optimizer, rnn_discriminator_optimizer):
        optimizer.step()

def AB_discriminator_forward_loss(embeddings, isA=True, gradients_for_discriminator=True):
    """Sum of squared-error losses of the CNN and RNN discriminators on three embeddings.

    Args:
        embeddings: exactly three embeddings, ordered (noCNN, noRNN, noCNNRNN).
        isA: selects the true label, 0.0 when true and 1.0 otherwise.
        gradients_for_discriminator: when true every prediction is compared
            with the true label. When false some targets become 0.5:
              * CNN discriminator: noCNN -> 0.5, noRNN -> true label, noCNNRNN -> 0.5
              * RNN discriminator: noCNN -> true label, noRNN -> 0.5, noCNNRNN -> 0.5

    Returns:
        The sum of the six mean-squared-error terms (three per discriminator).
    """
    emb_noCNN, emb_noRNN, emb_noCNNRNN = embeddings
    ordered_embeddings = (emb_noCNN, emb_noRNN, emb_noCNNRNN)

    true_pred = 0.0 if isA else 1.0
    undecided = 0.5

    # Targets are listed in embedding order: (noCNN, noRNN, noCNNRNN).
    if gradients_for_discriminator:
        cnn_targets = (true_pred, true_pred, true_pred)
        rnn_targets = (true_pred, true_pred, true_pred)
    else:
        cnn_targets = (undecided, true_pred, undecided)
        rnn_targets = (true_pred, undecided, undecided)

    def squared_error(target, prediction):
        return torch.mean((target - prediction) ** 2)

    # CNN discriminator: all three forwards first, then the three losses.
    cnn_predictions = [cnn_discriminator(embedding) for embedding in ordered_embeddings]
    cnn_losses = [squared_error(target, prediction)
                  for target, prediction in zip(cnn_targets, cnn_predictions)]

    # RNN discriminator: same pattern.
    rnn_predictions = [rnn_discriminator(embedding) for embedding in ordered_embeddings]
    rnn_losses = [squared_error(target, prediction)
                  for target, prediction in zip(rnn_targets, rnn_predictions)]

    # Accumulate left to right: CNN terms first, then RNN terms.
    d_loss = cnn_losses[0]
    for term in cnn_losses[1:] + rnn_losses:
        d_loss = d_loss + term
    return d_loss

def realfake_discriminator_zero_grad(realfake_discriminator_optimizer):
    """Zero the gradients held by the given real/fake discriminator optimizer."""
    optimizer = realfake_discriminator_optimizer
    optimizer.zero_grad()

def realfake_discriminator_update(realfake_discriminator_optimizer):
    """Run one ``step()`` on the given real/fake discriminator optimizer."""
    optimizer = realfake_discriminator_optimizer
    optimizer.step()
    
def realfake_discriminator_forward_loss(fakes, real, realfake_discriminator_optimizer, realfake_discriminator, gradients_for_discriminator=True):
    """Least-squares real/fake adversarial loss for one real/fake discriminator.

    Args:
        fakes: sequence of exactly three generated tensors, unpacked in the
            order (noCNN, noRNN, noCNNRNN).
        real: real sample; only scored when ``gradients_for_discriminator``
            is True.
        realfake_discriminator_optimizer: optimizer whose gradients are cleared
            before the forward pass.
        realfake_discriminator: callable mapping a sample to a prediction tensor.
        gradients_for_discriminator: True computes the discriminator objective
            (fakes regressed towards 0.0, real towards 1.0). False computes the
            generator objective (fakes regressed towards 1.0, real input skipped).

    Returns:
        Scalar loss tensor: the mean of the three fake terms, additionally
        averaged with the real term in discriminator mode.
    """
    realfake_discriminator_optimizer.zero_grad()

    fake_noCNN, fake_noRNN, fake_noCNNRNN = fakes

    # The discriminator learns to score fakes as 0.0, whereas the generators
    # must be rewarded when the discriminator scores their output as real
    # (1.0). Using 0.0 in generator mode would train the generators to
    # produce samples that are easy to reject.
    fake_target = 0.0 if gradients_for_discriminator else 1.0

    pred_noCNN = realfake_discriminator(fake_noCNN)
    pred_noRNN = realfake_discriminator(fake_noRNN)
    pred_noCNNRNN = realfake_discriminator(fake_noCNNRNN)

    d_loss_noCNN = torch.mean((fake_target - pred_noCNN) ** 2)
    d_loss_noRNN = torch.mean((fake_target - pred_noRNN) ** 2)
    d_loss_noCNNRNN = torch.mean((fake_target - pred_noCNNRNN) ** 2)

    d_loss = (d_loss_noCNN + d_loss_noRNN + d_loss_noCNNRNN) / 3.0

    # When computing generator gradients the real input is skipped:
    # no gradient flows from it back to the generators.
    if gradients_for_discriminator:
        pred_real = realfake_discriminator(real)
        d_loss_real = torch.mean((1.0 - pred_real) ** 2)
        d_loss = (d_loss + d_loss_real) / 2.0

    return d_loss

def compute_identity_loss(fakes, real):
    """Average L1 distance between ``real`` and each of three generated tensors.

    Args:
        fakes: sequence of exactly three tensors (noCNN, noRNN, noCNNRNN).
        real: reference tensor each entry of ``fakes`` is compared against.

    Returns:
        Scalar tensor: the mean absolute difference per tensor, averaged over
        the three tensors.
    """
    first, second, third = fakes
    per_branch = [torch.mean(torch.abs(real - candidate))
                  for candidate in (first, second, third)]
    accumulated = per_branch[0] + per_branch[1] + per_branch[2]
    return accumulated / 3.0

def compute_cycle_loss(fakes, real):
    """Average L1 distance between ``real`` and each of three reconstructed tensors.

    Args:
        fakes: sequence of exactly three tensors (noCNN, noRNN, noCNNRNN).
        real: reference tensor each entry of ``fakes`` is compared against.

    Returns:
        Scalar tensor: the mean absolute difference per tensor, averaged over
        the three tensors.
    """
    recon_noCNN, recon_noRNN, recon_noCNNRNN = fakes

    def _mean_abs_error(reconstruction):
        return torch.abs(real - reconstruction).mean()

    summed = (_mean_abs_error(recon_noCNN)
              + _mean_abs_error(recon_noRNN)
              + _mean_abs_error(recon_noCNNRNN))
    return summed / 3.0
    

In [None]:
def generate(voice, isToA=True):
    """Encode ``voice`` with ``encoder_noCNNRNN`` and decode it with one of two decoders.

    Args:
        voice: input passed to ``encoder_noCNNRNN``.
        isToA: True selects ``decoder_2A_noCNNRNN``, False selects
            ``decoder_2B_noCNNRNN``.

    Returns:
        The output of the selected decoder.
    """
    latent = encoder_noCNNRNN(voice)

    if isToA:
        target_decoder = decoder_2A_noCNNRNN
    else:
        target_decoder = decoder_2B_noCNNRNN

    # The shape attribute is read only after the encoder has processed `voice`
    # (presumably it is recorded during that forward pass -- confirm in the encoder).
    reference_shape = encoder_noCNNRNN.downsample2_forshape.shape
    return target_decoder(latent, reference_shape)

def validation_for_A_dir(save_dir):
    """Convert every file in ``validation_A_dir`` from speaker A to speaker B.

    For each file: load at 16 kHz mono, pad, decompose via ``preprocess``,
    convert f0 with the A->B log-f0 statistics, normalise the coded spectral
    envelope with the A statistics, run ``generate(..., isToA=False)``,
    de-normalise with the B statistics, synthesise and write the waveform to
    ``save_dir`` under the same file name.

    Args:
        save_dir: output directory; created if it does not exist.

    Relies on module-level globals: ``validation_A_dir``, ``log_f0s_mean_A``,
    ``log_f0s_std_A``, ``log_f0s_mean_B``, ``log_f0s_std_B``,
    ``coded_sps_A_mean``, ``coded_sps_A_std``, ``coded_sps_B_mean`` and
    ``coded_sps_B_std``.
    """
    num_mcep = 24
    sampling_rate = 16000
    frame_period = 5.0

    os.makedirs(save_dir, exist_ok=True)

    print("Generating Validation Data B from A...")
    for file in os.listdir(validation_A_dir):
        filePath = os.path.join(validation_A_dir, file)
        # Sub-directories cannot be loaded as audio; skip them instead of crashing.
        if not os.path.isfile(filePath):
            continue

        wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
        wav = preprocess.wav_padding(wav=wav,
                                     sr=sampling_rate,
                                     frame_period=frame_period,
                                     multiple=4)
        f0, _, sp, ap = preprocess.world_decompose(
            wav=wav, fs=sampling_rate, frame_period=frame_period)
        f0_converted = preprocess.pitch_conversion(f0=f0,
                                                   mean_log_src=log_f0s_mean_A,
                                                   std_log_src=log_f0s_std_A,
                                                   mean_log_target=log_f0s_mean_B,
                                                   std_log_target=log_f0s_std_B)
        coded_sp = preprocess.world_encode_spectral_envelop(
            sp=sp, fs=sampling_rate, dim=num_mcep)
        coded_sp_transposed = coded_sp.T
        coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
        # Add a leading batch axis of size 1.
        coded_sp_norm = np.array([coded_sp_norm])

        coded_sp_norm = torch.from_numpy(coded_sp_norm).float()
        if torch.cuda.is_available():
            coded_sp_norm = coded_sp_norm.cuda()

        # Inference only: no autograd graph is needed for validation output.
        with torch.no_grad():
            coded_sp_converted_norm = generate(coded_sp_norm, isToA=False)
        coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach().numpy()
        coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
        coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = preprocess.world_decode_spectral_envelop(
            coded_sp=coded_sp_converted, fs=sampling_rate)
        wav_transformed = preprocess.world_speech_synthesis(f0=f0_converted,
                                                            decoded_sp=decoded_sp_converted,
                                                            ap=ap,
                                                            fs=sampling_rate,
                                                            frame_period=frame_period)
        # NOTE(review): librosa.output.write_wav is only available in
        # librosa < 0.8 -- confirm the pinned librosa version.
        librosa.output.write_wav(path=os.path.join(save_dir, os.path.basename(file)),
                                 y=wav_transformed,
                                 sr=sampling_rate)

In [None]:

# Iteration threshold: once `num_iterations` exceeds this value, the training
# loop calls `adjust_lr_rate` on every subsequent iteration.
start_decay = 10

In [None]:
# Training loop.
# Every epoch rebuilds the paired dataset and, for each mini-batch, runs two
# rounds: (source=A, target=B) and (source=B, target=A). Each round first
# updates the generators on the combined loss, then updates the A/B
# discriminators and the target-side real/fake discriminator on detached tensors.
# After each epoch, validation audio is written to ./model_complex_valid/<epoch>/.
for epoch in range(start_epoch, num_epochs):
    start_time_epoch = time.time()

    # Loss weights (re-initialised at the start of every epoch)
    cycle_loss_lambda = 10
    identity_loss_lambda = 5

    # Preparing Dataset
    n_samples = len(dataset_A)

    dataset = trainingDataset(datasetA=dataset_A,
                              datasetB=dataset_B,
                              n_frames=128)
    train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                               batch_size=mini_batch_size,
                                               shuffle=True,
                                               drop_last=False)

    for i, (real_A, real_B) in enumerate(train_loader):

        # Global iteration counter derived from the epoch index and batch index.
        num_iterations = (
            n_samples // mini_batch_size) * epoch + i
        # print("iteration no: ", num_iterations, epoch)

        # The identity term is switched off after 10000 iterations.
        if num_iterations > 10000:
            identity_loss_lambda = 0
        if num_iterations > start_decay:
            # NOTE(review): both calls receive a *generator* optimizer, including
            # the one with name='discriminator'; no discriminator optimizer is
            # passed here. Confirm against adjust_lr_rate whether this is intended.
            for generator_optimizer in generators_optimizers:
                adjust_lr_rate(
                    generator_optimizer, name='generator')
                adjust_lr_rate(
                    generator_optimizer, name='discriminator')

        real_A = real_A.to(device).float()
        real_B = real_B.to(device).float()

        for (real1, real2, real1_is_A) in [(real_A, real_B, True), (real_B, real_A, False)]:
            # real1 is the source speaker of this round, real2 the target speaker.
            real2_is_A = not real1_is_A
            if (real2_is_A):
                current_realfake_discriminator = A_realfake_discriminator
                current_realfake_discriminator_optimizer = A_realfake_discriminator_optimizer
            else:
                current_realfake_discriminator = B_realfake_discriminator
                current_realfake_discriminator_optimizer = B_realfake_discriminator_optimizer

            reset_grad()

            # ----------------------------------------------------------------
            # full forward pass and compute loss for improving generators only
            # then backward and update generators

            # adversarial A/B loss
            embeddings1 = encoders_forward((real1, real1, real1))
            adversarial_AB_loss = AB_discriminator_forward_loss(embeddings1,
                                                                isA=real1_is_A,
                                                                gradients_for_discriminator=False)

            # identity loss
            fakes11 = decoders_forward(embeddings1, isToA=real1_is_A)
            identity_loss = compute_identity_loss(fakes11, real1)

            # adversarial real/fake loss
            fakes12 = decoders_forward(embeddings1, isToA=real2_is_A)

            ## TODO: separate A/B
            # A_realfake_discriminator_optimizer
            adversarial_RealFake_loss = realfake_discriminator_forward_loss(fakes12, real2, current_realfake_discriminator_optimizer,
                                                                            current_realfake_discriminator, gradients_for_discriminator=False)

            # cycle consistency loss
            embeddings12 = encoders_forward(fakes12)
            fakes121 = decoders_forward(embeddings12, isToA=real1_is_A)
            cycle_loss = compute_cycle_loss(fakes121, real1)

            # compute total generator loss; the cycle and identity terms are
            # scaled by their lambdas (previously the lambdas were assigned
            # but never applied, so zeroing identity_loss_lambda had no effect)
            total_generator_loss = adversarial_AB_loss \
                                 + identity_loss_lambda * identity_loss \
                                 + adversarial_RealFake_loss \
                                 + cycle_loss_lambda * cycle_loss

            # backward and update generators only
            total_generator_loss.backward()
            generators_update()

            # --------------------------------------------
            # detach embeddings and fakes12 tensors from autograd
            # then forward and backward the discriminators only on detached embeddings and fakes12 tensors
            # gradients won't flow back to encoders or decoders

            AB_discriminator_zero_grad()
            embeddings1 = (embeddings1[0].detach(), embeddings1[1].detach(), embeddings1[2].detach())
            d_AB_loss = AB_discriminator_forward_loss(embeddings1,
                                                      isA=real1_is_A,
                                                      gradients_for_discriminator=True)
            d_AB_loss.backward()
            AB_discriminator_update()

            realfake_discriminator_zero_grad(current_realfake_discriminator_optimizer)
            fakes12 = (fakes12[0].detach(), fakes12[1].detach(), fakes12[2].detach())
            d_RealFake_loss = realfake_discriminator_forward_loss(fakes12, real2, current_realfake_discriminator_optimizer,
                                                                  current_realfake_discriminator, gradients_for_discriminator=True)
            d_RealFake_loss.backward()
            realfake_discriminator_update(current_realfake_discriminator_optimizer)

            total_discriminator_loss = d_RealFake_loss + d_AB_loss

            # Log every second iteration.
            if num_iterations % 2 == 0:
                if (real2_is_A):
                    current_round = 'A'
                else:
                    current_round = 'B'
                print("Iter:{}, Real Voice:{}, Generator Loss:{:.4f} || Real Fake Discrimator Loss:{:.4f} || ABDiscrimator Loss:{:.4f}".format(
                    num_iterations, current_round, total_generator_loss.item(), d_RealFake_loss.item(), d_AB_loss.item()))

    # Per-epoch validation output directory.
    save_dir = "./model_complex_valid/{}/".format(epoch)
    os.makedirs(save_dir, exist_ok=True)
    validation_for_A_dir(save_dir)

In [None]:
# Scratch cell: log one histogram to TensorBoard.
# NOTE(review): `writer`, `name`, `param` and `n_iter` are not defined in this
# cell; they must already exist in the kernel before it is run.
writer.add_histogram(name, param, n_iter)
# The second line of this cell was a pasted call template with an empty
# positional argument (a SyntaxError). It is kept here only as a reference:
#   add_histogram("gradient", <values>, global_step=None, bins='tensorflow', walltime=None, max_bins=None)

In [None]:
encoder_noCNN.