In [None]:
import os
# Switch the working directory to the project folder under /content/drive so
# that the relative paths used by the later cells (./data/..., out/...) resolve
# inside it.
os.chdir('/content/drive/My Drive/GAN')

In [None]:
!pwd

/content/drive/My Drive/GAN


In [None]:
# librosa supplies the audio loading (librosa.load) used by the cells below.
pip install librosa



In [None]:
# pyworld supplies the harvest / cheaptrick / d4c / code_spectral_envelope
# calls used for feature extraction in the cells below.
pip install pyworld



In [None]:
import librosa
import numpy as np
import os
import pyworld
import pyworld as pw
import glob
from utility import *
import argparse

# Number of coded spectral-envelope coefficients kept per frame (the dimension
# passed to pyworld.code_spectral_envelope).
FEATURE_DIM = 36
# Sampling rate, in Hz, that audio files are loaded at.
SAMPLE_RATE = 16000
# Number of frames in every saved training segment / model input slice.
FRAMES = 512
# FFT size handed to pyworld.cheaptrick, pyworld.d4c and decode_spectral_envelope.
FFTSIZE = 1024
SPEAKERS_NUM = 4  # in our experiment, we use four speakers

EPSILON = 1e-10  # not referenced in the code visible here; presumably used elsewhere - confirm
# File name used when saving and loading model checkpoints.
MODEL_NAME = 'starganvc_model'


def load_wavs(dataset: str, sr):
    '''
    Load every audio file found one level below `dataset`.

    Each sub-directory of `dataset` is treated as one speaker and every regular
    file inside it is loaded with librosa (mono, float64, resampled to `sr`).

    Returns a nested dict {speaker_dir_name: {file_stem: waveform}}, where
    file_stem is the file name up to its first dot (like 100061).
    '''
    # Pass 1: collect the file paths of every speaker directory,
    # like {TM1: [path, path, ...]}.
    paths_by_speaker = {}
    with os.scandir(dataset) as speaker_entries:
        for speaker_entry in speaker_entries:
            if not speaker_entry.is_dir():
                continue
            with os.scandir(speaker_entry.path) as file_entries:
                paths_by_speaker[speaker_entry.name] = [
                    item.path for item in file_entries if item.is_file()
                ]
    print(f'loaded keys: {paths_by_speaker.keys()}')

    # Pass 2: decode the audio, like {TM1: {100062: [xxxxx], ...}}.
    waveforms = {}
    loaded = 0
    for speaker, paths in paths_by_speaker.items():
        per_speaker = waveforms[speaker] = {}
        for path in paths:
            stem = os.path.normpath(path).split(os.sep)[-1].split('.')[0]
            signal, _ = librosa.load(path, sr=sr, mono=True, dtype=np.float64)
            per_speaker[f'{stem}'] = signal
            # one progress dot per decoded file
            print('.', end='')
            loaded += 1

    print(f'\nTotal {loaded} aduio files!')
    return waveforms


def wav_to_mcep_file(dataset: str, sr=16000, ispad: bool = False, processed_filepath: str = './data/processed'):
    '''Extract features for every wav under `dataset` and write them to `processed_filepath`.

    For each audio file two kinds of output are written:
      * `<speaker>-<name>.npz`: the whole feature dict returned by cal_mcep;
      * `<speaker>-<name>_<start>.npy`: one file per non-overlapping block of
        FRAMES columns of the coded spectral envelope.

    If `processed_filepath` already exists its *.npy files are removed first
    (existing *.npz files are left in place); otherwise it is created.
    '''
    if os.path.exists(processed_filepath):
        for stale in glob.glob(os.path.join(processed_filepath, "*.npy")):
            os.remove(stale)
    else:
        os.makedirs(processed_filepath)

    # Only used for the progress display below.
    allwavs_cnt = len(glob.glob(f'{dataset}/*/*.wav'))
    print(f'Total {allwavs_cnt} audio files!')

    wavs_by_speaker = load_wavs(dataset, sr)
    cnt = 1  # 1-based index of the audio file currently being processed

    for one_speaker, speaker_wavs in wavs_by_speaker.items():
        for audio_name, audio_wav in speaker_wavs.items():
            features = cal_mcep(
                audio_wav, fs=sr, ispad=ispad, frame_period=0.005, dim=FEATURE_DIM)
            newname = f'{one_speaker}-{audio_name}'

            # Whole-utterance features; np.savez appends the .npz suffix.
            file_path_z = f'{processed_filepath}/{newname}'
            print(f'save file: {file_path_z}')
            np.savez(file_path_z, features)

            coded_sp = features["coded_sp"]
            print(f'audio mcep shape {coded_sp.shape}')

            # Cut the coded envelope into consecutive FRAMES-wide blocks.
            # TODO step may be FRAMES//2
            last_start = coded_sp.shape[1] - FRAMES + 1
            for start_idx in range(0, last_start, FRAMES):
                segment = coded_sp[:, start_idx:start_idx + FRAMES]
                if segment.shape[1] != FRAMES:
                    continue

                filePath = f'{processed_filepath}/{newname}_{start_idx}'
                print(f'[{cnt}:{allwavs_cnt}]svaing file: {filePath}.npy')
                np.save(filePath, segment)
            cnt += 1


def cal_mcep(wav_ori, fs=SAMPLE_RATE, ispad=False, frame_period=0.005, dim=FEATURE_DIM, fft_size=FFTSIZE):
    '''Compute WORLD features for one waveform.

    wav_ori: the audio signal.
    fs: sampling rate of the signal.
    ispad: when True the signal is first padded with
        pad_wav_to_get_fixed_frames (using FRAMES frames per block).
    frame_period: used only for that padding step; it is not passed to pyworld.
    dim: number of coefficients of the coded spectral envelope.
    fft_size: FFT size given to cheaptrick and d4c.

    Returns a dict with keys 'f0' (n), 'ap' and 'sp' (n x fft_size//2+1) and
    'coded_sp' (dim x n).
    '''
    wav = wav_ori
    if ispad:
        wav, _ = pad_wav_to_get_fixed_frames(
            wav_ori, frames=FRAMES, frame_period=frame_period, sr=fs)

    # Harvest F0 extraction algorithm.
    f0, timeaxis = pyworld.harvest(wav, fs)
    # CheapTrick harmonic spectral envelope estimation algorithm.
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=fft_size)
    # D4C aperiodicity estimation algorithm.
    ap = pyworld.d4c(wav, f0, timeaxis, fs, fft_size=fft_size)
    # Feature reduction to n x dim, then transpose to dim x n.
    coded_sp = pyworld.code_spectral_envelope(sp, fs, dim).T

    return dict(f0=f0, ap=ap, sp=sp, coded_sp=coded_sp)


def pad_wav_to_get_fixed_frames(x: np.ndarray, frames: int = 128, frame_period: float = 0.005, sr: int = 16000):
    '''Pad a waveform so its length is a whole number of `frames`-frame blocks, minus one sample.

    The padding is cyclic: the signal is continued with copies of itself from
    the beginning until the target length is reached.

    x: 1-D signal.
    frames: number of frames per block.
    frame_period: duration of one frame in seconds.
    sr: sampling rate in Hz.

    Returns (padded, need_pad): `padded` is a float64 array of length
    len(x) + need_pad - 1 and `need_pad` is the number of samples that had to
    be added (0 when len(x) is already a multiple of the block size).
    '''
    # Samples in one frame and in one block of `frames` frames (may be floats).
    samples_per_frame = frame_period * sr
    samples_per_block = frames * samples_per_frame

    wav_len = len(x)

    need_pad = 0
    if wav_len % samples_per_block != 0:
        # Not divisible: pad up to the next whole block.
        full_blocks = wav_len // samples_per_block
        need_pad = int((full_blocks + 1) * samples_per_block - wav_len)

    # np.resize fills the enlarged array with repeated copies of the signal,
    # which is exactly the wrap-around padding wanted here, for any pad length.
    # np.float64 replaces the removed `np.float` alias.
    padded = np.resize(np.asarray(x, dtype=np.float64), wav_len + need_pad)

    # Remove the last point for calculation convenience, so that the number of
    # analysis frames comes out as frames * (some integer).
    return padded[:-1], need_pad


if __name__ == "__main__":

    def _str2bool(value):
        '''Parse a command-line boolean.

        argparse's type=bool calls bool() on the raw string, so any non-empty
        value - including "False" - would be parsed as True.
        '''
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string stays False, as it was with type=bool.
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = argparse.ArgumentParser(description='Convert the wav waveform to mel-cepstral coefficients(MCCs)\
    and calculate the speech statistical characteristics')

    input_dir = './data/fourspeakers'
    output_dir = './data/processed'
    ispad = True
    # Jupyter passes "-f <connection file>" to the kernel; accept and ignore it.
    parser.add_argument('-f')
    parser.add_argument('--input_dir', type=str,
                        help='the direcotry contains data need to be processed', default=input_dir)
    parser.add_argument('--output_dir', type=str,
                        help='the directory stores the processed data', default=output_dir)
    parser.add_argument(
        '--ispad', type=_str2bool, help='whether to pad the wavs  to get fixed length MCEP', default=ispad)

    argv = parser.parse_args()
    input_dir = argv.input_dir
    output_dir = argv.output_dir
    ispad = argv.ispad

    wav_to_mcep_file(input_dir, SAMPLE_RATE, ispad=ispad,
                     processed_filepath=output_dir)

    # input_dir is train dataset. we need to calculate and save the speech
    # statistical characteristics for each speaker.
    generator = GenerateStatics(output_dir)
    generator.generate_stats()

Total 648 audio files!
loaded keys: dict_keys(['TM2', 'TM1', 'SF2', 'SF1'])
........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
Total 648 aduio files!
save file: ./data/processed/TM2-100004
audio mcep shape (36, 512)
[1:648]svaing file: ./data/processed/TM2-100004_0.npy
save file: ./data/processed/TM2-100028
audio mcep shape (36, 512)
[2:648]svaing file: ./data/processed/TM2-100028_0.npy
save file: .

In [None]:
# Pin TensorFlow 1.8.0: the model code below uses tf.contrib and tf.layers.
# NOTE(review): in the captured run this cell still printed 2.3.0 and only the
# following cell printed 1.8.0 - presumably the runtime has to be restarted
# after the install before the pinned version is the one imported.
!pip install tensorflow==1.8.0
import tensorflow as tf
print(tf.__version__)

Collecting tensorflow==1.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/22/c6/d08f7c549330c2acc1b18b5c1f0f8d9d2af92f54d56861f331f372731671/tensorflow-1.8.0-cp36-cp36m-manylinux1_x86_64.whl (49.1MB)
[K     |████████████████████████████████| 49.1MB 78kB/s 
Collecting tensorboard<1.9.0,>=1.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/59/a6/0ae6092b7542cfedba6b2a1c9b8dceaf278238c39484f3ba03b03f07803c/tensorboard-1.8.0-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 41.1MB/s 
Collecting html5lib==0.9999999
[?25l  Downloading https://files.pythonhosted.org/packages/ae/ae/bcb60402c60932b32dfaf19bb53870b29eda2cd17551ba5639219fb5ebf9/html5lib-0.9999999.tar.gz (889kB)
[K     |████████████████████████████████| 890kB 38.8MB/s 
Collecting bleach==1.5.0
  Downloading https://files.pythonhosted.org/packages/33/70/86c5fec937ea4964184d4d6c4f0b9551564f821e1c3575907639036d9b90/bleach-1.5.0-py2.py3-none-any.whl
Building wheels for collected 

2.3.0


In [None]:
import tensorflow
print(tensorflow.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.8.0


In [None]:
import tensorflow as tf


def gated_linear_layer(inputs, gates, name=None):
    '''Gated linear unit: multiply `inputs` element-wise by sigmoid(`gates`).

    `name` is given to the multiplication op.
    '''
    gate_values = tf.sigmoid(gates)
    return tf.multiply(x=inputs, y=gate_values, name=name)


def instance_norm_layer(inputs, epsilon=1e-05, activation_fn=None, name=None):
    '''Apply tf.contrib instance normalisation with both centering and scaling enabled.

    `name` is used as the variable scope of the normalisation layer.
    '''
    return tf.contrib.layers.instance_norm(
        inputs=inputs,
        center=True,
        scale=True,
        epsilon=epsilon,
        activation_fn=activation_fn,
        scope=name,
    )


def conv1d_layer(inputs, filters, kernel_size, strides=1, padding='same', activation=None, kernel_initializer=None, name=None):
    '''Thin wrapper around tf.layers.conv1d that forwards every argument unchanged.'''
    conv_options = dict(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
        activation=activation,
        kernel_initializer=kernel_initializer,
        name=name,
    )
    return tf.layers.conv1d(inputs=inputs, **conv_options)


def conv2d_layer(inputs, filters, kernel_size, strides, padding: list = None, activation=None, kernel_initializer=None, name=None):
    '''2-D convolution with explicit symmetric zero padding.

    `inputs` is padded by padding[0] on both sides of axis 1 and by padding[1]
    on both sides of axis 2 (axes 0 and 3 are left alone); a 'valid'
    convolution is then applied to the padded tensor.

    Despite their None defaults, both `padding` and `name` must be supplied:
    `padding` is indexed and `name` is concatenated with 'conv2d_pad'.
    '''
    pad_rows = padding[0]
    pad_cols = padding[1]
    pad_spec = tf.constant([[0, 0], [pad_rows, pad_rows], [pad_cols, pad_cols], [0, 0]])
    padded = tf.pad(inputs, pad_spec, name=name + 'conv2d_pad')

    return tf.layers.conv2d(
        inputs=padded,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding='valid',
        activation=activation,
        kernel_initializer=kernel_initializer,
        name=name,
    )


def residual1d_block(inputs, filters=1024, kernel_size=3, strides=1, name_prefix='residule_block_'):
    '''1-D residual block: gated conv (filters channels) -> conv (filters // 2 channels) -> add to `inputs`.

    Every convolution is followed by instance normalisation. All layer names
    are `name_prefix` plus a fixed suffix.
    '''

    def conv(source, out_channels, suffix):
        return conv1d_layer(inputs=source, filters=out_channels, kernel_size=kernel_size,
                            strides=strides, activation=None, name=name_prefix + suffix)

    def norm(source, suffix):
        return instance_norm_layer(inputs=source, activation_fn=None, name=name_prefix + suffix)

    # Content and gate branches both read the block input.
    content = norm(conv(inputs, filters, 'h1_conv'), 'h1_norm')
    gate = norm(conv(inputs, filters, 'h1_gates'), 'h1_norm_gates')
    gated = gated_linear_layer(inputs=content, gates=gate, name=name_prefix + 'h1_glu')

    projected = norm(conv(gated, filters // 2, 'h2_conv'), 'h2_norm')

    # Skip connection.
    return inputs + projected


def downsample1d_block(inputs, filters, kernel_size, strides, name_prefix='downsample1d_block_'):
    '''1-D gated convolution block: two parallel conv + instance-norm branches combined by a GLU.'''
    conv_args = dict(inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides, activation=None)

    content = conv1d_layer(name=name_prefix + 'h1_conv', **conv_args)
    content = instance_norm_layer(inputs=content, activation_fn=None, name=name_prefix + 'h1_norm')

    gate = conv1d_layer(name=name_prefix + 'h1_gates', **conv_args)
    gate = instance_norm_layer(inputs=gate, activation_fn=None, name=name_prefix + 'h1_norm_gates')

    return gated_linear_layer(inputs=content, gates=gate, name=name_prefix + 'h1_glu')


def downsample2d_block(inputs, filters, kernel_size, strides, padding: list = None, name_prefix='downsample2d_block_'):
    '''2-D gated convolution block: two parallel conv2d_layer + instance-norm branches combined by a GLU.

    `padding` is forwarded to conv2d_layer, which requires it to be a
    two-element sequence.
    '''
    conv_args = dict(
        inputs=inputs, filters=filters, kernel_size=kernel_size,
        strides=strides, padding=padding, activation=None)

    content = conv2d_layer(name=name_prefix + 'h1_conv', **conv_args)
    content = instance_norm_layer(inputs=content, activation_fn=None, name=name_prefix + 'h1_norm')

    gate = conv2d_layer(name=name_prefix + 'h1_gates', **conv_args)
    gate = instance_norm_layer(inputs=gate, activation_fn=None, name=name_prefix + 'h1_norm_gates')

    return gated_linear_layer(inputs=content, gates=gate, name=name_prefix + 'h1_glu')


def upsample1d_block(inputs, filters, kernel_size, strides, shuffle_size=2, name_prefix='upsample1d_block_'):
    '''1-D gated upsampling block: conv -> pixel shuffle -> instance norm on two branches, combined by a GLU.'''

    def branch(conv_suffix, shuffle_suffix, norm_suffix):
        convolved = conv1d_layer(inputs=inputs, filters=filters, kernel_size=kernel_size,
                                 strides=strides, activation=None, name=name_prefix + conv_suffix)
        shuffled = pixel_shuffler(inputs=convolved, shuffle_size=shuffle_size, name=name_prefix + shuffle_suffix)
        return instance_norm_layer(inputs=shuffled, activation_fn=None, name=name_prefix + norm_suffix)

    content = branch('h1_conv', 'h1_shuffle', 'h1_norm')
    gate = branch('h1_gates', 'h1_shuffle_gates', 'h1_norm_gates')

    return gated_linear_layer(inputs=content, gates=gate, name=name_prefix + 'h1_glu')


def upsample2d_block(inputs, filters, kernel_size, strides, name_prefix='upsample2d_block_'):
    '''2-D gated upsampling block.

    Two independent transposed convolutions ('same' padding) are applied to
    `inputs`; both results are instance-normalised and then combined by
    gated_linear_layer (first branch * sigmoid(second branch)).

    Only the two instance-norm scopes are named (name_prefix + 'instance1' /
    'instance2'); the transposed convolutions are created without a name.

    NOTE(review): the transposed convolutions are fresh tf.keras layers on
    every call. When the enclosing variable scope is entered with reuse=True
    they may create new weights instead of sharing the existing ones - confirm
    before relying on weight sharing across calls.
    '''

    # t1=tf.layers.Conv2DTranspose(filters,kernel_size,strides, padding='same',name=name_prefix+'conv1')(inputs)
    # t1 = tf.layers.batch_normalization()

    # Content branch.
    t1 = tf.keras.layers.Conv2DTranspose(filters, kernel_size, strides, padding='same')(inputs)
    # t2 = tf.keras.layers.BatchNormalization()(t1)
    t2 = tf.contrib.layers.instance_norm(t1, scope=name_prefix + 'instance1')

    # Gate branch.
    x1_gates = tf.keras.layers.Conv2DTranspose(filters, kernel_size, strides, padding='same')(inputs)

    # x1_norm_gates = tf.keras.layers.BatchNormalization()(x1_gates)
    x1_norm_gates = tf.contrib.layers.instance_norm(x1_gates, scope=name_prefix + 'instance2')
    x1_glu = gated_linear_layer(t2, x1_norm_gates)

    return x1_glu


def pixel_shuffler(inputs, shuffle_size=2, name=None):
    '''Trade channels for width: reshape [n, w, c] to [n, w * shuffle_size, c // shuffle_size].

    The batch size and width are read dynamically; the channel count is taken
    from the static shape. `name` is given to the reshape op.
    '''
    batch = tf.shape(inputs)[0]
    width = tf.shape(inputs)[1]
    channels = inputs.get_shape().as_list()[2]

    target_shape = [batch, width * shuffle_size, channels // shuffle_size]
    return tf.reshape(tensor=inputs, shape=target_shape, name=name)


def generator_gatedcnn(inputs, speaker_id=None, reuse=False, scope_name='generator_gatedcnn'):
    '''Build the generator: a gated-CNN encoder/decoder conditioned on a speaker code.

    inputs: feature tensor laid out as [batchsize, h, w, c].
    speaker_id: speaker code of shape [batchsize, one_hot_vector], e.g. [0,1,0,0].
        NOTE(review): it is passed to tf.convert_to_tensor, so the None
        default is presumably not usable - confirm.
    reuse: when True, the variables of an existing `scope_name` scope are reused.
    scope_name: variable scope wrapping everything created here.

    Returns the output of the final transposed convolution (one filter). In
    the captured run of this notebook the logged shapes go from
    d1 [None, 36, 512, 32] down to a height-1 bottleneck and back up to
    u5 [None, 36, 512, 1].

    The intermediate shapes are printed every time the graph is built.
    '''
    #input shape [batchsize, h, w, c]
    #speaker_id [batchsize, one_hot_vector]
    #one_hot_vector: [0,1,0,0]
    with tf.variable_scope(scope_name) as scope:
        if reuse:
            scope.reuse_variables()
        else:
            assert scope.reuse is False

        # Encoder: five gated 2-D convolution blocks.
        d1 = downsample2d_block(inputs, filters=32, kernel_size=[3, 9], strides=[1, 1], padding=[1, 4], name_prefix='down_1')
        print(f'd1: {d1.shape.as_list()}')

        d2 = downsample2d_block(d1, filters=64, kernel_size=[4, 8], strides=[2, 2], padding=[1, 3], name_prefix='down_2')
        print(f'd2: {d2.shape.as_list()}')

        d3 = downsample2d_block(d2, filters=128, kernel_size=[4, 8], strides=[2, 2], padding=[1, 3], name_prefix='down_3')
        print(f'd3: {d3.shape.as_list()}')

        d4 = downsample2d_block(d3, filters=64, kernel_size=[3, 5], strides=[1, 1], padding=[1, 2], name_prefix='down_4')
        print(f'd4: {d4.shape.as_list()}')
        # Bottleneck block (its shape is not printed).
        d5 = downsample2d_block(d4, filters=5, kernel_size=[9, 5], strides=[9, 1], padding=[1, 2], name_prefix='down_5')

        # Decoder. Before every upsampling step the speaker code is reshaped
        # to [batch, 1, 1, n_speakers], tiled over the spatial size of the
        # current feature map and concatenated on the channel axis.
        speaker_id = tf.convert_to_tensor(speaker_id, dtype=tf.float32)
        c_cast = tf.cast(tf.reshape(speaker_id, [-1, 1, 1, speaker_id.shape.dims[-1].value]), tf.float32)
        c = tf.tile(c_cast, [1, d5.shape.dims[1].value, d5.shape.dims[2].value, 1])
        print(c.shape.as_list())
        concated = tf.concat([d5, c], axis=-1)
        # print(concated.shape.as_list())

        u1 = upsample2d_block(concated, 64, kernel_size=[9, 5], strides=[9, 1], name_prefix='gen_up_u1')
        print(f'u1.shape :{u1.shape.as_list()}')

        c1 = tf.tile(c_cast, [1, u1.shape.dims[1].value, u1.shape.dims[2].value, 1])
        print(f'c1 shape: {c1.shape}')
        u1_concat = tf.concat([u1, c1], axis=-1)
        print(f'u1_concat.shape :{u1_concat.shape.as_list()}')

        u2 = upsample2d_block(u1_concat, 128, [3, 5], [1, 1], name_prefix='gen_up_u2')
        print(f'u2.shape :{u2.shape.as_list()}')
        # From here on the tile multiples are taken from the static shape
        # entries directly rather than via .dims[i].value as above.
        c2 = tf.tile(c_cast, [1, u2.shape[1], u2.shape[2], 1])
        u2_concat = tf.concat([u2, c2], axis=-1)

        u3 = upsample2d_block(u2_concat, 64, [4, 8], [2, 2], name_prefix='gen_up_u3')
        print(f'u3.shape :{u3.shape.as_list()}')
        c3 = tf.tile(c_cast, [1, u3.shape[1], u3.shape[2], 1])
        u3_concat = tf.concat([u3, c3], axis=-1)

        u4 = upsample2d_block(u3_concat, 32, [4, 8], [2, 2], name_prefix='gen_up_u4')
        print(f'u4.shape :{u4.shape.as_list()}')
        c4 = tf.tile(c_cast, [1, u4.shape[1], u4.shape[2], 1])
        u4_concat = tf.concat([u4, c4], axis=-1)
        print(f'u4_concat.shape :{u4_concat.shape.as_list()}')

        # Final projection to a single channel; no normalisation or gating.
        u5 = tf.layers.Conv2DTranspose(filters=1, kernel_size=[3, 9], strides=[1, 1], padding='same', name='generator_last_deconv')(u4_concat)
        print(f'u5.shape :{u5.shape.as_list()}')

        return u5


def discriminator(inputs, speaker_id, reuse=False, scope_name='discriminator'):
    '''Build the speaker-conditioned discriminator.

    inputs: tensor of shape [batch_size, height, width, channels].
    speaker_id: speaker code whose last dimension is the code length; it is
        reshaped to [batch, 1, 1, code_length], cast to float32, tiled over
        the spatial size of each feature map and concatenated on the channel
        axis before every convolution block.
    reuse: when True, the variables of an existing `scope_name` scope are reused.
    scope_name: variable scope wrapping everything created here.

    Returns the mean of the last convolution's output, with reduced
    dimensions kept.
    '''

    # inputs has shape [batch_size, height,width, channels]

    with tf.variable_scope(scope_name) as scope:
        # Discriminator would be reused in CycleGAN
        if reuse:
            scope.reuse_variables()
        else:
            assert scope.reuse is False
        #convert data type to float32
        c_cast = tf.cast(tf.reshape(speaker_id, [-1, 1, 1, speaker_id.shape[-1]]), tf.float32)
        c = tf.tile(c_cast, [1, inputs.shape[1], inputs.shape[2], 1])

        concated = tf.concat([inputs, c], axis=-1)

        # Downsample: four gated blocks; blocks 2-4 use stride 2 along axis 2 only.
        d1 = downsample2d_block(
            inputs=concated, filters=32, kernel_size=[3, 9], strides=[1, 1], padding=[1, 4], name_prefix='downsample2d_dis_block1_')
        c1 = tf.tile(c_cast, [1, d1.shape[1], d1.shape[2], 1])
        d1_concat = tf.concat([d1, c1], axis=-1)

        d2 = downsample2d_block(
            inputs=d1_concat, filters=32, kernel_size=[3, 8], strides=[1, 2], padding=[1, 3], name_prefix='downsample2d_dis_block2_')
        c2 = tf.tile(c_cast, [1, d2.shape[1], d2.shape[2], 1])
        d2_concat = tf.concat([d2, c2], axis=-1)

        d3 = downsample2d_block(
            inputs=d2_concat, filters=32, kernel_size=[3, 8], strides=[1, 2], padding=[1, 3], name_prefix='downsample2d_dis_block3_')
        c3 = tf.tile(c_cast, [1, d3.shape[1], d3.shape[2], 1])
        d3_concat = tf.concat([d3, c3], axis=-1)

        d4 = downsample2d_block(
            inputs=d3_concat, filters=32, kernel_size=[3, 6], strides=[1, 2], padding=[1, 2], name_prefix='downsample2d_diss_block4_')
        c4 = tf.tile(c_cast, [1, d4.shape[1], d4.shape[2], 1])
        d4_concat = tf.concat([d4, c4], axis=-1)

        # Final single-filter convolution. Note that `c1` is rebound here: it
        # no longer refers to the tiled speaker code created above.
        c1 = conv2d_layer(d4_concat, filters=1, kernel_size=[36, 5], strides=[36, 1], padding=[0, 1], name='discriminator-last-conv')

        # NOTE(review): no axis is given, so the mean is taken over every
        # dimension, the batch dimension included - the result is one score
        # for the whole batch, not one per example. Confirm this is intended
        # before using a batch size other than 1.
        c1_red = tf.reduce_mean(c1, keepdims=True)

        return c1_red


def domain_classifier(inputs, reuse=False, scope_name='classifier'):
    '''Build the domain (speaker) classifier.

    Only the first 8 rows along axis 1 of `inputs` are used. They go through
    five conv + max-pool stages (output channels 8, 16, 32, 16, 4) followed by
    global average pooling.

    inputs: 4-D tensor; the existing comment gives the slice shape as
        [batchsize, 8, 512, 1].
    reuse: when True, the variables of an existing `scope_name` scope are reused.
    scope_name: variable scope; it is also used as the prefix of every layer name.

    Returns the pooled output reshaped to [-1, 1, 1, channels].

    The intermediate shapes are printed every time the graph is built.
    '''

    with tf.variable_scope(scope_name) as scope:
        if reuse:
            scope.reuse_variables()
        else:
            assert scope.reuse is False

        #   add slice input shape [batchsize, 8, 512, 1]
        #get one slice
        one_slice = inputs[:, 0:8, :, :]

        # Stages 1-3: 2x2 pooling halves both spatial axes.
        d1 = tf.layers.conv2d(one_slice, 8, kernel_size=[4, 4], padding='same', name=scope_name + '_conv2d01')
        d1_p = tf.layers.max_pooling2d(d1, [2, 2], strides=[2, 2], name=scope_name + 'p1')
        print(f'domain_classifier_d1: {d1.shape}')
        print(f'domain_classifier_d1_p: {d1_p.shape}')

        d2 = tf.layers.conv2d(d1_p, 16, [4, 4], padding='same', name=scope_name + '_conv2d02')
        d2_p = tf.layers.max_pooling2d(d2, [2, 2], strides=[2, 2], name=scope_name + 'p2')
        print(f'domain_classifier_d12: {d2.shape}')
        print(f'domain_classifier_d2_p: {d2_p.shape}')

        d3 = tf.layers.conv2d(d2_p, 32, [4, 4], padding='same', name=scope_name + '_conv2d03')
        d3_p = tf.layers.max_pooling2d(d3, [2, 2], strides=[2, 2], name=scope_name + 'p3')
        print(f'domain_classifier_d3: {d3.shape}')
        print(f'domain_classifier_d3_p: {d3_p.shape}')

        # Stages 4-5: 1x2 pooling halves axis 2 only.
        d4 = tf.layers.conv2d(d3_p, 16, [3, 4], padding='same', name=scope_name + '_conv2d04')
        d4_p = tf.layers.max_pooling2d(d4, [1, 2], strides=[1, 2], name=scope_name + 'p4')
        print(f'domain_classifier_d4: {d4.shape}')
        print(f'domain_classifier_d4_p: {d4_p.shape}')

        d5 = tf.layers.conv2d(d4_p, 4, [1, 4], padding='same', name=scope_name + '_conv2d05')
        d5_p = tf.layers.max_pooling2d(d5, [1, 2], strides=[1, 2], name=scope_name + 'p5')
        print(f'domain_classifier_d5: {d5.shape}')
        print(f'domain_classifier_d5_p: {d5_p.shape}')

        # Collapse the spatial axes, leaving one value per channel.
        p = tf.keras.layers.GlobalAveragePooling2D()(d5_p)

        o_r = tf.reshape(p, [-1, 1, 1, p.shape.dims[1].value])
        print(f'classifier_output: {o_r.shape}')

        return o_r

In [None]:


import os
import numpy as np
import argparse
import time
import librosa
import glob
from preprocess import *
from model import *
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from utility import *


def get_files_labels(pattern: str):
    '''Return the files matching `pattern` and their base names without extension.

    pattern: glob pattern, e.g. './data/processed/*.npy'.

    Returns (files, names): `files` is the list from glob.glob and `names`
    holds, in the same order, each file name with its directory and final
    extension removed ('./data/processed/SF2-100008_11.npy' -> 'SF2-100008_11').
    '''
    files = glob.glob(pattern)
    # os.path.basename also copes with matches that have no directory part
    # (e.g. pattern '*.npy'), where splitting on os.sep and taking element 1
    # raised IndexError.
    names = [
        os.path.basename(os.path.normpath(f)).rsplit('.', maxsplit=1)[0]
        for f in files
    ]

    return files, names


def train(processed_dir: str, test_wav_dir: str):
    '''Train the StarGAN-VC model; after every epoch convert a few test wavs and save a checkpoint.

    processed_dir: directory with the preprocessed '<speaker>-<name>_<start>.npy'
        segments written by the preprocessing step.
    test_wav_dir: directory with one sub-directory of raw .wav files per speaker.

    Outputs go to 'out/<epoch>_<start time>/' ('wavs' for converted audio,
    'model' for the checkpoint).

    After 100000 iterations the three learning rates decay linearly. Once the
    generator or discriminator rate reaches 0, training stops: the current
    epoch is still tested and saved, then the function returns.
    '''
    timestr = time.strftime("%Y-%m-%d-%H-%M", time.localtime())  #like '2018-10-10-14-47'

    all_speaker = get_speakers()
    label_enc = LabelEncoder()
    label_enc.fit(all_speaker)

    lambda_cycle = 10
    lambda_identity = 5
    lambda_classifier = 3

    generator_learning_rate = 0.0001
    generator_learning_rate_decay = generator_learning_rate / 20000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 20000
    domain_classifier_learning_rate = 0.0001
    domain_classifier_learning_rate_decay = domain_classifier_learning_rate / 20000
    #====================load data================#
    print('Loading Data...')

    files, names = get_files_labels(os.path.join(processed_dir, '*.npy'))
    assert len(files) > 0

    normlizer = Normalizer()

    # For every speaker, the segment files that do NOT belong to that speaker
    # (eg. SF1:[TM1**.npy,TM2**.npy,SF2**.npy ... ]); conversion targets are
    # drawn from these. `files` already is the glob of processed_dir/*.npy, so
    # it is reused instead of globbing again for every speaker.
    exclude_dict = {s: [fn for fn in files if fn.find(s) == -1] for s in all_speaker}

    print('Loading Data Done.')

    def one_hot(speaker):
        # One-hot vector over all_speaker for the given speaker name.
        vec = np.zeros([len(all_speaker), ])
        vec[label_enc.transform([speaker])[0]] = 1
        return vec

    def load_segment(path, speaker):
        # Load one [36,512] segment, normalise it with the speaker's
        # statistics and add a trailing channel axis -> [36,512,1].
        seg = normlizer.forward_process(np.load(path), speaker)
        return np.reshape(seg, [seg.shape[0], seg.shape[1], 1])

    #====================create model=============#
    BATCHSIZE = 1
    model = StarGANVC(num_features=FEATURE_DIM, frames=FRAMES, batchsize=BATCHSIZE)
    #====================start train==============#
    EPOCH = 200
    num_samples = len(files)
    print(num_samples)
    print("training started..................................................................")
    for epoch in range(1, EPOCH+1, 1):
        start_time_epoch = time.time()
        stop_training = False

        files_shuffled, names_shuffled = shuffle(files, names)

        for i in range(num_samples // BATCHSIZE):

            num_iterations = num_samples // BATCHSIZE * (epoch-1) + i

            if num_iterations > 100000:
                domain_classifier_learning_rate = max(0, domain_classifier_learning_rate - domain_classifier_learning_rate_decay)
                generator_learning_rate = max(0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(0, discriminator_learning_rate - discriminator_learning_rate_decay)

            if discriminator_learning_rate == 0 or generator_learning_rate == 0:
                print('Early stop training.')
                # Remember the decision so the epoch loop ends too; otherwise
                # every remaining epoch would still run the test and save
                # steps without training anything.
                stop_training = True
                break

            start = i * BATCHSIZE
            end = min((i + 1) * BATCHSIZE, num_samples)

            X, X_t, y, y_t = [], [], [], []

            batchfiles = files_shuffled[start:end]
            batchnames = names_shuffled[start:end]

            # For every source segment pick a random segment of another speaker.
            pre_targets = []
            for name in batchnames:
                name = name.split(sep='-')[0]  #SF1
                pre_targets.append(np.random.choice(exclude_dict[name], 1)[0])

            #one batch train data
            for one_filename, one_name, one_target in zip(batchfiles, batchnames, pre_targets):

                #target name, from a path like './data/processed/SF2-100008_11.npy'
                t = os.path.normpath(one_target).rsplit(os.sep, maxsplit=1)[1]
                target_speaker_name = t.rsplit('.', maxsplit=1)[0].split('-')[0]

                #source name
                speaker_name = one_name.split('-')[0]  #SF1

                X.append(load_segment(one_filename, speaker_name))
                y.append(one_hot(speaker_name))

                X_t.append(load_segment(one_target, target_speaker_name))
                y_t.append(one_hot(target_speaker_name))

            generator_loss, discriminator_loss, domain_classifier_loss = model.train(
                input_source=X, input_target=X_t, source_label=y,
                target_label=y_t, generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate,
                classifier_learning_rate=domain_classifier_learning_rate,
                lambda_identity=lambda_identity, lambda_cycle=lambda_cycle,
                lambda_classifier=lambda_classifier
            )

            if num_iterations % 10 == 0:
                print('Iteration: {:07d},Generator Loss : {:.3f}, Discriminator Loss : {:.3f}, domain_classifier_loss: {:.3f}'\
                .format(num_iterations, generator_loss, discriminator_loss, domain_classifier_loss))

        #=======================test model==========================

        file_path = os.path.join('out/', f'{epoch}_{timestr}')
        if epoch % 1 == 0:
            print('============test model============')
            #out put path
            os.makedirs(file_path, exist_ok=True)

            # Up to two test wavs per speaker, like './data/fourspeakers_test/SF1/200006.wav'.
            # Slicing (rather than indexing [0] and [1]) tolerates speakers
            # that have fewer than two test files.
            tempfiles = []
            for one_speaker in all_speaker:
                p = os.path.join(test_wav_dir, f'{one_speaker}/*.wav')
                tempfiles.extend(glob.glob(p)[:2])

            for one_file in tempfiles:
                _, speaker, name = os.path.normpath(one_file).rsplit(os.sep, maxsplit=2)
                wav_, fs = librosa.load(one_file, sr=SAMPLE_RATE, mono=True, dtype=np.float64)
                wav, pad_length = pad_wav_to_get_fixed_frames(wav_, frames=FRAMES)

                f0, timeaxis = pyworld.harvest(wav, fs)
                sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=FFTSIZE)
                ap = pyworld.d4c(wav, f0, timeaxis, fs, fft_size=FFTSIZE)
                coded_sp = pyworld.code_spectral_envelope(sp, fs, FEATURE_DIM)

                #one audio file to multiple slices(that's one_test_sample),every slice is an input
                one_test_sample = []
                csp_transpose = coded_sp.T  #36x512 36x128...
                for seg_start in range(0, csp_transpose.shape[1] - FRAMES + 1, FRAMES):
                    t = csp_transpose[:, seg_start:seg_start + FRAMES]
                    t = normlizer.forward_process(t, speaker)
                    t = np.reshape(t, [t.shape[0], t.shape[1], 1])
                    one_test_sample.append(t)

                # Conversion target: the speaker two positions further on in
                # the label encoding (wrapping around).
                temp_index = label_enc.transform([speaker])[0]
                temp_index = (temp_index + 2) % len(all_speaker)
                one_test_sample_label = np.zeros([len(one_test_sample), len(all_speaker)])
                one_test_sample_label[:, temp_index] = 1

                #get conversion target name ,like SF1
                target_name = label_enc.inverse_transform([temp_index])[0]

                generated_results = model.test(one_test_sample, one_test_sample_label)

                # De-normalise every generated slice, decode it back to a full
                # spectral envelope and join the slices along the time axis.
                decoded_slices = []
                for one in generated_results:
                    t = np.reshape(one, [one.shape[0], one.shape[1]])
                    t = normlizer.backward_process(t, target_name)
                    t = np.ascontiguousarray(t.T, dtype=np.float64)
                    decoded_slices.append(pyworld.decode_spectral_envelope(t, SAMPLE_RATE, fft_size=FFTSIZE))

                concated = np.concatenate(decoded_slices, axis=0)

                #f0 convert
                f0 = normlizer.pitch_conversion(f0, speaker, target_name)
                synwav = pyworld.synthesize(f0, concated, ap, fs)
                # Remove the padded tail. With pad_length == 0 the slice
                # [:-0] would be empty and discard the whole signal.
                if pad_length > 0:
                    synwav = synwav[:-pad_length]

                #save synthesized wav to file
                wavname = f'{speaker}-{target_name}+{name}'
                wavpath = os.path.join(file_path, 'wavs')
                os.makedirs(wavpath, exist_ok=True)
                # NOTE(review): librosa.output was removed in librosa 0.8, so
                # this call needs an older librosa release - confirm the
                # installed version.
                librosa.output.write_wav(f'{wavpath}/{wavname}', synwav, sr=fs)
                print(f'[save]:{wavpath}/{wavname}')

            print('============test finished!============')

        if epoch % 1 == 0:
            print('============save model============')
            model_path = os.path.join(file_path, 'model')
            os.makedirs(model_path, exist_ok=True)
            print(f'[save]: {model_path}')
            model.save(directory=model_path, filename=MODEL_NAME)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for Epoch %d: %02d:%02d:%02d' % (epoch, time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
                                                               (time_elapsed_epoch % 60 // 1)))

        if stop_training:
            break


if __name__ == '__main__':
    # Defaults used when the script is started without arguments.
    processed_dir = './data/processed'
    test_wav_dir = './data/fourspeakers_test'

    parser = argparse.ArgumentParser(description='Train StarGAN Voice conversion model.')
    parser.add_argument(
        '--processed_dir',
        type=str,
        help='train dataset directory that contains processed npy and npz files',
        default=processed_dir,
    )
    parser.add_argument(
        '--test_wav_dir',
        type=str,
        help='test directory that contains raw audios',
        default=test_wav_dir,
    )
    # Accepted and ignored, so that the "-f" argument a notebook kernel is
    # started with does not make parsing fail.
    parser.add_argument('-f')
    argv = parser.parse_args()

    processed_dir, test_wav_dir = argv.processed_dir, argv.test_wav_dir

    # Time the whole training run.
    start_time = time.time()
    train(processed_dir, test_wav_dir)
    end_time = time.time()

    time_elapsed = end_time - start_time
    hours = time_elapsed // 3600
    minutes = time_elapsed % 3600 // 60
    seconds = time_elapsed % 60 // 1
    print('Training Time: %02d:%02d:%02d' % (hours, minutes, seconds))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Loading Data...
found stat file: ./etc/TM2-stats.npz
found stat file: ./etc/TM1-stats.npz
found stat file: ./etc/SF2-stats.npz
found stat file: ./etc/SF1-stats.npz
Loading Data Done.
d1: [None, 36, 512, 32]
d2: [None, 18, 256, 64]
d3: [None, 9, 128, 128]
d4: [None, 9, 128, 64]
[None, 1, 128, 4]
u1.shape :[None, 9, 128, 64]
c1 shape: (?, 9, 128, 4)
u1_concat.shape :[None, 9, 128, 68]
u2.shape :[None, 9, 128, 128]
u3.shape :[None, 18, 256, 64]
u4.shape :[None, 36, 512, 32]
u4_concat.shape :[None, 36, 512, 36]
u5.shape :[None, 36, 512, 1]
d1: [None, 36, 512, 32]
d2: [None, 18, 256, 64]
d3: [None, 9, 128, 128]
d4: [None, 9, 128, 64]
[None, 1, 128, 4]
u1.shape :[None, 9, 128, 64]
c1 shape: (?, 9, 128, 4)
u1_concat.shape :[None, 9, 128, 68]
u2.shape :[None, 9, 128, 128]
u3.shape :[None, 18, 256, 64]
u4.shape :[None, 36, 512, 32]
u4_concat.shape :[None, 36, 512, 36]
u5.shape :[None, 36, 512, 1]
domain_classifier_d1: (?, 8, 512, 8)
domain_classifier_d1_p: (?, 4, 256, 8)
domain_classifier_d12: 

In [None]:


import argparse
import os
import numpy as np

from model import StarGANVC
from preprocess import *
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from utility import *

# Speaker names of the training set (as returned by get_speakers) and a label
# encoder fitted on them. conversion() uses len(all_speaker) as the one-hot
# width and label_enc.transform() to find the target speaker's column.
all_speaker = get_speakers(trainset='./data/fourspeakers')
label_enc = LabelEncoder().fit(all_speaker)


def conversion(model_dir, test_dir, output_dir, source, target):
    """Convert every ``*.wav`` file of the ``source`` speaker to the ``target`` voice.

    For each file matched by ``<test_dir>/<source>/*.wav`` the audio is loaded,
    padded, analysed with WORLD (f0, spectral envelope, aperiodicity), the
    coded spectral envelope is cut into ``FRAMES``-wide slices, every slice is
    run through the model together with the target speaker's one-hot label,
    and the decoded result is synthesised and written to
    ``<output_dir>/wavs/<speaker>-<target>+<file name>``.

    Args:
        model_dir: Directory containing the saved model named ``MODEL_NAME``.
        test_dir: Directory with one sub-directory of wav files per speaker.
        output_dir: Directory under which the ``wavs`` folder is created.
        source: Source speaker name (sub-directory of ``test_dir``).
        target: Target speaker name; it is looked up through ``label_enc``.

    Raises:
        FileNotFoundError: If ``model_dir`` or ``test_dir`` does not exist.
    """
    if not os.path.exists(model_dir) or not os.path.exists(test_dir):
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError('model dir or test dir not exist!')

    model = StarGANVC(num_features=FEATURE_DIM, mode='test')
    model.load(filepath=os.path.join(model_dir, MODEL_NAME))

    # e.g. ./data/fourspeakers_test/<source>/*.wav
    tempfiles = glob.glob(os.path.join(test_dir, f'{source}/*.wav'))

    normlizer = Normalizer()

    # librosa.output was removed in librosa 0.8; fall back to scipy's wav
    # writer (scipy is itself a librosa dependency) when it is missing.
    try:
        write_wav = librosa.output.write_wav
    except AttributeError:
        from scipy.io import wavfile

        def write_wav(path, y, sr):
            wavfile.write(path, sr, y)

    wavpath = f'{output_dir}/wavs'

    for one_file in tempfiles:
        # Take speaker (parent directory) and file name from the path. Unlike
        # unpacking a 3-way rsplit, this also works when the normalised path
        # has only two components (e.g. test_dir == '.').
        norm_path = os.path.normpath(one_file)
        name = os.path.basename(norm_path)
        speaker = os.path.basename(os.path.dirname(norm_path))

        wav_, fs = librosa.load(one_file, sr=SAMPLE_RATE, mono=True, dtype=np.float64)
        # pad_length is used below to cut the padding off the synthesised wav.
        wav, pad_length = pad_wav_to_get_fixed_frames(wav_, frames=FRAMES)

        f0, timeaxis = pyworld.harvest(wav, fs, f0_floor=71.0, f0_ceil=500.0)

        # CheapTrick harmonic spectral envelope estimation algorithm.
        sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=FFTSIZE)

        # D4C aperiodicity estimation algorithm.
        ap = pyworld.d4c(wav, f0, timeaxis, fs, fft_size=FFTSIZE)

        # Feature reduction of the spectral envelope to FEATURE_DIM coefficients.
        coded_sp = pyworld.code_spectral_envelope(sp, fs, FEATURE_DIM)

        # One audio file becomes several FRAMES-wide slices; every slice is one
        # model input. Trailing frames that do not fill a slice are dropped.
        csp_transpose = coded_sp.T
        one_test_sample = []
        for i in range(0, csp_transpose.shape[1] - FRAMES + 1, FRAMES):
            t = normlizer.forward_process(csp_transpose[:, i:i + FRAMES], speaker)
            one_test_sample.append(np.reshape(t, [t.shape[0], t.shape[1], 1]))

        # Target label: the same one-hot row for every slice.
        one_test_sample_label = np.zeros([len(one_test_sample), len(all_speaker)])
        temp_index = label_enc.transform([target])[0]
        one_test_sample_label[:, temp_index] = 1

        generated_results = model.test(one_test_sample, one_test_sample_label)

        # De-normalise each generated slice with the target speaker's
        # statistics and decode it back to a full spectral envelope.
        decoded_slices = []
        for one in generated_results:
            t = np.reshape(one, [one.shape[0], one.shape[1]])
            t = normlizer.backward_process(t, target)
            # pyworld is handed a C-contiguous float64 array here.
            one_slice = np.ascontiguousarray(t.T, dtype=np.float64)
            decoded_slices.append(
                pyworld.decode_spectral_envelope(one_slice, SAMPLE_RATE, fft_size=FFTSIZE))

        # Join the slices into one representation of the whole utterance.
        concated = np.concatenate(decoded_slices, axis=0)

        # Convert the pitch contour from the source to the target speaker.
        f0 = normlizer.pitch_conversion(f0, speaker, target)

        synwav = pyworld.synthesize(f0, concated, ap, fs)

        # Remove the padded tail. Guard against pad_length == 0, because
        # synwav[:-0] would otherwise produce an empty waveform.
        if pad_length:
            synwav = synwav[:-pad_length]

        # Save the synthesised wav; makedirs(exist_ok=True) is a no-op when the
        # directory already exists, so no separate existence check is needed.
        wavname = f'{speaker}-{target}+{name}'
        os.makedirs(wavpath, exist_ok=True)
        write_wav(f'{wavpath}/{wavname}', synwav, sr=fs)


if __name__ == '__main__':
    # Command-line / notebook entry point: parse the directories and speaker
    # names (each with a default) and run conversion() once.

    parser = argparse.ArgumentParser(description='Convert voices using pre-trained StarGAN-VC model.')

    model_dir = './out/7_2020-10-29-05-37/model/'
    test_dir = './data/fourspeakers_test/'
    source_speaker = 'SF1'
    target_speaker = 'TM1'
    # Forward slash: '.\converted_voices' contains the invalid escape '\c' and,
    # on POSIX systems, names a directory literally called '.\converted_voices'.
    output_dir = './converted_voices'

    parser.add_argument('--model_dir', type=str, help='Directory for the pre-trained model.', default=model_dir)
    parser.add_argument('--test_dir', type=str, help='Directory for the voices for conversion.', default=test_dir)
    parser.add_argument('--output_dir', type=str, help='Directory for the converted voices.', default=output_dir)
    parser.add_argument('--source_speaker', type=str, help='source_speaker', default=source_speaker)
    parser.add_argument('--target_speaker', type=str, help='target_speaker', default=target_speaker)
    # NOTE(review): presumably swallows the "-f <connection file>" argument a
    # Jupyter/Colab kernel puts on sys.argv so parse_args() does not abort;
    # its value is never read.
    parser.add_argument('-f')
    argv = parser.parse_args()

    model_dir = argv.model_dir
    test_dir = argv.test_dir
    output_dir = argv.output_dir
    source_speaker = argv.source_speaker
    target_speaker = argv.target_speaker

    conversion(
        model_dir=model_dir,
        test_dir=test_dir,
        output_dir=output_dir,
        source=source_speaker,
        target=target_speaker,
    )

d1: [None, 36, 512, 32]
d2: [None, 18, 256, 64]
d3: [None, 9, 128, 128]
d4: [None, 9, 128, 64]
[None, 1, 128, 4]
u1.shape :[None, 9, 128, 64]
c1 shape: (?, 9, 128, 4)
u1_concat.shape :[None, 9, 128, 68]
u2.shape :[None, 9, 128, 128]
u3.shape :[None, 18, 256, 64]
u4.shape :[None, 36, 512, 32]
u4_concat.shape :[None, 36, 512, 36]
u5.shape :[None, 36, 512, 1]
d1: [None, 36, 512, 32]
d2: [None, 18, 256, 64]
d3: [None, 9, 128, 128]
d4: [None, 9, 128, 64]
[None, 1, 128, 4]
u1.shape :[None, 9, 128, 64]
c1 shape: (?, 9, 128, 4)
u1_concat.shape :[None, 9, 128, 68]
u2.shape :[None, 9, 128, 128]
u3.shape :[None, 18, 256, 64]
u4.shape :[None, 36, 512, 32]
u4_concat.shape :[None, 36, 512, 36]
u5.shape :[None, 36, 512, 1]
domain_classifier_d1: (?, 8, 512, 8)
domain_classifier_d1_p: (?, 4, 256, 8)
domain_classifier_d12: (?, 4, 256, 16)
domain_classifier_d2_p: (?, 2, 128, 16)
domain_classifier_d3: (?, 2, 128, 32)
domain_classifier_d3_p: (?, 1, 64, 32)
domain_classifier_d4: (?, 1, 64, 16)
domain_classi