In [1]:
# -*- coding: utf-8 -*-

"""
EVCで変換する.
詳細 : https://pdfs.semanticscholar.org/cbfe/71798ded05fb8bf8674580abf534c4dbb8bc.pdf

Converting by EVC.
Check detail : https://pdfs.semanticscholar.org/cbfe/71798ded05fb8bf8674580abf534c4dbb8bc.pdf
"""

'\nEVCで変換する.\n詳細 : https://pdfs.semanticscholar.org/cbfe/71798ded05fb8bf8674580aabf534c4dbb8bc.pdf\n\nConverting by EVC.\nCheck detail : https://pdfs.semanticscholar.org/cbfe/71798ded05fb8bf8674580abf534c4dbb8bc.pdf\n'

In [2]:
from __future__ import division, print_function

import os
from shutil import rmtree
import argparse
import glob
import pickle
import time

import numpy as np
from numpy.linalg import norm
from sklearn.decomposition import PCA
from sklearn.mixture import GMM # unavailable from sklearn 0.20.0 onward
from sklearn.preprocessing import StandardScaler
import scipy.fftpack
import scipy.interpolate
import scipy.signal
import scipy.sparse

%matplotlib inline
import matplotlib.pyplot as plt
import IPython
from IPython.display import Audio

import soundfile as sf
import wave
import pyworld as pw
import librosa.display

from dtw import dtw
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Parameters
#
#   __Mixtured       : number of GMM mixture components
#   __versions       : experiment set identifier
#   __convert_source : glob pattern of the source-speaker utterances to convert
#   __convert_target : glob pattern of the target-speaker utterances
#   __measure_target : glob pattern of the target-speaker utterances used for measurement
#
# All patterns are relative to __same_path.
__Mixtured = 40
__versions = 'pre-stored0.1.3'
__convert_source = 'input/EJM10/V01/T01/TIMIT/000/*.wav'
__convert_target = 'adaptation/EJF01/V01/T01/ATR503/A/*.wav'
__measure_target = 'adaptation/EJF01/V01/T01/TIMIT/000/*.wav'

# Root directory of the experiment set and the per-speaker output directory.
__same_path = './utterance/{0}/'.format(__versions)
__output_path = '{0}output/EJF01/'.format(__same_path)  # EJF01, EJF07, EJM04, EJM05

Mixtured = __Mixtured

# Pre-stored data for the experiment set.
# (pre_stored_target_list is not defined yet.)
pre_stored_pickle = '{0}{1}.pickle'.format(__same_path, __versions)
pre_stored_source_list = '{0}pre-source/**/V01/T01/**/*.wav'.format(__same_path)
pre_stored_list = '{0}pre/**/V01/T01/**/*.wav'.format(__same_path)
pre_stored_gmm_init_pickle = '{0}{1}_init-gmm.pickle'.format(__same_path, __versions)
pre_stored_sv_npy = '{0}{1}_sv.npy'.format(__same_path, __versions)

# EVGMM parameter files stored under the output directory.
save_for_evgmm_covarXX = '{0}{1}_covarXX.npy'.format(__output_path, __versions)
save_for_evgmm_covarYX = '{0}{1}_covarYX.npy'.format(__output_path, __versions)
save_for_evgmm_fitted_source = '{0}{1}_fitted_source.npy'.format(__output_path, __versions)
save_for_evgmm_fitted_target = '{0}{1}_fitted_target.npy'.format(__output_path, __versions)
save_for_evgmm_weights = '{0}{1}_weights.npy'.format(__output_path, __versions)
save_for_evgmm_source_means = '{0}{1}_source_means.npy'.format(__output_path, __versions)

# Conversion inputs and the filename prefixes of everything the conversion writes.
for_convert_source = '{0}{1}'.format(__same_path, __convert_source)
for_convert_target = '{0}{1}'.format(__same_path, __convert_target)
converted_voice_npy = '{0}sp_converted_{1}'.format(__output_path, __versions)
converted_voice_wav = '{0}sp_converted_{1}'.format(__output_path, __versions)
mfcc_save_fig_png = '{0}mfcc3dim_{1}'.format(__output_path, __versions)
f0_save_fig_png = '{0}f0_converted{1}'.format(__output_path, __versions)
converted_voice_with_f0_wav = '{0}sp_f0_converted{1}'.format(__output_path, __versions)

# Reference recordings and the text file the distortion values are appended to.
for_measure_target = '{0}{1}'.format(__same_path, __measure_target)
mcd_text = '{0}{1}_MCD.txt'.format(__output_path, __versions)


In [4]:
# Small positive constant (1e-8); no cell visible in this notebook references it.
EPSILON = 1e-8

class MFCC:
    """
    Mel-frequency cepstral coefficient (MFCC) analysis and resynthesis.

    Spectra are mapped to MFCCs with a triangular mel filterbank followed by
    a type-II DCT, and MFCCs are mapped back to a spectrum by the inverse DCT
    and spline interpolation.  Dynamic (delta) features are only partially
    implemented.
    ref : http://aidiary.hatenablog.com/entry/20120225/1330179868
    """

    def __init__(self, frequency, nfft=1026, dimension=24, channels=24):
        """
        Store the analysis parameters and build the mel filterbank.

        frequency : sampling frequency
        nfft : number of FFT points
        dimension : number of MFCC coefficients returned by mfcc()
        channels : number of mel filterbank channels (must be >= dimension)

        Derived attributes:
        fscale : frequency axis of the nfft/2 positive-frequency bins
        filterbank, fcenters : filterbank matrix (channels x nfft/2) and the
            centre frequency of each filter
        """
        self.nfft = nfft
        self.frequency = frequency
        self.dimension = dimension
        self.channels = channels
        self.fscale = np.fft.fftfreq(self.nfft, d=1.0 / self.frequency)[: int(self.nfft / 2)]
        self.filterbank, self.fcenters = self.melFilterBank()

    def hz2mel(self, f):
        """
        Convert a frequency to the mel scale.
        """
        return 1127.01048 * np.log(f / 700.0 + 1.0)

    def mel2hz(self, m):
        """
        Convert a mel-scale value back to a frequency.
        """
        return 700.0 * (np.exp(m / 1127.01048) - 1.0)

    def melFilterBank(self):
        """
        Build the triangular mel filterbank.

        Returns (filterbank, fcenters): a (channels, nfft/2) matrix whose rows
        rise linearly from 0 at the previous filter's centre to 1 at their own
        centre and fall back towards 0 at the next centre, and the centre
        frequency of every filter.
        """
        fmax = self.frequency / 2.0
        melmax = self.hz2mel(fmax)
        nmax = int(self.nfft / 2)
        df = self.frequency / float(self.nfft)
        dmel = melmax / (self.channels + 1)
        melcenters = np.arange(1, self.channels + 1) * dmel
        fcenters = self.mel2hz(melcenters)
        # Bin indices of each filter's start, centre and stop.
        indexcenter = np.round(fcenters / df).astype(int)
        indexstart = np.hstack(([0], indexcenter[:-1]))
        indexstop = np.hstack((indexcenter[1:], [nmax]))

        filterbank = np.zeros((self.channels, nmax))
        for c in range(self.channels):
            start, center, stop = indexstart[c], indexcenter[c], indexstop[c]
            # A zero-width slope contributes nothing; skipping it avoids 1/0.
            if center > start:
                increment = 1.0 / (center - start)
                filterbank[c, start:center] = (np.arange(start, center) - start) * increment
            if stop > center:
                decrement = 1.0 / (stop - center)
                filterbank[c, center:stop] = 1.0 - (np.arange(center, stop) - center) * decrement

        return filterbank, fcenters

    def mfcc(self, spectrum):
        """
        Compute MFCCs from a spectrum.

        spectrum : array whose last axis has nfft/2 bins (one row per frame)
        Returns the first `dimension` coefficients of the orthonormal type-II
        DCT of the log10 mel spectrum, taken along the last axis.
        """
        mel_energy = np.dot(spectrum, self.filterbank.T)
        # Floor at the smallest positive float so empty bands give a finite log
        # instead of -inf/NaN propagating through the DCT.
        mspec = np.log10(np.maximum(mel_energy, np.finfo(float).tiny))
        ceps = scipy.fftpack.dct(mspec, type=2, norm="ortho", axis=-1)
        # imfcc() zero-pads from `dimension` back to `channels`, so only the
        # first `dimension` coefficients are kept here.
        return ceps[..., :self.dimension]

    def delta(self, mfcc):
        """
        Compute dynamic (delta) features from an MFCC sequence.

        The delta of frame t is (mfcc[t+1] - mfcc[t-1]) / 2, with the first and
        last frames repeated at the edges.  The result always has the same
        (frames, coefficients) shape as the input.
        """
        mfcc = np.asarray(mfcc)
        padded = np.concatenate([mfcc[:1], mfcc, mfcc[-1:]])
        return (padded[2:] - padded[:-2]) / 2.0

    def imfcc(self, mfcc, spectrogram):
        """
        Reconstruct a spectrum from MFCCs.

        mfcc : (frames, coefficients) array
        spectrogram : array whose shape the result is reshaped to
        Each frame is zero-padded to `channels` coefficients, inverse-DCT'd to
        a log10 mel spectrum, and spline-interpolated from the filter centre
        frequencies onto `fscale`.
        """
        mfcc = np.asarray(mfcc)
        if mfcc.shape[0] == 0:
            return np.array([]).reshape(spectrogram.shape)
        missing = self.channels - mfcc.shape[1]
        if missing < 0:
            raise ValueError("mfcc has more coefficients than filterbank channels")
        padded = np.hstack([mfcc, np.zeros((mfcc.shape[0], missing))])
        mel_log_spectrum = scipy.fftpack.idct(padded, norm='ortho', axis=-1)

        frames = []
        for mspectrum in mel_log_spectrum:
            # splrep fits the interpolating spline, splev evaluates it on fscale.
            tck = scipy.interpolate.splrep(self.fcenters, np.power(10, mspectrum))
            frames.append(scipy.interpolate.splev(self.fscale, tck))

        return np.array(frames, dtype=float).reshape(spectrogram.shape)

    @staticmethod
    def trim_zeros_frames(x, eps=1e-7):
        """
        Remove silent frames.

        x : (frames, dimensions) array
        eps : frames whose summed absolute value is not greater than eps are dropped
        """
        T, D = x.shape  # enforces a 2-D input
        s = np.sum(np.abs(x), axis=1)
        return x[s > eps]

In [5]:
def analyse_by_world_with_harverst(x, fs):
    """
    Analyse a waveform with the WORLD vocoder.

    F0 is estimated with Harvest (frame period 5) and refined with StoneMask;
    the spectral envelope is then obtained with CheapTrick and the
    aperiodicity with D4C, both on the refined F0 track.

    x : waveform samples
    fs : sampling frequency
    Returns (f0, spectral envelope, aperiodicity).
    """
    # Harvest with F0 refinement (using StoneMask)
    coarse_f0, time_axis = pw.harvest(x, fs, frame_period=5)
    refined_f0 = pw.stonemask(x, coarse_f0, time_axis, fs)
    spectral_envelope = pw.cheaptrick(x, refined_f0, time_axis, fs)
    aperiodicity = pw.d4c(x, refined_f0, time_axis, fs)

    return refined_f0, spectral_envelope, aperiodicity

def wavread(file):
    """
    Read the audio track and sampling frequency from a wav file.

    file : path or file object accepted by wave.open
    Returns (x, fs): the samples interpreted as int16 and scaled by 1/32768
    into [-1, 1), and the sampling frequency as a float.
    """
    # The context manager closes the handle even if reading raises.
    with wave.open(file, "rb") as wf:
        fs = wf.getframerate()
        raw_frames = wf.readframes(wf.getnframes())
    x = np.frombuffer(raw_frames, dtype="int16") / 32768.0
    return x, float(fs)

def preEmphasis(signal, p=0.97):
    """
    High-frequency emphasis filter used before MFCC extraction.

    Applies the FIR filter y[n] = x[n] - p * x[n-1] to the waveform, which
    boosts its high-frequency components.
    """
    fir_taps = [1.0, -p]
    return scipy.signal.lfilter(fir_taps, 1, signal)

def alignment(source, target, path):
    """
    Time-align the target frames to the source frames along a warping path.

    source : array whose shape the result takes
    target : 2-D array of target frames
    path : pair (source indices, target indices) describing the warping path
    For every run of equal consecutive source indices only the first pairing
    is kept, so each source frame receives one target frame; the selected
    frames are reshaped to source.shape.
    """
    source_steps = np.asarray(path[0])
    target_steps = np.asarray(path[1], dtype=np.intp)

    # Mark the first path step of each run of identical source indices.
    first_visit = np.ones(source_steps.shape[0], dtype=bool)
    first_visit[1:] = source_steps[1:] != source_steps[:-1]

    picked = np.asarray(target)[target_steps[first_visit]]
    # Keep at least float64 precision regardless of the target's dtype.
    picked = picked.astype(np.result_type(picked.dtype, np.float64), copy=False)
    return picked.reshape(source.shape)

In [6]:
# Load the stored EVGMM parameter arrays, in the order they are named below.
(
    covarXX,
    covarYX,
    fitted_source,
    fitted_target,
    weights,
    source_means,
) = [
    np.load(npy_path)
    for npy_path in (
        save_for_evgmm_covarXX,
        save_for_evgmm_covarYX,
        save_for_evgmm_fitted_source,
        save_for_evgmm_fitted_target,
        save_for_evgmm_weights,
        save_for_evgmm_source_means,
    )
]

In [7]:
# Load the source utterances to convert and the target speaker's utterances,
# analyse each with WORLD and extract its MFCCs.

timer_start = time.time()
source_mfcc_for_convert = []
source_sp_for_convert = []
source_f0_for_convert = []
source_ap_for_convert = []
fs_source = None
for name in sorted(glob.iglob(for_convert_source, recursive=True)):
    print("source = ", name)
    x_source, fs_source = sf.read(name)
    f0_source, sp_source, ap_source = analyse_by_world_with_harverst(x_source, fs_source)
    mfcc_source = MFCC(fs_source)
    source_mfcc_for_convert.append(mfcc_source.mfcc(sp_source))
    source_sp_for_convert.append(sp_source)
    source_f0_for_convert.append(f0_source)
    source_ap_for_convert.append(ap_source)

target_mfcc_for_fit = []
target_f0_for_fit = []
target_ap_for_fit = []
for name in sorted(glob.iglob(for_convert_target, recursive=True)):
    print("target = ", name)
    x_target, fs_target = sf.read(name)
    f0_target, sp_target, ap_target = analyse_by_world_with_harverst(x_target, fs_target)
    mfcc_target = MFCC(fs_target)
    target_mfcc_for_fit.append(mfcc_target.mfcc(sp_target))
    target_f0_for_fit.append(f0_target)
    target_ap_for_fit.append(ap_target)

# Utterances have different frame counts, so np.array() on these lists would
# build a ragged array (an error on recent NumPy) or, when the lengths happen
# to match, a stacked array of a different shape.  Pack them into 1-D object
# arrays with one utterance per element so indexing is the same in every case.
n_source = len(source_mfcc_for_convert)
source_data_mfcc = np.empty(n_source, dtype=object)
source_data_sp = np.empty(n_source, dtype=object)
source_data_f0 = np.empty(n_source, dtype=object)
source_data_ap = np.empty(n_source, dtype=object)
for idx in range(n_source):
    source_data_mfcc[idx] = source_mfcc_for_convert[idx]
    source_data_sp[idx] = source_sp_for_convert[idx]
    source_data_f0[idx] = source_f0_for_convert[idx]
    source_data_ap[idx] = source_ap_for_convert[idx]

n_target = len(target_mfcc_for_fit)
target_mfcc = np.empty(n_target, dtype=object)
target_f0 = np.empty(n_target, dtype=object)
target_ap = np.empty(n_target, dtype=object)
for idx in range(n_target):
    target_mfcc[idx] = target_mfcc_for_fit[idx]
    target_f0[idx] = target_f0_for_fit[idx]
    target_ap[idx] = target_ap_for_fit[idx]

print("Load Input and Target Voice time = ", time.time() - timer_start , "[sec]")

source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A11.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A14.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A17.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A18.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A19.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A20.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A21.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A22.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A23.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A24.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A25.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A26.wav
source =  ./utterance/pre-stored0.1.3/input/EJM10/V01/T01/TIMIT/000/A27.wav
source =  ./

In [8]:
def convert(source, covarXX, fitted_source, fitted_target, covarYX, weights, source_means):
    """
    Convert one source feature frame with the joint-density GMM.

    source : (D,) source feature vector
    covarXX : (M, D, D) source covariance of each mixture
    fitted_source : (M, D) source means used in the conditional mean
    fitted_target : (M, D) target means used in the conditional mean
    covarYX : (M, D, D) target-source cross covariance of each mixture
    weights : (M,) mixture weights
    source_means : (M, D) source means used for the mixture posterior

    Returns a (1, D) array: the posterior-weighted sum over mixtures of
    fitted_target[m] + covarYX[m] . covarXX[m]^-1 . (source - fitted_source[m]).
    The number of mixtures M is taken from `weights`.
    """
    source = np.asarray(source, dtype=float)
    covarXX = np.asarray(covarXX, dtype=float)
    covarYX = np.asarray(covarYX, dtype=float)
    fitted_source = np.asarray(fitted_source, dtype=float)
    fitted_target = np.asarray(fitted_target, dtype=float)
    source_means = np.asarray(source_means, dtype=float)
    weights = np.ravel(np.asarray(weights, dtype=float))
    n_dim = source.shape[0]

    # Conditional mean of the target for every mixture, solved as one batch.
    centred = (source - fitted_source)[:, :, np.newaxis]
    E = fitted_target + np.matmul(covarYX, np.linalg.solve(covarXX, centred))[:, :, 0]

    # Mixture posterior P(m | source) of the full-covariance GMM
    # (weights, source_means, covarXX), evaluated in the log domain.
    deviation = (source - source_means)[:, :, np.newaxis]
    mahalanobis = np.sum(deviation * np.linalg.solve(covarXX, deviation), axis=(1, 2))
    _, log_det = np.linalg.slogdet(covarXX)
    with np.errstate(divide='ignore'):  # a zero weight simply gives zero posterior
        log_joint = np.log(weights) - 0.5 * (n_dim * np.log(2.0 * np.pi) + log_det + mahalanobis)
    # Subtract the maximum before exponentiating to avoid underflow.
    posterior = np.exp(log_joint - np.max(log_joint))
    posterior /= np.sum(posterior)

    return np.dot(posterior[np.newaxis, :], E)

In [9]:
def calc_std_mean(input_f0):
    """
    Standard deviation and mean of log-F0, used for F0 conversion.

    Only strictly positive F0 values are used, because the logarithm is
    undefined for the zero-valued entries.
    Returns (standard deviation, mean) of the natural-log F0.
    """
    voiced_mask = input_f0 > 0
    log_f0 = np.log(input_f0[voiced_mask])
    spread = np.std(log_f0)
    centre = np.mean(log_f0)
    return spread, centre

In [10]:
# Load the reference target recordings used to measure the conversion distance.
source_mfcc_for_measure_target = []
source_sp_for_measure_target = []
source_f0_for_measure_target = []
source_ap_for_measure_target = []
for name in sorted(glob.iglob(for_measure_target, recursive=True)):
    print("measure_target = ", name)
    x_measure_target, fs_measure_target = sf.read(name)
    f0_measure_target, sp_measure_target, ap_measure_target = analyse_by_world_with_harverst(x_measure_target, fs_measure_target)
    mfcc_measure_target = MFCC(fs_measure_target)
    source_mfcc_for_measure_target.append(mfcc_measure_target.mfcc(sp_measure_target))
    source_sp_for_measure_target.append(sp_measure_target)
    source_f0_for_measure_target.append(f0_measure_target)
    source_ap_for_measure_target.append(ap_measure_target)

# Utterances have different frame counts, so np.array() on these lists would
# build a ragged array (an error on recent NumPy) or, when the lengths happen
# to match, a stacked array of a different shape.  Pack them into 1-D object
# arrays with one utterance per element so indexing is the same in every case.
n_measure = len(source_mfcc_for_measure_target)
measure_target_data_mfcc = np.empty(n_measure, dtype=object)
measure_target_data_sp = np.empty(n_measure, dtype=object)
measure_target_data_f0 = np.empty(n_measure, dtype=object)
measure_target_data_ap = np.empty(n_measure, dtype=object)
for idx in range(n_measure):
    measure_target_data_mfcc[idx] = source_mfcc_for_measure_target[idx]
    measure_target_data_sp[idx] = source_sp_for_measure_target[idx]
    measure_target_data_f0[idx] = source_f0_for_measure_target[idx]
    measure_target_data_ap[idx] = source_ap_for_measure_target[idx]

measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A11.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A14.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A17.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A18.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A19.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A20.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A21.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A22.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A23.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A24.wav
measure_target =  ./utterance/pre-stored0.1.3/adaptation/EJF01/V01/T01/TIMIT/000/A25.wav
measure_target =  ./u

In [11]:
def calc_mcd(source, convert, target):
    """
    Measure the cepstral distortion between converted and target features.

    The unconverted source and the target are aligned with DTW (L1 frame
    distance); the target is then warped onto the source time axis and
    compared with the converted features.

    Returns (distortion, aligned target).  The squared error is summed over
    every frame and coefficient before the square root is taken, so the value
    is not a per-frame average.
    """
    def l1_distance(x, y):
        return norm(x - y, ord=1)

    _, _, _, warp_path = dtw(source, target, dist=l1_distance)
    aligned = alignment(source, target, warp_path)

    squared_error = np.sum(np.square(aligned - convert))
    distortion = 10.0 / np.log(10) * np.sqrt(2 * squared_error)
    return distortion, aligned


In [12]:
# Run the conversion: convert the MFCCs of every source utterance, resynthesise
# the waveform (with the original and with the converted F0), log the
# distortion against the reference target and save the diagnostic figures.

timer_start = time.time()

# Log-F0 standard deviation and mean of the target speaker, computed once
# over all target utterances.
temp_f = np.concatenate([np.asarray(f0).flatten() for f0 in target_f0])
target_std, target_mean = calc_std_mean(temp_f)

# The context manager closes the MCD log even if an utterance fails part-way.
with open(mcd_text, 'a') as filer:
    for i in range(len(source_data_mfcc)):
        print("voice no = ", i)
        # convert frame by frame
        source_temp = source_data_mfcc[i]
        output_mfcc = np.array([
            convert(source_temp[frame], covarXX, fitted_source, fitted_target, covarYX, weights, source_means)[0]
            for frame in range(source_temp.shape[0])
        ])

        # synthesis with the source F0
        source_sp_temp = source_data_sp[i]
        source_f0_temp = source_data_f0[i]
        source_ap_temp = source_data_ap[i]
        output_imfcc = mfcc_source.imfcc(output_mfcc, source_sp_temp)
        y_source = pw.synthesize(source_f0_temp, output_imfcc, source_ap_temp, fs_source, 5)
        np.save(converted_voice_npy + "s{0}.npy".format(i), output_imfcc)
        sf.write(converted_voice_wav + "s{0}.wav".format(i), y_source, fs_source)

        # calc MCD
        measure_temp = measure_target_data_mfcc[i]
        mcd, aligned_measure = calc_mcd(source_temp, output_mfcc, measure_temp)
        filer.write("MCD No.{0} = {1} , shape = {2}\n".format(i, mcd, source_temp.shape))

        # save figure: first three MFCC coefficients of source / converted / target
        range_s = output_imfcc.shape[0]
        scale = list(range(range_s))
        for panel, coefficient in ((311, 0), (312, 1), (313, 2)):
            plt.subplot(panel)
            plt.plot(scale, source_temp[:range_s, coefficient], label="source", linewidth = 1.0)
            plt.plot(scale, output_mfcc[:range_s, coefficient], label="convert", linewidth = 1.0)
            plt.plot(scale, aligned_measure[:range_s, coefficient], label="target", linewidth = 1.0, linestyle="dashed")
            if panel == 311:
                plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.)
            elif panel == 312:
                plt.ylabel("amplitude MFCC")
            else:
                # NOTE(review): "Flame" is presumably a typo for "Frame"; label left unchanged.
                plt.xlabel("Flame")

        plt.savefig(mfcc_save_fig_png + "s{0}.png".format(i) , format='png', dpi=300)
        plt.close()

        # synthesis with converted F0
        # calc_std_mean() returns log-domain statistics, so the linear
        # transform is applied to log-F0 and mapped back with exp().
        # Unvoiced frames (F0 == 0) are left at 0.
        source_std, source_mean = calc_std_mean(source_f0_temp)
        std_ratio = target_std / source_std
        voiced = source_f0_temp > 0
        conv_f0 = np.zeros_like(source_f0_temp)
        log_conv_f0 = std_ratio * (np.log(source_f0_temp[voiced]) - source_mean) + target_mean
        conv_f0[voiced] = np.exp(log_conv_f0)
        np.save(converted_voice_npy + "f{0}.npy".format(i), conv_f0)

        y_conv = pw.synthesize(conv_f0, output_imfcc, source_ap_temp, fs_source, 5)
        sf.write(converted_voice_with_f0_wav + "sf{0}.wav".format(i) , y_conv, fs_source)

        # save figure: source and converted F0
        plt.plot(scale, source_f0_temp[:range_s], label="source", linewidth = 1.0)
        plt.plot(scale, conv_f0[:range_s], label="convert", linewidth = 1.0)
        plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
        plt.xlabel("Frame")
        plt.ylabel("Amplitude")

        plt.savefig(f0_save_fig_png + "f{0}.png".format(i), format='png', dpi=300)
        plt.close()

print("Make Converted Spectram time = ", time.time() - timer_start , "[sec]")

voice no =  0
voice no =  1
voice no =  2
voice no =  3
voice no =  4
voice no =  5
voice no =  6
voice no =  7
voice no =  8
voice no =  9
voice no =  10
voice no =  11
voice no =  12
voice no =  13
voice no =  14
voice no =  15
voice no =  16
voice no =  17
voice no =  18
voice no =  19
voice no =  20
voice no =  21
voice no =  22
voice no =  23
voice no =  24
voice no =  25
voice no =  26
voice no =  27
voice no =  28
voice no =  29
voice no =  30
voice no =  31
voice no =  32
voice no =  33
voice no =  34
voice no =  35
voice no =  36
voice no =  37
voice no =  38
voice no =  39
voice no =  40
voice no =  41
voice no =  42
voice no =  43
voice no =  44
Make Converted Spectram time =  612.688894033432 [sec]
