In [1]:
# -*- coding: utf-8 -*-

"""
This program makes learning ev-gmm.
"""

'\nThis program makes learning ev-gmm.\n'

In [2]:
# __future__ module make compatible python2 and python3
from __future__ import division, print_function

# basic modules
import os
import os.path
import time

# for warning ignore
import warnings
#warning.filterwarnings('ignore')

# for file system manupulation 
from shutil import rmtree 
import glob
import argparse

# for save object
import pickle

# for make glaph
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (16, 5)
import librosa.display

# for scientific computing
import numpy as np
from numpy.linalg import norm 
from sklearn.decomposition import PCA
#from sklearn.mixture import GMM # GMM class cannot use after sklearn 0.20.0
import sklearn.mixture
from sklearn.mixture.gaussian_mixture import _compute_precision_cholesky
from sklearn.preprocessing import StandardScaler
import scipy.sparse
from scipy.signal import firwin, lfilter

# for display audio controler
from IPython.display import Audio

# for manuplate audio data
import soundfile as sf
import pyworld as pw
import pysptk
from dtw import dtw
from fastdtw import fastdtw

In [3]:
class WORLD(object):
    """
    WORLD based speech analyzer and synthezer.
    
    Ref : https://github.com/k2kobayashi/sprocket/
    """
    def __init__(self, fs=16000, fftl=1024, shiftms=5.0, minf0=40.0, maxf0=500.0):
        """
        Parameters
        ----------
        fs : int 
            Sampling frequency 
        fftl : int
            FFT length
        shiftms : float
            Shift length [ms]
        minf0 : float
            Floor in F0 estimation
        maxf0 : float
            Seli in F0 estimation
        """
        
        self.fs = fs
        self.fftl = fftl
        self.shiftms = shiftms
        self.minf0 = minf0
        self.maxf0 = maxf0
        
    def analyze(self, x):
        """
        Analyze acoustic featueres.
        
        Parameters
        ----------
        x : array, shape(`T`)
            monoral speech signal in time domain
        
        Returns
        ----------
        f0 : array, shape(`T`)
            F0 sequence
        sp : array, shape(`T`, `fftl / 2 + 1`)
            Spectral envelope sequence
        ap : array, shape(`T`, `fftl / 2 + 1`)
            aperiodicity sequence
        """
        
        f0, time_axis = pw.harvest(x, self.fs, f0_floor=self.minf0,
                                   f0_ceil=self.maxf0, frame_period=self.shiftms)
        sp = pw.cheaptrick(x, f0, time_axis, self.fs, fft_size=self.fftl)
        ap = pw.d4c(x, f0, time_axis, self.fs, fft_size=self.fftl)
        
        assert sp.shape == ap.shape
        
        return f0, sp, ap
    
    def analyze_f0(self, x):
        """
        Analyze f0.
        
        Parameters
        ----------
        x : array, shape(`T`)
            monoral speech signal in time domain
        
        Returns
        ----------
        f0 : array, shape(`T`)
            F0 sequence
        """

        f0, time_axis = pw.harvest(x, self.fs, f0_floor=self.minf0,
                                   f0_ceil=self.maxf0, frame_period=self.shiftms)
        
        assert f0.shape == x.shape()
        
        return f0
    
    def synthesis(self, f0, sp, ap):
        """
        Re-synthesizes a speech waveform from acoustic featueres.
        
        Parameters
        ----------
        f0 : array, shape(`T`)
            F0 sequence
        sp : array, shape(`T`, `fftl / 2 + 1`)
            Spectral envelope sequence
        ap : array, shape(`T`, `fftl / 2 + 1`)
            aperiodicity sequence
        """

        return pw.synthesize(f0, sp, ap, self.fs, frame_period=self.shiftms)

In [4]:
class FeatureExtractor(object):
    """
    Analyze acoustic features from a waveform.
    
    This class may have several types of estimeter like WORLD or STRAIGHT.
    Default type is WORLD.
    
    Ref : https://github.com/k2kobayashi/sprocket/
    """
    
    def __init__(self, analyzer='world', fs=16000, fftl=1024, 
                 shiftms=5.0, minf0=50.0, maxf0=500.0):
        """
        Parameters
        ----------
        analyzer : str
            Analyzer
        fs : int 
            Sampling frequency 
        fftl : int
            FFT length
        shiftms : float
            Shift length [ms]
        minf0 : float
            Floor in F0 estimation
        maxf0 : float
            Seli in F0 estimation
        """
        
        self.analyzer = analyzer
        self.fs = fs
        self.fftl = fftl
        self.shiftms = shiftms
        self.minf0 = minf0
        self.maxf0 = maxf0
    
        if self.analyzer == 'world':
            self.analyzer = WORLD(fs=self.fs, fftl=self.fftl, 
                                  minf0=self.minf0, maxf0=self.maxf0, shiftms=self.shiftms)
        else:
            raise('Analyzer Error : not support type, see FeatureExtractor class.')
        
        self._f0 = None
        self._sp = None
        self._ap = None
        
    def analyze(self, x):
        """
        Analyze acoustic featueres.
        
        Parameters
        ----------
        x : array, shape(`T`)
            monoral speech signal in time domain
        
        Returns
        ----------
        f0 : array, shape(`T`)
            F0 sequence
        sp : array, shape(`T`, `fftl / 2 + 1`)
            Spectral envelope sequence
        ap : array, shape(`T`, `fftl / 2 + 1`)
            aperiodicity sequence
        """
        
        self.x = np.array(x, dtype=np.float)
        self._f0, self._sp, self._ap = self.analyzer.analyze(self.x)
        
        # check f0 < 0
        self._f0[self._f0 < 0] = 0
        
        if np.sum(self._f0) == 0.0:
            print("Warning : F0 values are all zero.")
        
        return self._f0, self._sp, self._ap
    
    def analyze_f0(self, x):
        """
        Analyze f0.
        
        Parameters
        ----------
        x : array, shape(`T`)
            monoral speech signal in time domain
        
        Returns
        ----------
        f0 : array, shape(`T`)
            F0 sequence
        """

        self.x = np.array(x, dtype=np.float)
        self._f0 = self.analyzer.analyze_f0(self.x)

        # check f0 < 0
        self._f0[self._f0 < 0] = 0
        
        if np.sum(self._f0) == 0.0:
            print("Warning : F0 values are all zero.")
        
        return self._f0
    
    def mcep(self, dim=24, alpha=0.42):
        """
        Convert mel-cepstrum sequence from spectral envelope.
        
        Parameters
        ----------
        dim : int
            mel-cepstrum dimension
        alpha : float
            parameter of all-path filter
        
        Returns
        ----------
        mcep : array, shape(`T`, `dim + 1`)
            mel-cepstrum sequence
        """        
        
        self._analyzed_check()
        
        return pysptk.sp2mc(self._sp, dim, alpha)
    
    def codeap(self):
        """
        """
        self._analyzed_check()
        
        return pw.code_aperiodicity(self._ap, self.fs)
    
    def npow(self):
        """
        Normalized power sequence from spectral envelope.
        
        Returns
        ----------
        npow : vector, shape(`T`, `1`)
            Normalized power sequence of the given waveform
        """
        
        self._analyzed_check()
        
        npow = np.apply_along_axis(self._spvec2pow, 1, self._sp)
        
        meanpow = np.mean(npow)
        npow = 10.0 * np.log10(npow / meanpow)
        
        return npow
    
    def _spvec2pow(self, specvec):
        """
        """
        fftl2 = len(specvec) - 1
        fftl = fftl2 * 2
        
        power = specvec[0] + specvec[fftl2]
        for k in range(1, fftl2):
            power += 2.0 * specvec[k]
        power /= fftl
        
        return power
        
    def _analyzed_check(self):
        if self._f0 is None and self._sp is None and self._ap is None:
            raise('Call FeatureExtractor.analyze() before this method.')

In [5]:
class Synthesizer(object):
    """
    Synthesize a waveform from acoustic features.
    
    Ref : https://github.com/k2kobayashi/sprocket/
    """
    def __init__(self, fs=16000, fftl=1024, shiftms=5.0):
        """
        Parameters
        ----------
        fs : int 
            Sampling frequency 
        fftl : int
            FFT length
        shiftms : float
            Shift length [ms]
        """
        
        self.fs = fs
        self.fftl = fftl
        self.shiftms = shiftms
    
    def synthesis(self, f0, mcep, ap, rmcep=None, alpha=0.42):
        """
        Re-synthesizes a speech waveform from acoustic featueres.
        
        Parameters
        ----------
        f0 : array, shape(`T`)
            F0 sequence
        mcep : array, shape(`T`, `dim`)
            mel-cepstrum sequence
        ap : array, shape(`T`, `fftl / 2 + 1`)
            aperiodicity sequence
        rmcep : array, shape(`T`, `dim`)
            array of reference mel-cepstrum sequence
        alpha : float
            parameter of all-path filter
            
        Returns
        ----------
        wav : array,
            syntesized waveform
        """
        
        if rmcep is not None:
            # power modification
            mcep = mod_power(mcep, rmcep, alpha=alpha)
        
        sp = pysptk.mc2sp(mcep, alpha, self.fftl)
        wav = pw.synthesize(f0, sp, ap, self.fs, frame_period=self.shiftms)
        
        return wav
        
    def synthesis_diff(self, x, diffmcep, rmcep=None, alpha=0.42):
        """
        Re-synthesizes a speech waveform from acoustic featueres.
        filtering with a differential mel-cepstrum.
        
        Parameters
        ----------
        x : array, shape(`samples`)
            array of waveform sequence
        diffmcep : array, shape(`T`, `dim`)
            array of differential mel-cepstrum sequence
        rmcep : array, shape(`T`, `dim`)
            array of reference mel-cepstrum sequence
        alpha : float
            parameter of all-path filter
            
        Returns
        ----------
        wav : array,
            syntesized waveform
        """      
        
        x = x.astype(np.float64)
        dim = diffmcep.shape[1] - 1 
        shiftl = int(self.fs / 1000 * self.shiftms)
        
        if rmcep is not None:
            # power modification
            diffmcep = mod_power(rmcep + diffmcep, rmcep, alpha=alpha) - rmcep        
        
        # mc2b = transform mel-cepstrum to MLSA digital filter coefficients.
        b = np.apply_along_axis(pysptk.mc2b, 1, diffmcep, alpha)
        
        mlsa_fil = pysptk.synthesis.Synthesizer(pysptk.synthesis.MLSADF(dim, alpha=alpha),
                                                shiftl)
        wav = mlsa_fil.synthesis(x, b)
        
        return wav
    
    def synthesis_sp(self, f0, sp, ap):
        """
        Re-synthesizes a speech waveform from acoustic featueres.
        
        Parameters
        ----------
        f0 : array, shape(`T`)
            F0 sequence
        spc : array, shape(`T`, `dim`)
            mel-cepstrum sequence
        ap : array, shape(`T`, `fftl / 2 + 1`)
            aperiodicity sequence
            
        Returns
        ----------
        wav : array,
            syntesized waveform
        """      
        
        wav = pw.synthesize(f0, sp, ap, self.fs, frame_period=self.shiftms)
        
        return wav
    
def mod_power(cvmcep, rmcep, alpha=0.42, irlen=256):
    """
    power modification based on inpuulse responce

    Parameters
    ----------
    cvmcep : array, shape(`T`, `dim`)
        array of converted mel-cepstrum
    rmcep : arraym shape(`T`, `dim`)
        array of reference mel-cepstrum
    alpha : float
        parameter of all-path filter
    irlen : int
        Length for IIR filter

    Returns
    ----------
    modified_cvmcep : array, shape(`T`, `dim`)
        array of power modified converted mel-cepstrum
    """

    if rmcep.shape != cvmcep.shape:
        raise ValueError(
            "The shape of the converted and reference mel-cepstrum are different : {} / {}.format(cvmcep.shape, rmcep.shape)"
        )

    # mc2e = Compute energy from mel-cepstrum. e-option
    cv_e = pysptk.mc2e(cvmcep, alpha=alpha, irlen=irlen)
    r_e = pysptk.mc2e(rmcep, alpha=alpha, irlen=irlen)

    dpow = np.log(r_e / cv_e) / 2

    modified_cvmcep = np.copy(cvmcep)
    modified_cvmcep[:, 0] += dpow

    return modified_cvmcep

In [6]:
# def util methods
def melcd(array1, array2):
    """
    calculate mel-cepstrum distortion
    
    Parameters
    ----------
    array1, array2 : array, shape(`T`, `dim`) or shape(`dim`)
        Array of original and target.
    
    Returns
    ----------
    mcd : scala, number > 0
        Scala of mel-cepstrum distoriton
    """
    if array1.shape != array2.shape:
        raise ValueError(
            "The shape of both array are different : {} / {}.format(array1.shape,array2.shape)"
        )    
   
    if array1.ndim == 2:
        diff = array1 - array2
        mcd = 10.0 / np.log(10) * np.mean(np.sqrt(2.0 * np.sum(diff ** 2, axis=1)))
    elif array1.ndim == 1:
        diff = array1 - array2
        mcd = 10.0 / np.log(10) * np.sqrt(2.0 * np.sum(diff ** 2))
    else:
        raise ValueError("Dimension mismatch.")
        
    return mcd

def delta(data, win=[-1.0, 1.0, 0]):
    """
    calculate delta component
    
    Parameters
    ----------
    data : array, shape(`T`, `dim`)
        Array of static matrix sequence.
    win : array, shape(`3`)
        The shape of window matrix.
    
    Returns
    ----------
    delta : array, shape(`T`, `dim`)
        Array of delta matrix sequence.
    """
    
    if data.ndim == 1:
        # change vector into 1d-array
        T = len(data)
        dim = data.ndim
        data = data.reshape(T, dim)
    else:
        T, dim = data.shape
    
    win = np.array(win, dtype=np.float64)
    delta = np.zeros((T, dim))
    
    delta[0] = win[0] * data[0] + win[1] * data[1]
    delta[-1] = win[0] * data[-2] + win[1] * data[-1]
    
    for i in range(len(win)):
        delta[1:T - 1] += win[i] * delta[i:T - 2 + i]
    
    return delta

def static_delta(data, win=[-1.0, 1.0, 0]):
    """
    calculate static and delta component
    
    Parameters
    ----------
    data : array, shape(`T`, `dim`)
        Array of static matrix sequence.
    win : array, shape(`3`)
        The shape of window matrix.
    
    Returns
    ----------
    sddata : array, shape(`T`, `dim * 2`)
        Array of static and delta matrix sequence.
    """
    
    sddata = np.c_[data, delta(data, win)]
    
    assert sddata.shape[1] == data.shape[1] * 2
    
    return sddata

def construct_static_and_delta_matrix(T, D, win=[-1.0, 1.0, 0]):
    """
    calculate static and delta transformation matrix
    
    Parameters
    ----------
    T : scala, `T`
        Scala of time length
    D : scala, `D`
        Scala of the number of dimension.
    win : array, shape(`3`)
        The shape of window matrix.
    
    Returns
    ----------
    W : array, shape(`2 * D * T`, `D * T`)
        Array of static and delta transformation matrix.
    """
    
    static = [0, 1, 0]
    delta = win
    assert len(static) == len(delta)
    
    # generate full W
    DT = D * T
    ones = np.ones(DT)
    row = np.arange(2 * DT).reshape(2 * T, D) # generate serial numbers
    static_row = row[::2] # [1,2,3,4,5] => [1,3,5]
    delta_row = row[1::2] # [1,2,3,4,5] => [2,4]
    col = np.arange(DT)
    
    data = np.array([ones * static[0], ones * static[1],
                      ones * static[2], ones * delta[0],
                      ones * delta[1], ones * delta[2]]).flatten()
    row = np.array([[static_row] * 3, [delta_row] * 3]).flatten()
    col = np.array([[col - D, col, col + D] * 2]).flatten()

    # remove component at first and end frame
    valid_idx = np.logical_not(np.logical_or(col < 0, col >= DT))
    
    W = scipy.sparse.csr_matrix(
        (data[valid_idx], (row[valid_idx], col[valid_idx])), shape=(2 * DT, DT))
    W.eliminate_zeros()
    
    return W
    
def extfrm(data, npow, power_threshold=-20):
    """
    Extract frame over the power threshold
    
    Parameters
    ----------
    data : array, shape(`T`, `dim`)
        array of input data
    npow : array, shape(`T`)
        vector of normalized power sequence
    threshold : scala
        scala of power threshold [dB]
        
    Returns
    ----------
    data : array, shape(`T_ext`, `dim`)
        remaining data after extracting frame
        `T_ext` <= `T`
    """
    T = data.shape[0]
    if T != len(npow):
        raise("Length of two vectors is different.")
        
    valid_index = np.where(npow > power_threshold)
    extdata = data[valid_index]
    assert extdata.shape[0] <= T
    
    return extdata

def estimate_twf(orgdata, tardata, distance='melcd', fast=True, otflag=None):
    """
    time warping function estimator
    
    Parameters
    ----------
    orgdata : array, shape(`T_org`, `dim`)
        array of source feature
    tardata : array, shape(`T_tar`, `dim`)
        array of target feature
    distance : str 
        distance function
    fast : bool
        use fastdtw instead of dtw
    otflag : str
        Alignment into the length of specification
        'org' : alignment into original length
        'tar' : alignment into target length   
    
    Returns
    ----------
    twf : array, shape(`2`, `T`)
        time warping function between original and target
    """
    
    if distance == 'melcd':
        def distance_func(x, y): return melcd(x, y)
    else:
        raise ValueError('this distance method is not support.')
    
    if fast:
        _, path = fastdtw(orgdata, tardata, dist=distance_func)
        twf = np.array(path).T
    else:
        _, _, _, twf = dtw(orgdata, tardata, distance_func) 
        
    if otflag is not None:
        twf = modify_twf(twf, otflag=otflag)
    
    return twf

def align_data(org_data, tar_data, twf):
    """
    get aligned joint feature vector
    
    Parameters
    ----------
    org_data : array, shape(`T_org`, `dim_org`)
        Acoustic feature vector of original speaker
    tar_data : array, shape(`T_tar`, `dim_tar`)
        Acoustic feature vector of target speaker
    twf : array, shape(`2`, `T`)
        time warping function between original and target
        
    Returns
    ----------
    jdata : array, shape(`T_new`, `dim_org + dim_tar`)
        Joint feature vector between source and target
    """
    
    jdata = np.c_[org_data[twf[0]], tar_data[twf[1]]]
    return jdata

def modify_twf(twf, otflag=None):
    """
    align specified length
    
    Parameters
    ----------
    twf : array, shape(`2`, `T`)
        time warping function between original and target
    otflag : str
        Alignment into the length of specification
        'org' : alignment into original length
        'tar' : alignment into target length   
    
    Returns
    ----------
    mod_twf : array, shape(`2`, `T_new`)
        time warping function of modified alignment
    """
    
    if otflag == 'org':
        of, indice = np.unique(twf[0], return_index=True)
        mod_twf = np.c_[of, twf[1][indice]].T
    elif otflag == 'tar':
        tf, indice = np.unique(twf[1], return_index=True)
        mod_twf = np.c_[twf[0][indice], tf].T 
    
    return mod_twf

def low_cut_filter(x, fs, cutoff=70):
    """
    low cut filter
    
    Parameters
    ----------
    x : array, shape('samples')
        waveform sequence
    fs : array, int
        Sampling frequency
    cutoff : float
        cutoff frequency of low cut filter
    
    Returns
    ----------
    lct_x : array, shape('samples')
        Low cut filtered waveform sequence
    """
    
    nyquist = fs // 2
    norm_cutoff = cutoff / nyquist
    
    # low cut filter
    fil = firwin(255, norm_cutoff, pass_zero=False)
    lct_x = lfilter(fil, 1, x)
    
    return lct_x

def extsddata(data, npow, power_threshold=-20):
    """
    get power extract static and delta feature vector
    
    Parameters
    ----------
    data : array, shape(`T`, `dim`)
        acoustic feature vector
    npow : array, shape(`T`)
        normalized power vector
    power_threshold : float
        power threshold
    
    Returns
    ----------
    extsddata : array, shape(`T_new`, `dim * 2`)
        silence remove static and delta feature vector
    """
    
    extsddata = extfrm(static_delta(data), npow, power_threshold=power_threshold)
    return extsddata

def transform_jnt(array_list):
    num_files = len(array_list)
    for i in range(num_files):
        if i == 0:
            jnt = array_list[i]
        else:
            jnt = np.r_[jnt, array_list[i]]
    return jnt
    

In [7]:
class F0statistics(object):
    """
    Estimate F0 statistics and convert F0
    """
    def __init__(self):
        pass
    
    def estimate(self, f0list):
        """
        estimate F0 statistics from list of f0
        
        Parameters
        ----------
        f0list : list, shape(`f0num`)
            List of several F0 sequence
        
        Returns
        ----------
        f0stats : array, shape(`[mean, std]`)
            values of mean and standard deviation for log f0
        """
        
        n_files = len(f0list)
        for i in range(n_files):
            f0 = f0list[i]
            nonzero_indices = np.nonzero(f0)
            if i == 0:
                f0s = np.log(f0[nonzero_indices])
            else:
                f0s = np.r_[f0s, np.log(f0[nonzero_indices])]
        
        f0stats = np.array([np.mean(f0s), np.std(f0s)])
        
        return f0stats

    def convert(self, f0, orgf0stats, tarf0stats):
        """
        convert F0 based on F0 statistics
        
        Parameters
        ----------
        f0 : array, shape(`T`, `1`)
            array of F0 sequence
        orgf0stats : array, shape(`[mean, std]`)
            vectors of mean and standard deviation of log f0 for original speaker
        tarf0stats : array, shape(`[mean, std]`)
            vectors of mean and standard deviation of log f0 for target speaker
        
        Returns
        ----------
        cvf0 : array, shape(`T`, `1`)
            array of converted F0 sequence
        """
        
        # get length and dimension
        T = len(f0)
        
        # perform f0 conversion
        cvf0 = np.zeros(T)
        
        nonzero_indices  = f0 > 0
        cvf0[nonzero_indices] = np.exp((tarf0stats[1] / orgf0stats[1]) 
                                       * (np.log(f0[nonzero_indices])) 
                                       - orgf0stats[0] + tarf0stats[0])
        
        return cvf0

In [8]:
class GV(object):
    """
    Estimate statistics and perform postfilter based on the GV statistics.
    """
    def __init__(self):
        pass
    
    def estimate(self, datalist):
        """
        estimate GV statistics from list of data
        
        Parameters
        ----------
        datalist : list, shape(`num_data`)
            List of several data ([T, dim]) sequence
        
        Returns
        ----------
        gvstats : array, shape(`2`, `dim`)
            array of mean and standard deviation for GV
        """
        
        n_files = len(datalist)
        
        var = []
        for i in range(n_files):
            data = datalist[i]
            var.append(np.var(data, axis=0))
            
        # calculate vm and vv
        vm = np.mean(np.array(var), axis=0)
        vv = np.var(np.array(var), axis=0)
        gvstats = np.r_[vm, vv]
        gvstats = gvstats.reshape(2, len(vm))
        
        return gvstats

    def postfilter(self, data, gvstats, cvgvstats=None, alpha=1.0, startdim=1):
        """
        perform postfilter based on GV statistics into data
        
        Parameters
        ----------
        data : array, shape(`T`, `dim`)
            array of data sequence
        gvstats : array, shape(`2`, `dim`)
            array of mean and variance for target GV
        cvgvstats : array, shape(`2`, `dim`)
            array of mean and variance for converted GV
        alpha : float
            morphing coefficient between GV transformed data and data.
            alpha * gvpf(data) + (1 - alpha) * data
        startdim : int
            start dimension to perform GV postfilter
        
        Returns
        ----------
        filtered_data : array, shape(`T`, `data`)
            array of GV postfiltered data sequnece
        """
        
        # get length and dimension
        T, dim = data.shape
        assert gvstats is not None
        assert dim == gvstats.shape[1]
        
        # calculate statics of input data
        datamean = np.mean(data, axis=0)
        
        if cvgvstats is None:
            # use variance of the given data
            datavar = np.var(data, axis=0)
        else:
            # use variance of trained gv stats
            datavar = cvgvstats[0]
        
        # perform GV postfilter
        filterd = np.sqrt(gvstats[0, startdim:] / datavar[startdim:]) * (data[:, startdim:] - datamean[startdim:]) + datamean[startdim:]
        
        filterd_data = np.c_[data[:, :startdim], filterd]
        
        return alpha * filterd_data + (1 - alpha) * data

In [9]:
# 0. config path
__versions = "pre-stored-en"
__same_path = "./utterance/" + __versions + "/"
pre_stored_source_list = __same_path + 'pre-source/**/V01/T01/**/*.wav'
pre_stored_list = __same_path + "pre/**/V01/T01/**/*.wav"
output_path = __same_path + "output/"

# 1. estimate features
feat = FeatureExtractor()
synthesizer = Synthesizer()

org_f0list = None
org_splist = None
org_mceplist = None
org_aplist = None
org_npowlist = None
org_codeaplist = None

if os.path.exists(output_path + "_org_f0.pickle") \
    and os.path.exists(output_path + "_org_sp.pickle") \
    and os.path.exists(output_path + "_org_ap.pickle") \
    and os.path.exists(output_path + "_org_mcep.pickle") \
    and os.path.exists(output_path + "_org_npow.pickle") \
    and os.path.exists(output_path + "_org_codeap.pickle"):
        
    with open(output_path + "_org_f0.pickle", 'rb') as f:    
        org_f0list = pickle.load(f)
    with open(output_path + "_org_sp.pickle", 'rb') as f:   
        org_splist = pickle.load(f)
    with open(output_path + "_org_ap.pickle", 'rb') as f:  
        org_aplist = pickle.load(f)
    with open(output_path + "_org_mcep.pickle", 'rb') as f:  
        org_mceplist = pickle.load(f)
    with open(output_path + "_org_npow.pickle", 'rb') as f:  
        org_npowlist = pickle.load(f)
    with open(output_path + "_org_codeap.pickle", 'rb') as f:  
        org_codeaplist = pickle.load(f) 
else:
    org_f0list = []
    org_splist = []
    org_mceplist = []
    org_aplist = []
    org_npowlist = []
    org_codeaplist = []
    for files in sorted(glob.iglob(pre_stored_source_list, recursive=True)):
        wavf = files
        x, fs = sf.read(wavf)
        x = np.array(x, dtype=np.float)
        x = low_cut_filter(x, fs, cutoff=70)
        assert fs == 16000

        print("extract acoustic featuers: " + wavf)

        f0, sp, ap = feat.analyze(x)
        mcep = feat.mcep()
        npow = feat.npow()
        codeap = feat.codeap()
        #name, ext = os.path.splitext(wavf)
        #np.save(name + "_or_f0", f0)
        #np.save(name + "_or_sp", sp)
        #np.save(name + "_or_ap", ap)
        #np.save(name + "_or_mcep", mcep)
        #np.save(name + "_or_codeap", codeap)
        org_f0list.append(f0)
        org_splist.append(sp)
        org_mceplist.append(mcep)
        org_aplist.append(ap)
        org_npowlist.append(npow)
        org_codeaplist.append(codeap)

        #wav = synthesizer.synthesis(f0, mcep, ap)
        #wav = np.clip(wav, -32768, 32767)
        #sf.write(name + "_ansys.wav", wav, fs)

    with open(output_path + "_org_f0.pickle", 'wb') as f:    
        pickle.dump(org_f0list, f)
    with open(output_path + "_org_sp.pickle", 'wb') as f:   
        pickle.dump(org_splist, f)
    with open(output_path + "_org_npow.pickle", 'wb') as f:   
        pickle.dump(org_npowlist, f)
    with open(output_path + "_org_ap.pickle", 'wb') as f:  
        pickle.dump(org_aplist, f)
    with open(output_path + "_org_mcep.pickle", 'wb') as f:  
        pickle.dump(org_mceplist, f)
    with open(output_path + "_org_codeap.pickle", 'wb') as f:  
        pickle.dump(org_codeaplist, f) 

mid_f0list = None
mid_mceplist = None
mid_aplist = None
mid_npowlist = None
mid_splist = None
mid_codeaplist = None        

if os.path.exists(output_path + "_mid_f0.pickle") \
    and os.path.exists(output_path + "_mid_sp_0_.pickle") \
    and os.path.exists(output_path + "_mid_ap_0_.pickle") \
    and os.path.exists(output_path + "_mid_mcep.pickle") \
    and os.path.exists(output_path + "_mid_npow.pickle") \
    and os.path.exists(output_path + "_mid_codeap.pickle"):
        
    with open(output_path + "_mid_f0.pickle", 'rb') as f:    
        mid_f0list = pickle.load(f)
    for i in range(0, len(org_splist)*21, len(org_splist)):  
        with open(output_path + "_mid_sp_{}_.pickle".format(i), 'rb') as f:
            temp_splist = pickle.load(f)
            if mid_splist is None:
                mid_splist = temp_splist
            else:
                mid_splist = mid_splist + temp_splist
    for i in range(0, len(org_aplist)*21, len(org_aplist)):  
        with open(output_path + "_mid_ap_{}_.pickle".format(i), 'rb') as f:
            temp_aplist = pickle.load(f)
            if mid_aplist is None:
                mid_aplist = temp_aplist
            else:
                mid_aplist = mid_aplist + temp_aplist    
    with open(output_path + "_mid_mcep.pickle", 'rb') as f:  
        mid_mceplist = pickle.load(f)
    with open(output_path + "_mid_npow.pickle", 'rb') as f:  
        mid_npowlist = pickle.load(f)
    with open(output_path + "_mid_codeap.pickle", 'rb') as f:  
        mid_codeaplist = pickle.load(f) 
else:        
    mid_f0list = []
    mid_mceplist = []
    mid_aplist = []
    mid_npowlist = []
    mid_splist = []
    mid_codeaplist = []

    for files in sorted(glob.iglob(pre_stored_list, recursive=True)):
        wavf = files
        x, fs = sf.read(wavf)
        x = np.array(x, dtype=np.float)
        x = low_cut_filter(x, fs, cutoff=70)
        assert fs == 16000

        print("extract acoustic featuers: " + wavf)

        f0, sp, ap = feat.analyze(x)
        mcep = feat.mcep()
        npow = feat.npow()
        codeap = feat.codeap()
        name, ext = os.path.splitext(wavf)
        #np.save(name + "_or_f0", f0)
        #np.save(name + "_or_sp", sp)
        #np.save(name + "_or_ap", ap)
        #np.save(name + "_or_mcep", mcep)
        #np.save(name + "_or_codeap", codeap)
        mid_f0list.append(f0)
        mid_splist.append(sp)
        mid_mceplist.append(mcep)
        mid_aplist.append(ap)
        mid_npowlist.append(npow)
        mid_codeaplist.append(codeap)

        #wav = synthesizer.synthesis(f0, mcep, ap)
        #wav = np.clip(wav, -32768, 32767)
        #sf.write(name + "_ansys.wav", wav, fs)
        
    with open(output_path + "_mid_f0.pickle", 'wb') as f:
        print(f)
        pickle.dump(mid_f0list, f)
    with open(output_path + "_mid_npow.pickle", 'wb') as f:
        print(f)
        pickle.dump(mid_npowlist, f)
    for i in range(0, len(mid_splist), len(org_splist)):
        with open(output_path + "_mid_sp_{}_.pickle".format(i), 'wb') as f:   
            print(f)
            pickle.dump(mid_splist[i:i+len(org_splist)], f)
    for i in range(0, len(mid_aplist), len(org_aplist)):
        with open(output_path + "_mid_ap_{}_.pickle".format(i), 'wb') as f:
            print(f)
            pickle.dump(mid_aplist[i:i+len(org_aplist)], f)
    with open(output_path + "_mid_mcep.pickle", 'wb') as f:
        print(f)
        pickle.dump(mid_mceplist, f)
    with open(output_path + "_mid_codeap.pickle", 'wb') as f:
        print(f)
        pickle.dump(mid_codeaplist, f) 

In [10]:
class GMMTrainer(object):
    """
    this class offers the training of GMM with several types of covariance matrix.
    
    Parameters
    ----------
    n_mix : int 
        the number of mixture components of the GMM
    n_iter : int
        the number of iteration for EM algorithm
    covtype : str
        the type of covariance matrix of the GMM
        'full': full-covariance matrix
    
    Attributes
    ---------
    param : 
        sklearn-based model parameters of the GMM
    """
    
    def __init__(self, n_mix=64, n_iter=100, covtype='full'):
        self.n_mix = n_mix
        self.n_iter = n_iter
        self.covtype = covtype
        
        self.param = sklearn.mixture.GaussianMixture(n_components=self.n_mix,
                                                     covariance_type=self.covtype,
                                                     max_iter=self.n_iter)
        
    def train(self, jnt):
        """
        fit GMM parameter from given joint feature vector
        
        Parametes
        ---------
        jnt : array, shape(`T`, `jnt.shape[0]`)
            joint feature vector of original and target feature vector consisting of static and delta components
        """
        
        if self.covtype == 'full':
            self.param.fit(jnt)
            
        return
    
class GMMConvertor(object):
    """
    this class offers the several conversion techniques such as Maximum Likelihood Parameter Generation (MLPG)
    and Minimum Mean Square Error (MMSE).
    
    Parametes
    ---------
    n_mix : int
        the number of mixture components of the GMM
    covtype : str
        the type of covariance matrix of the GMM
        'full': full-covariance matrix
    gmmmode : str
        the type of the GMM for opening
        `None` : Normal Joint Density - GMM (JD-GMM)
    
    Attributes
    ---------
    param : 
        sklearn-based model parameters of the GMM
    w : shape(`n_mix`)
        vector of mixture component weight of the GMM
    jmean : shape(`n_mix`, `jnt.shape[0]`)
        Array of joint mean vector of the GMM
    jcov : shape(`n_mix`, `jnt.shape[0]`, `jnt.shape[0]`)
        array of joint covariance matrix of the GMM
    """
    
    def __init__(self, n_mix=64, covtype='full', gmmmode=None):
        self.n_mix = n_mix
        self.covtype = covtype
        self.gmmmode = gmmmode
        
    def open_from_param(self, param):
        """
        open GMM from GMMTrainer
        
        Parameters
        ----------
        param : GMMTrainer
            GMMTrainer class
        """
        
        self.param = param
        self._deploy_parameters()
        
        return
    
    def convert(self, data, cvtype='mlpg'):
        """
        convert data based on conditional probability density function
        
        Parametes
        ---------
        data : array, shape(`T`, `dim`)
            original data will be converted
        cvtype : str
            type of conversion technique
            `mlpg` : maximum likelihood parameter generation
            
        Returns
        ----------
        odata : array, shape(`T`, `dim`)
            converted data
        """
        
        # estimate parameter sequence
        cseq, wseq, mseq, covseq = self._gmmmap(data)
        
        if cvtype == 'mlpg':
            odata = self._mlpg(mseq, covseq)
        else:
            raise ValueError('please choose conversion mode in `mlpg`.')
        
        return odata
    
    def _gmmmap(self, sddata):
        # paramete for sequencial data
        T, sddim = sddata.shape
        
        # estimate posterior sequence
        wseq = self.pX.predict_proba(sddata)
        
        # estimate mixture sequence
        cseq = np.argmax(wseq, axis=1)
        
        mseq = np.zeros((T, sddim))
        covseq = np.zeros((T, sddim, sddim))
        for t in range(T):
            # read maximum likelihood mixture component in frame t
            m = cseq[t]
            
            # conditional mean vector sequence
            mseq[t] = self.meanY[m] + self.A[m] @ (sddata[t] - self.meanX[m])
            
            # conditional covariance sequence
            covseq[t] = self.cond_cov_inv[m]
        
        return cseq, wseq, mseq, covseq
    
    def _mlpg(self, mseq, covseq):
        # parameter for sequencial data
        T, sddim = mseq.shape
        
        # prepare W
        W = construct_static_and_delta_matrix(T, sddim // 2)
        
        # prepare D
        D = get_diagonal_precision_matrix(T, sddim, covseq)
        
        # calculate W'D
        WD = W.T @ D
        
        # W'DW
        WDW = WD @ W
        
        # W'Dm
        WDM = WD @ mseq.flatten()
        
        # estimate y = (W'DW)^-1 * W'Dm
        odata = scipy.sparse.linalg.spsolve(WDW, WDM, use_umfpack=False).reshape(T, sddim // 2)
        
        return odata
    
    def _deploy_parameters(self):
        # read JD-GMM parameters from self.param
        self.W = self.param.weights_
        self.jmean = self.param.means_
        self.jcov = self.param.covariances_
        
        # devide GMM parameters into source and target parameters
        sddim = self.jmean.shape[1] // 2
        self.meanX = self.jmean[:, 0:sddim]
        self.meanY = self.jmean[:, sddim:]
        self.covXX = self.jcov[:, :sddim, :sddim]
        self.covXY = self.jcov[:, :sddim, sddim:]
        self.covYX = self.jcov[:, sddim:, :sddim]
        self.covYY = self.jcov[:, sddim:, sddim:]
        
        # change model parameter of GMM into that of gmmmode
        if self.gmmmode is None:
            pass
        else:
            raise ValueError('please choose GMM mode in [None]')
            
        # estimate parameters for conversion
        self._set_Ab()
        self._set_pX()
        
        return
    
    def _set_Ab(self):
        # calculate A and b from self.jmean, self.jcov
        sddim = self.jmean.shape[1] // 2
        
        # calculate inverse covariance for covariance XX in each mixture
        self.covXXinv = np.zeros((self.n_mix, sddim, sddim))
        for m in range(self.n_mix):
            self.covXXinv[m] = np.linalg.inv(self.covXX[m])
            
        # calculate A, b, and conditional covariance given X
        self.A = np.zeros((self.n_mix, sddim, sddim))
        self.b = np.zeros((self.n_mix, sddim))
        self.cond_cov_inv = np.zeros((self.n_mix, sddim, sddim))
        for m in range(self.n_mix):
            # calculate A (A = yxcov_m * xxcov_m^-1)
            self.A[m] = self.covYX[m] @ self.covXXinv[m]
            
            # calculate b (b = mean^Y - A * mean^X)
            self.b[m] = self.meanY[m] - self.A[m] @ self.meanX[m]
            
            # calculate conditional covariance (cov^(Y|X)^-1 = (yycov - A * xycov)^-1)
            self.cond_cov_inv[m] = np.linalg.inv(self.covYY[m] - self.A[m] @ self.covXY[m])
            
        return
    
    def _set_pX(self):
        # probability density function of X 
        self.pX = sklearn.mixture.GaussianMixture(n_components=self.n_mix, covariance_type=self.covtype)
        self.pX.weights_ = self.W
        self.pX.means_ = self.meanX
        self.pX.covariances_ = self.covXX
        
        # following function is required to estimate porsterior
        # p(x | \lambda^(X))
        self.pX.precisions_cholesky_ = _compute_precision_cholesky(self.covXX, self.covtype)
        
        return
    
def get_diagonal_precision_matrix(T, D, covseq):
    return scipy.sparse.block_diag(covseq, format='csr')        

In [11]:
def get_alignment(odata, onpow, tdata, tnpow, opow=-20, tpow=-20, sd=0, cvdata=None, given_twf=None, otflag=None, distance='melcd'):
    """
    get alignment between original and target.
    
    Parameters
    ----------
    odata : array, shape(`T`, `dim`)
        acoustic feature vector of original
    onpow : array, shape(`T`)
        Normalized power vector of original
    tdata : array, shape(`T`, `dim`)
        acoustic feature vector of target
    tnpow : array, shape(`T`)
        Normalized power vector of target
    opow : float
        power threshold of original
    tpow : float
        power threshold of target
    sd : int
        start dimension to be used for alignment
    cvdata : array, shape(`T`, `dim`)
        converted original data
    given_twf : array, shape(`T_new`, `dim * 2`)
        Alignment given twf
    otflag : str
        Alignment into the length of specification
        'org' : alignment into original length
        'tar' : alignment into target length
    distance : str
        Distance function to be used
    
    Returns
    ----------
    jdata : array, shape(`T_new`, `dim * 2`)
        joint static and delta feature vector
    twf : array, shape(`T_new`, `dim * 2`)
        Time warping function
    mcd : float
        Mel-cepstrum distortion between arrays
    """
    
    oexdata = extsddata(odata[:, sd:], onpow, power_threshold=opow)
    texdata = extsddata(tdata[:, sd:], tnpow, power_threshold=tpow)
    
    if cvdata is None:
        align_odata = oexdata
    else:
        cvexdata = extsddata(cvdata, onpow, power_threshold=opow)
        align_odata = cvexdata
    
    if given_twf is None: 
        twf = estimate_twf(align_odata, texdata, distance=distance, otflag=otflag)
    else:
        twf = given_twf
    
    jdata = align_data(oexdata, texdata, twf)
    mcd = melcd(align_odata[twf[0]], texdata[twf[1]])
    
    return jdata, twf, mcd

def align_feature_vectors(odata, onpows, tdata, tnpows, opow=-100, tpow=-100, itnum=3, sd=0, given_twfs=None, otflag=None):
    """
    get alignment to create joint feature vector
    
    Parameters
    ----------
    odata : list, (`num_files`)
        List of original feature vectors
    onpow : list, (`num_files`)
        List of original npows
    tdata : list, (`num_files`)
        List of target feature vectors
    tnpow : list, (`num_files`)
        List of target npows
    opow : float
        power threshold of original
    tpow : float
        power threshold of target
    itnum : int
        the number of iteration
    sd : int
        start dimension of feature vector to be used for alignment
    given_twf : array, shape(`T_new`, `dim * 2`)
        use given alignment while 1st iteration
    otflag : str
        Alignment into the length of specification
        'org' : alignment into original length
        'tar' : alignment into target length
    distance : str
        Distance function to be used
    
    Returns
    ----------
    jdata : array, shape(`T_new`, `dim * 2`)
        joint static and delta feature vector
    twf : array, shape(`T_new`, `dim * 2`)
        Time warping function
    mcd : float
        Mel-cepstrum distortion between arrays
    """
    
    it = 1
    num_files = len(odata)
    cvgmm, cvdata = None, None
    for it in range(1, itnum+1):
        print('{}-th joint feature extraction starts.'.format(it))
        
        # alignment
        twfs, jfvs = [], []
        for i in range(num_files):
            if it == 1 and given_twfs is not None:
                gtwf = given_twfs[i]
            else:
                gtwf = None
            
            if it > 1:
                cvdata = cvgmm.convert(static_delta(odata[i][:, sd:]))
            
            jdata, twf, mcd = get_alignment(odata[i], onpows[i], tdata[i], tnpows[i], opow=opow, tpow=tpow,
                                            sd=sd, cvdata=cvdata, given_twf=gtwf, otflag=otflag)
            twfs.append(twf)
            jfvs.append(jdata)
            print('distortion [dB] for {}-th file: {}'.format(i+1, mcd))
            
        jnt_data = transform_jnt(jfvs)
        
        if it != itnum:
            # train GMM, if not final iteration
            datagmm = GMMTrainer()
            datagmm.train(jnt_data)
            cvgmm = GMMConvertor()
            cvgmm.open_from_param(datagmm.param)
        it += 1
    return jfvs, twfs    

In [12]:
# 2. estimate twf and jnt
if os.path.exists(output_path + "_jnt_mcep_0_.pickle") \
    and os.path.exists(output_path + "_jnt_codeap_0_.pickle"):
    pass
else:
    for i in range(0, len(mid_mceplist), len(org_mceplist)):
        org_mceps = org_mceplist
        org_npows = org_npowlist
        mid_mceps = mid_mceplist[i:i+len(org_mceps)]
        mid_npows = mid_npowlist[i:i+len(org_npows)]
        assert len(org_mceps) == len(mid_mceps)
        assert len(org_npows) == len(mid_npows)
        assert len(org_mceps) == len(org_npows)

        # dtw between original and target 0-th and silence
        print("## alignment mcep 0-th and silence ##")
        jmceps, twfs = align_feature_vectors(org_mceps, org_npows, mid_mceps, mid_npows, opow=-100, tpow=-100, sd=1)
        jnt_mcep = transform_jnt(jmceps)

        # save joint feature vectors
        with open(output_path + "_jnt_mcep_{}_.pickle".format(i), 'wb') as f:   
            print(f)
            pickle.dump(jnt_mcep, f)

## alignment mcep 0-th and silence ##
1-th joint feature extraction starts.
distortion [dB] for 1-th file: 8.542130892700015
distortion [dB] for 2-th file: 8.312001538246323
distortion [dB] for 3-th file: 8.531255963903506
distortion [dB] for 4-th file: 7.66710609894504
distortion [dB] for 5-th file: 8.003577954040203
distortion [dB] for 6-th file: 8.360249587096318
distortion [dB] for 7-th file: 7.60659806453102
distortion [dB] for 8-th file: 8.104361261880559
distortion [dB] for 9-th file: 7.8794670229531825
distortion [dB] for 10-th file: 8.450906220998318
distortion [dB] for 11-th file: 8.014540631938347
distortion [dB] for 12-th file: 7.256262290483638
distortion [dB] for 13-th file: 7.907423897092188
distortion [dB] for 14-th file: 8.360717839325577
distortion [dB] for 15-th file: 8.631758621756113
distortion [dB] for 16-th file: 8.076467604942382
distortion [dB] for 17-th file: 8.326154903975713
distortion [dB] for 18-th file: 8.011493248519804
distortion [dB] for 19-th file: 8.

distortion [dB] for 22-th file: 8.937196286065817
distortion [dB] for 23-th file: 9.480159461010082
distortion [dB] for 24-th file: 9.399641467640851
distortion [dB] for 25-th file: 9.623047129922105
distortion [dB] for 26-th file: 10.639840396151518
distortion [dB] for 27-th file: 9.50112735323768
distortion [dB] for 28-th file: 9.480784740244557
distortion [dB] for 29-th file: 9.342639209453889
distortion [dB] for 30-th file: 9.901047377296438
distortion [dB] for 31-th file: 9.841711899086249
distortion [dB] for 32-th file: 9.266269559874184
distortion [dB] for 33-th file: 8.569171130682884
distortion [dB] for 34-th file: 9.410880691622355
distortion [dB] for 35-th file: 8.246442800236856
distortion [dB] for 36-th file: 10.110835155845006
distortion [dB] for 37-th file: 8.196970123244986
distortion [dB] for 38-th file: 9.918089454413321
distortion [dB] for 39-th file: 9.946772838428378
distortion [dB] for 40-th file: 9.69922166877272
distortion [dB] for 41-th file: 9.244252562204974


distortion [dB] for 45-th file: 8.94120216475458
distortion [dB] for 46-th file: 8.375415857204073
2-th joint feature extraction starts.
distortion [dB] for 1-th file: 9.171793637545333
distortion [dB] for 2-th file: 9.020068452743566
distortion [dB] for 3-th file: 8.937645183131686
distortion [dB] for 4-th file: 8.21034629244563
distortion [dB] for 5-th file: 10.143204789964852
distortion [dB] for 6-th file: 9.766875401804928
distortion [dB] for 7-th file: 9.507792025795728
distortion [dB] for 8-th file: 9.064616953585373
distortion [dB] for 9-th file: 9.220044473533743
distortion [dB] for 10-th file: 9.859398514330453
distortion [dB] for 11-th file: 8.954132934611694
distortion [dB] for 12-th file: 8.87852335412455
distortion [dB] for 13-th file: 9.14685286634935
distortion [dB] for 14-th file: 9.082447659160508
distortion [dB] for 15-th file: 9.331823686034182
distortion [dB] for 16-th file: 8.43719481397493
distortion [dB] for 17-th file: 8.459715605597523
distortion [dB] for 18-th

distortion [dB] for 21-th file: 8.857149500872023
distortion [dB] for 22-th file: 8.71958160807596
distortion [dB] for 23-th file: 8.869291481377248
distortion [dB] for 24-th file: 8.486982142087559
distortion [dB] for 25-th file: 8.566391518396847
distortion [dB] for 26-th file: 9.438702093345256
distortion [dB] for 27-th file: 9.814699894007937
distortion [dB] for 28-th file: 9.640613667871907
distortion [dB] for 29-th file: 9.214769704892113
distortion [dB] for 30-th file: 9.631421097692849
distortion [dB] for 31-th file: 8.765942056537032
distortion [dB] for 32-th file: 8.72989948557993
distortion [dB] for 33-th file: 9.382427963171837
distortion [dB] for 34-th file: 8.03394870359265
distortion [dB] for 35-th file: 8.27830368624696
distortion [dB] for 36-th file: 8.646282082342285
distortion [dB] for 37-th file: 7.475974081939233
distortion [dB] for 38-th file: 8.672040459737346
distortion [dB] for 39-th file: 8.896090596435961
distortion [dB] for 40-th file: 9.184622525656371
dist

distortion [dB] for 43-th file: 7.516497389450508
distortion [dB] for 44-th file: 9.470256946456802
distortion [dB] for 45-th file: 9.513240570866822
distortion [dB] for 46-th file: 8.372482057788975
3-th joint feature extraction starts.
distortion [dB] for 1-th file: 11.439226581925132
distortion [dB] for 2-th file: 9.710428287141024
distortion [dB] for 3-th file: 10.858370816104426
distortion [dB] for 4-th file: 8.653082020919257
distortion [dB] for 5-th file: 10.770560532069297
distortion [dB] for 6-th file: 10.631930087656874
distortion [dB] for 7-th file: 9.808289102788729
distortion [dB] for 8-th file: 10.503493359285464
distortion [dB] for 9-th file: 11.28534278562277
distortion [dB] for 10-th file: 10.58193106976737
distortion [dB] for 11-th file: 10.577213308991661
distortion [dB] for 12-th file: 9.044639889926849
distortion [dB] for 13-th file: 9.6508044967764
distortion [dB] for 14-th file: 10.293466050850038
distortion [dB] for 15-th file: 9.083772581332955
distortion [dB] 

distortion [dB] for 18-th file: 10.388473677298842
distortion [dB] for 19-th file: 10.726384646263687
distortion [dB] for 20-th file: 9.476383803239843
distortion [dB] for 21-th file: 10.368729859226685
distortion [dB] for 22-th file: 10.032766114343382
distortion [dB] for 23-th file: 9.718022338800516
distortion [dB] for 24-th file: 9.025763811679251
distortion [dB] for 25-th file: 9.616163367006491
distortion [dB] for 26-th file: 10.336010907745667
distortion [dB] for 27-th file: 10.9923045779366
distortion [dB] for 28-th file: 9.888935434067703
distortion [dB] for 29-th file: 9.997489272432903
distortion [dB] for 30-th file: 10.267699026594805
distortion [dB] for 31-th file: 10.295776044095758
distortion [dB] for 32-th file: 10.014867013747153
distortion [dB] for 33-th file: 10.993104477550824
distortion [dB] for 34-th file: 8.916688205922007
distortion [dB] for 35-th file: 9.917179963698649
distortion [dB] for 36-th file: 9.674579387687691
distortion [dB] for 37-th file: 9.34749869

distortion [dB] for 40-th file: 8.710655547185207
distortion [dB] for 41-th file: 8.090860395632255
distortion [dB] for 42-th file: 8.865755524730208
distortion [dB] for 43-th file: 7.646807746534732
distortion [dB] for 44-th file: 9.733552655782237
distortion [dB] for 45-th file: 8.905623544744502
distortion [dB] for 46-th file: 9.15865840599329
<_io.BufferedWriter name='./utterance/pre-stored-en/output/_jnt_mcep_276_.pickle'>
## alignment mcep 0-th and silence ##
1-th joint feature extraction starts.
distortion [dB] for 1-th file: 8.901990269573314
distortion [dB] for 2-th file: 8.391936284092752
distortion [dB] for 3-th file: 8.993533598288801
distortion [dB] for 4-th file: 8.04937075203933
distortion [dB] for 5-th file: 8.996125206584235
distortion [dB] for 6-th file: 8.410937678119758
distortion [dB] for 7-th file: 8.301827467510856
distortion [dB] for 8-th file: 8.570627161348472
distortion [dB] for 9-th file: 8.333843185663904
distortion [dB] for 10-th file: 8.403128405321903
di

distortion [dB] for 13-th file: 8.247595912796113
distortion [dB] for 14-th file: 8.785611651217456
distortion [dB] for 15-th file: 8.61363450491579
distortion [dB] for 16-th file: 8.331055037506168
distortion [dB] for 17-th file: 8.23131141964661
distortion [dB] for 18-th file: 8.569064076203697
distortion [dB] for 19-th file: 8.17327708962225
distortion [dB] for 20-th file: 8.342451320553863
distortion [dB] for 21-th file: 8.012786333298683
distortion [dB] for 22-th file: 8.234979362423866
distortion [dB] for 23-th file: 9.097289320066322
distortion [dB] for 24-th file: 8.450607193966487
distortion [dB] for 25-th file: 8.548959070683575
distortion [dB] for 26-th file: 7.902518203360369
distortion [dB] for 27-th file: 8.45181503800092
distortion [dB] for 28-th file: 8.824986605905888
distortion [dB] for 29-th file: 8.613839439710103
distortion [dB] for 30-th file: 7.93662435165991
distortion [dB] for 31-th file: 8.387407005074595
distortion [dB] for 32-th file: 8.262781336719462
disto

distortion [dB] for 36-th file: 8.427760825050065
distortion [dB] for 37-th file: 7.6293962161569695
distortion [dB] for 38-th file: 8.359971318102087
distortion [dB] for 39-th file: 8.771254375233434
distortion [dB] for 40-th file: 8.916661706464186
distortion [dB] for 41-th file: 8.31034150811999
distortion [dB] for 42-th file: 8.33297529212321
distortion [dB] for 43-th file: 8.257217110241372
distortion [dB] for 44-th file: 7.850995140936306
distortion [dB] for 45-th file: 8.663808693139096
distortion [dB] for 46-th file: 8.394863519289059
2-th joint feature extraction starts.
distortion [dB] for 1-th file: 10.520764740420482
distortion [dB] for 2-th file: 9.728807901066853
distortion [dB] for 3-th file: 10.03189846960544
distortion [dB] for 4-th file: 8.091155803658063
distortion [dB] for 5-th file: 9.88334813579592
distortion [dB] for 6-th file: 10.040676073432588
distortion [dB] for 7-th file: 9.005281571342767
distortion [dB] for 8-th file: 9.983626002961241
distortion [dB] for 

distortion [dB] for 12-th file: 7.5352394478385065
distortion [dB] for 13-th file: 8.215282714497667
distortion [dB] for 14-th file: 8.554219523576911
distortion [dB] for 15-th file: 8.338912545973939
distortion [dB] for 16-th file: 9.068489893657405
distortion [dB] for 17-th file: 8.440272281457144
distortion [dB] for 18-th file: 7.80076117880247
distortion [dB] for 19-th file: 8.563024062177046
distortion [dB] for 20-th file: 8.049905480552296
distortion [dB] for 21-th file: 8.584249024451292
distortion [dB] for 22-th file: 8.015427740444165
distortion [dB] for 23-th file: 8.709119012453144
distortion [dB] for 24-th file: 8.819588063950977
distortion [dB] for 25-th file: 8.727597418460922
distortion [dB] for 26-th file: 9.21063565782477
distortion [dB] for 27-th file: 8.894838468794502
distortion [dB] for 28-th file: 8.502921457712453
distortion [dB] for 29-th file: 7.917773329828708
distortion [dB] for 30-th file: 8.03194446338335
distortion [dB] for 31-th file: 8.544669833878254
di

distortion [dB] for 35-th file: 8.1010425873004
distortion [dB] for 36-th file: 8.42325090832463
distortion [dB] for 37-th file: 8.803683437720984
distortion [dB] for 38-th file: 10.26748219093197
distortion [dB] for 39-th file: 8.800129962906666
distortion [dB] for 40-th file: 9.119275975255517
distortion [dB] for 41-th file: 9.546155364482923
distortion [dB] for 42-th file: 9.739952207897618
distortion [dB] for 43-th file: 8.463475881578889
distortion [dB] for 44-th file: 8.975072996983698
distortion [dB] for 45-th file: 8.285566909541194
distortion [dB] for 46-th file: 8.64275026373952
3-th joint feature extraction starts.
distortion [dB] for 1-th file: 9.856403548224892
distortion [dB] for 2-th file: 9.206456402146326
distortion [dB] for 3-th file: 9.316220113016758
distortion [dB] for 4-th file: 7.917430093141187
distortion [dB] for 5-th file: 9.290717238477951
distortion [dB] for 6-th file: 9.327849731166706
distortion [dB] for 7-th file: 9.755311868668441
distortion [dB] for 8-t

distortion [dB] for 11-th file: 9.037141303236492
distortion [dB] for 12-th file: 8.129208146490562
distortion [dB] for 13-th file: 9.329104983849518
distortion [dB] for 14-th file: 9.64548581469559
distortion [dB] for 15-th file: 9.324823827743957
distortion [dB] for 16-th file: 9.489359670105841
distortion [dB] for 17-th file: 9.072753095463346
distortion [dB] for 18-th file: 8.382657287600818
distortion [dB] for 19-th file: 10.568674497634344
distortion [dB] for 20-th file: 9.975157836375896
distortion [dB] for 21-th file: 10.177158313673226
distortion [dB] for 22-th file: 9.907604383217503
distortion [dB] for 23-th file: 8.900699258998483
distortion [dB] for 24-th file: 10.283471668064111
distortion [dB] for 25-th file: 8.828276017773113
distortion [dB] for 26-th file: 10.487367440967434
distortion [dB] for 27-th file: 9.633416878352735
distortion [dB] for 28-th file: 10.031444811216316
distortion [dB] for 29-th file: 9.61696401272754
distortion [dB] for 30-th file: 9.6101328101665

distortion [dB] for 33-th file: 10.506460247629052
distortion [dB] for 34-th file: 8.728080771612802
distortion [dB] for 35-th file: 9.445106667433572
distortion [dB] for 36-th file: 8.658077245356683
distortion [dB] for 37-th file: 9.587882403252717
distortion [dB] for 38-th file: 10.138832892851296
distortion [dB] for 39-th file: 9.275616491634363
distortion [dB] for 40-th file: 10.123422100486064
distortion [dB] for 41-th file: 9.765740773088352
distortion [dB] for 42-th file: 10.397514257609554
distortion [dB] for 43-th file: 8.549395900615293
distortion [dB] for 44-th file: 9.337055054062189
distortion [dB] for 45-th file: 9.536262387899717
distortion [dB] for 46-th file: 8.62576661320915
<_io.BufferedWriter name='./utterance/pre-stored-en/output/_jnt_mcep_598_.pickle'>
## alignment mcep 0-th and silence ##
1-th joint feature extraction starts.
distortion [dB] for 1-th file: 7.917564290961795
distortion [dB] for 2-th file: 7.566710425507948
distortion [dB] for 3-th file: 7.9970795

distortion [dB] for 6-th file: 8.012164036867082
distortion [dB] for 7-th file: 7.190491742791137
distortion [dB] for 8-th file: 8.405499009595774
distortion [dB] for 9-th file: 7.70499161243113
distortion [dB] for 10-th file: 7.851060197354645
distortion [dB] for 11-th file: 7.852300175945635
distortion [dB] for 12-th file: 7.158189200021275
distortion [dB] for 13-th file: 8.016508110253115
distortion [dB] for 14-th file: 7.640437789465897
distortion [dB] for 15-th file: 9.15337835664778
distortion [dB] for 16-th file: 8.236614291895194
distortion [dB] for 17-th file: 7.6651448528688
distortion [dB] for 18-th file: 8.056068109080618
distortion [dB] for 19-th file: 7.6103868867833935
distortion [dB] for 20-th file: 8.338573094976592
distortion [dB] for 21-th file: 7.656780046362011
distortion [dB] for 22-th file: 7.685927211719029
distortion [dB] for 23-th file: 7.719950793712398
distortion [dB] for 24-th file: 8.2369223651952
distortion [dB] for 25-th file: 7.988894551445222
distortio

distortion [dB] for 28-th file: 8.213357676021069
distortion [dB] for 29-th file: 7.610083437750692
distortion [dB] for 30-th file: 7.972126596365385
distortion [dB] for 31-th file: 7.9532802326919905
distortion [dB] for 32-th file: 8.104745365883135
distortion [dB] for 33-th file: 7.696944618479891
distortion [dB] for 34-th file: 8.860505323650072
distortion [dB] for 35-th file: 7.4419145647939065
distortion [dB] for 36-th file: 8.769991214615807
distortion [dB] for 37-th file: 6.997925347351025
distortion [dB] for 38-th file: 9.292059148792335
distortion [dB] for 39-th file: 7.724928350892212
distortion [dB] for 40-th file: 8.571502511306884
distortion [dB] for 41-th file: 8.307765092976545
distortion [dB] for 42-th file: 8.835629678515707
distortion [dB] for 43-th file: 7.930758897212663
distortion [dB] for 44-th file: 7.739528089236484
distortion [dB] for 45-th file: 7.694637663321471
distortion [dB] for 46-th file: 8.612441402891752
2-th joint feature extraction starts.
distortion

distortion [dB] for 3-th file: 9.207710121503036
distortion [dB] for 4-th file: 9.65585736137212
distortion [dB] for 5-th file: 9.770630030470933
distortion [dB] for 6-th file: 9.825331770618725
distortion [dB] for 7-th file: 9.24563171330555
distortion [dB] for 8-th file: 9.455922837020086
distortion [dB] for 9-th file: 9.715071464424131
distortion [dB] for 10-th file: 9.574789384617278
distortion [dB] for 11-th file: 9.34365068967517
distortion [dB] for 12-th file: 9.191758914381497
distortion [dB] for 13-th file: 8.907407218471471
distortion [dB] for 14-th file: 9.112867675817716
distortion [dB] for 15-th file: 9.47640178853494
distortion [dB] for 16-th file: 9.441115526635599
distortion [dB] for 17-th file: 9.431694613343211
distortion [dB] for 18-th file: 8.81462265056672
distortion [dB] for 19-th file: 10.64840377500499
distortion [dB] for 20-th file: 8.686511982706875
distortion [dB] for 21-th file: 9.449747474741242
distortion [dB] for 22-th file: 9.267611288842781
distortion [

distortion [dB] for 25-th file: 8.542789447873393
distortion [dB] for 26-th file: 8.380546447820729
distortion [dB] for 27-th file: 9.133206772060674
distortion [dB] for 28-th file: 8.601664207088142
distortion [dB] for 29-th file: 8.48766557986965
distortion [dB] for 30-th file: 8.991218904329967
distortion [dB] for 31-th file: 8.665429390820838
distortion [dB] for 32-th file: 8.273718101812355
distortion [dB] for 33-th file: 9.255003456482447
distortion [dB] for 34-th file: 7.994514924510437
distortion [dB] for 35-th file: 9.984643268660363
distortion [dB] for 36-th file: 9.081300246692363
distortion [dB] for 37-th file: 8.198038693355471
distortion [dB] for 38-th file: 8.838153747368379
distortion [dB] for 39-th file: 8.813732338047084
distortion [dB] for 40-th file: 9.474804748979526
distortion [dB] for 41-th file: 8.721736396888623
distortion [dB] for 42-th file: 8.988357464192948
distortion [dB] for 43-th file: 7.359372482092136
distortion [dB] for 44-th file: 9.274930789291242
d

3-th joint feature extraction starts.
distortion [dB] for 1-th file: 9.829017479258674
distortion [dB] for 2-th file: 9.251184061173516
distortion [dB] for 3-th file: 9.794742257167396
distortion [dB] for 4-th file: 7.250349697825209
distortion [dB] for 5-th file: 9.44511125166918
distortion [dB] for 6-th file: 10.306585538003317
distortion [dB] for 7-th file: 8.721691550954361
distortion [dB] for 8-th file: 9.430375962747343
distortion [dB] for 9-th file: 8.445043541219652
distortion [dB] for 10-th file: 10.075391460642871
distortion [dB] for 11-th file: 8.125214112966663
distortion [dB] for 12-th file: 9.013346125320513
distortion [dB] for 13-th file: 9.5416078777823
distortion [dB] for 14-th file: 10.951210588494263
distortion [dB] for 15-th file: 8.635961701896367
distortion [dB] for 16-th file: 10.262656261953097
distortion [dB] for 17-th file: 9.626817532564754
distortion [dB] for 18-th file: 9.566361240341624
distortion [dB] for 19-th file: 8.774730382091521
distortion [dB] for 

distortion [dB] for 23-th file: 9.40754256428688
distortion [dB] for 24-th file: 9.367299686598333
distortion [dB] for 25-th file: 9.512995746697609
distortion [dB] for 26-th file: 9.203758536513451
distortion [dB] for 27-th file: 9.78499742082111
distortion [dB] for 28-th file: 8.813898344859755
distortion [dB] for 29-th file: 9.197719026446132
distortion [dB] for 30-th file: 8.967263619685346
distortion [dB] for 31-th file: 9.292396539322962
distortion [dB] for 32-th file: 10.578170868638452
distortion [dB] for 33-th file: 9.626738428113383
distortion [dB] for 34-th file: 8.656210743405376
distortion [dB] for 35-th file: 10.518458724626452
distortion [dB] for 36-th file: 9.231602272445958
distortion [dB] for 37-th file: 8.110156887326369
distortion [dB] for 38-th file: 8.9761174343246
distortion [dB] for 39-th file: 8.98159147272117
distortion [dB] for 40-th file: 10.341983694419884
distortion [dB] for 41-th file: 8.911525581451546
distortion [dB] for 42-th file: 10.215681691214431
d

distortion [dB] for 45-th file: 8.941956945819264
distortion [dB] for 46-th file: 8.195628713812349
<_io.BufferedWriter name='./utterance/pre-stored-en/output/_jnt_mcep_966_.pickle'>


In [None]:
# 3. make EV-GMM
initgmm, initgmm_codeap = None, None
if os.path.exists(output_path + "initgmm.pickle"):
    with open(output_path + "initgmm.pickle", 'rb') as f:  
        initgmm = pickle.load(f)
else:
    jnt, jnt_codeap = None, []
    for i in range(0, len(mid_mceplist), len(org_mceplist)):
        with open(output_path + "_jnt_mcep_{}_.pickle".format(i), 'rb') as f:  
            temp_jnt = pickle.load(f)
            if jnt is None:
                jnt = temp_jnt
            else:
                jnt = np.r_[jnt, temp_jnt]
        with open(output_path + "_jnt_codeap_{}_.pickle".format(i), 'rb') as f:  
            temp_jnt_codeap = pickle.load(f)
            jnt_codeap.append(temp_jnt_codeap)

    jnt_codeap = transform_jnt(jnt_codeap)
    assert jnt.shape[0] == jnt_codeap.shape[0]

    # train initial gmm
    initgmm = GMMTrainer()
    initgmm.train(jnt)

    initgmm_codeap = GMMTrainer()
    initgmm_codeap.train(jnt_codeap)
    with open(output_path + "initgmm.pickle", 'wb') as f:   
        print(f)
        pickle.dump(initgmm, f)
    with open(output_path + "initgmm_codeap.pickle", 'wb') as f:   
        print(f)
        pickle.dump(initgmm_codeap, f)

In [None]:
# get initial gmm params
init_W = initgmm.param.weights_
init_jmean = initgmm.param.means_
init_jcov = initgmm.param.covariances_
sddim = init_jmean.shape[1] // 2
init_meanX = init_jmean[:, :sddim]
init_meanY = init_jmean[:, sddim:]
init_covXX = init_jcov[:, :sddim, :sddim]
init_covXY = init_jcov[:, :sddim, sddim:]
init_covYX = init_jcov[:, sddim:, :sddim]
init_covYY = init_jcov[:, sddim:, sddim:]
fitted_source = init_meanX
fitted_target = init_meanY

# get initial codeap gmm params
init_W_codeap = initgmm_codeap.param.weights_
init_jmean_codeap = initgmm_codeap.param.means_
init_jcov_codeap = initgmm_codeap.param.covariances_
sddim_codeap = init_jmean_codeap.shape[1] // 2
init_meanX_codeap = init_jmean_codeap[:, :sddim_codeap]
init_meanY_codeap = init_jmean_codeap[:, sddim_codeap:]
init_covXX_codeap = init_jcov_codeap[:, :sddim_codeap, :sddim_codeap]
init_covXY_codeap = init_jcov_codeap[:, :sddim_codeap, sddim_codeap:]
init_covYX_codeap = init_jcov_codeap[:, sddim_codeap:, :sddim_codeap]
init_covYY_codeap = init_jcov_codeap[:, sddim_codeap:, sddim_codeap:]
fitted_source_codeap = init_meanX_codeap
fitted_target_codeap = init_meanY_codeap

In [None]:
sv, sv_codeap = None, None
if os.path.exists(output_path +  "_sv.npy") \
    and os.path.exists(output_path + "_sv_codeap.npy"):
    sv = np.array(sv)
    sv_codeap = np.array(sv_codeap)
    sv = np.load(output_path + '_sv.npy')
    sv_codeap = np.load(output_path + '_sv_codeap.npy')
    
else:
    depengmm, depengmm_codeap = None, None
    depenjnt, depenjnt_codeap = None, None
    sv, sv_codeap = [], []
    for i in range(0, len(mid_mceplist), len(org_mceplist)):
        with open(output_path + "_jnt_mcep_{}_.pickle".format(i), 'rb') as f:  
            depenjnt = pickle.load(f)
            depengmm = GMMTrainer()
            depengmm.param.weights_ = init_W
            depengmm.param.means_ = init_jmean
            depengmm.param.covariances_ = init_jcov
            depengmm.train(depenjnt)
            sv.append(depengmm.param.means_)
        with open(output_path + "_jnt_codeap_{}_.pickle".format(i), 'rb') as f:  
            depenjnt_codeap = pickle.load(f)
            depengmm_codeap = GMMTrainer()
            depengmm_codeap.param.weights_ = init_W_codeap
            depengmm_codeap.param.means_ = init_jmean_codeap
            depengmm_codeap.param.covariances_ = init_jcov_codeap
            depengmm_codeap.train(depenjnt_codeap)
            sv_codeap.append(depengmm_codeap.param.means_)
    sv = np.array(sv)
    sv_codeap = np.array(sv_codeap)
    np.save(output_path + "_sv", sv)
    np.save(output_path + "_sv_codeap", sv_codeap)
print(sv.shape)

In [None]:
# PCA
decomp = 21 # decomposition dim
n_mix = 64
S = int(len(mid_mceplist) / len(org_mceplist))
assert S == 22

source_pca = sklearn.decomposition.PCA()
source_pca.fit(sv[:,:,:sddim].reshape((S, n_mix * sddim)))

target_pca = sklearn.decomposition.PCA()
target_pca.fit(sv[:,:,sddim:].reshape((S, n_mix * sddim)))

eigenvectors = source_pca.components_.reshape((n_mix, sddim, S)), target_pca.components_.reshape((n_mix, sddim, S))
biasvectors = source_pca.mean_.reshape((n_mix, sddim)), target_pca.mean_.reshape((n_mix, sddim))

In [None]:
print(sv_codeap[:,:,3:].shape)

In [None]:
source_pca_codeap = sklearn.decomposition.PCA()
source_pca_codeap.fit(sv_codeap[:,:,:sddim_codeap].reshape((S, n_mix * sddim_codeap)))

target_pca_codeap = sklearn.decomposition.PCA()
target_pca_codeap.fit(sv_codeap[:,:,sddim_codeap:].reshape((S, n_mix * sddim_codeap)))

eigenvectors_codeap = source_pca_codeap.components_.reshape((n_mix, sddim_codeap, S)), target_pca_codeap.components_.reshape((n_mix, sddim_codeap, S))
biasvectors_codeap = source_pca_codeap.mean_.reshape((n_mix, sddim_codeap)), target_pca_codeap.mean_.reshape((n_mix, sddim_codeap))

In [None]:
# 4. estimate statistic features.
for_convert_source = __same_path + 'input/EJM10/V01/T01/TIMIT/000/*.wav' 
for_convert_target = __same_path + 'adaptation/EJF01/V01/T01/ATR503/A/*.wav'

print("## estimate static fetures ##")

src_f0list = []
src_splist = []
src_mceplist = []
src_aplist = []
src_npowlist = []
src_codeaplist = []
for files in sorted(glob.iglob(for_convert_source, recursive=True)):
    wavf = files
    x, fs = sf.read(wavf)
    x = np.array(x, dtype=np.float)
    x = low_cut_filter(x, fs, cutoff=70)
    assert fs == 16000

    print("extract acoustic featuers: " + wavf)

    f0, sp, ap = feat.analyze(x)
    mcep = feat.mcep()
    npow = feat.npow()
    codeap = feat.codeap()
    
    src_f0list.append(f0)
    src_splist.append(sp)
    src_mceplist.append(mcep)
    src_aplist.append(ap)
    src_npowlist.append(npow)
    src_codeaplist.append(codeap)


tar_f0list = []
tar_mceplist = []
tar_aplist = []
tar_npowlist = []
tar_splist = []
tar_codeaplist = []

for files in sorted(glob.iglob(for_convert_target, recursive=True)):
    wavf = files
    x, fs = sf.read(wavf)
    x = np.array(x, dtype=np.float)
    x = low_cut_filter(x, fs, cutoff=70)
    assert fs == 16000

    print("extract acoustic featuers: " + wavf)

    f0, sp, ap = feat.analyze(x)
    mcep = feat.mcep()
    npow = feat.npow()
    codeap = feat.codeap()
    name, ext = os.path.splitext(wavf)

    tar_f0list.append(f0)
    tar_splist.append(sp)
    tar_mceplist.append(mcep)
    tar_aplist.append(ap)
    tar_npowlist.append(npow)
    tar_codeaplist.append(codeap)

In [None]:
f0stats = F0statistics()
srcf0stats = f0stats.estimate(org_f0list)
tarf0stats = f0stats.estimate(tar_f0list)

gv = GV()
srcgvstats = gv.estimate(org_mceplist)
targvstats = gv.estimate(tar_mceplist)

In [None]:
# 5. fitting target
epoch = 100

fitgmm = sklearn.mixture.GaussianMixture(n_components=sddim, covariance_type='full', max_iter=100)
fitgmm.weights_ = init_W
print(init_W.shape)
fitgmm.means_ = init_meanY
print(init_meanY.shape)
fitgmm.covariances_ = init_covYY
fitgmm.precisions_cholesky_ = _compute_precision_cholesky(init_covYY, 'full')
fitted_target = None

for i in range(len(tar_mceplist)):
    print("adapt: ", i+1, "/", len(tar_mceplist))
    target = tar_mceplist[i]
    target_pow = target[:, 0]
    target = target[:, 1:]
    for x in range(epoch):
        print("epoch = ", x)
        predict = fitgmm.predict_proba(np.atleast_2d(static_delta(target)))
        print(static_delta(target).shape, predict.shape, target.shape, biasvectors[1][0].shape, sddim)
        
        y = np.sum([predict[:, k: k + 1] * (static_delta(target) - biasvectors[1][k]) for k in range(n_mix)], axis = 1)
        gamma = np.sum(predict, axis = 0)

        left = np.sum([gamma[k] * np.dot(eigenvectors[1][k].T,
                                         np.linalg.solve(fitgmm.covariances_, eigenvectors[1])[k])
                       for k in range(n_mix)], axis=0)
        right = np.sum([np.dot(eigenvectors[1][k].T, 
                               np.linalg.solve(fitgmm.covariances_, y)[k]) 
                        for k in range(n_mix)], axis = 0)
        weight = np.linalg.solve(left, right)

        fitted_target = np.dot(eigenvectors[1], weight) + biasvectors[1]
        fitgmm.means_ = fitted_target

In [None]:
def mcepconvert(source, weights, jmean, meanX, covarXX, covarXY, covarYX, covarYY,
                fitted_source, fitted_target):
    """
    """
    M = 64
    # set_pX (like a GMMConverter) 
    px = sklearn.mixture.GaussianMixture(n_components=M, covariance_type='full')
    px.weights_ = weights
    px.means_ = meanX
    px.covariances_ = covarXX
    px.precisions_cholesky_ = _compute_precision_cholesky(covarXX, 'full')
    
    # set_Ab (like a GMMConverter)
    sddim = jmean.shape[1] // 2
    covXXinv = np.zeros((M, sddim, sddim))
    for m in range(M):
        covXXinv[m] = np.linalg.inv(covarXX[m])
    A = np.zeros((M, sddim, sddim))
    b = np.zeros((M, sddim))
    cond_cov_inv = np.zeros((M, sddim, sddim))
    for m in range(M):
        A[m] = covarYX[m] @ covXXinv[m]
        b[m] = fitted_target[m] - A[m] @ meanX[m]
        cond_cov_inv[m] = np.linalg.inv(covarYY[m] - A[m] @ covarXY[m])
    
    # _gmmmap
    T, sddim = source.shape
    wseq = px.predict_proba(source)
    cseq = np.argmax(wseq, axis=1)
    mseq = np.zeros((T, sddim))
    covseq = np.zeros((T, sddim, sddim))
    for t in range(T):
        m = cseq[t]
        mseq[t] = fitted_target[m] + A[m] @ (source[t] - meanX[m])
        covseq[t] = cond_cov_inv[m]
    
    # _mlpg
    T, sddim = mseq.shape
    W = construct_static_and_delta_matrix(T, sddim // 2)
    D = get_diagonal_precision_matrix(T, sddim, covseq)
    WD = W.T @ D
    WDW = WD @ W
    WDM = WD @ mseq.flatten()
    
    output = scipy.sparse.linalg.spsolve(WDW, WDM, use_umfpack=False).reshape(T, sddim // 2)
    
    return output

In [None]:
cvf0 = f0stats.convert(src_f0list[0], srcf0stats, tarf0stats)
cv_mceps = []
temp_mcep = src_mceplist[1]
temp_mcep_0th = temp_mcep[:, 0]
temp_mcep = temp_mcep[:, 1:]
sta_mcep = static_delta(temp_mcep)
cvmcep_wopow = np.array(mcepconvert(sta_mcep, init_W, init_jmean, init_meanX, init_covXX, 
                                   init_covXY, init_covYX, init_covYY, fitted_source, fitted_target))
cvmcep = np.c_[temp_mcep_0th, cvmcep_wopow]
cv_mceps.append(cvmcep)
temp_mcep = src_mceplist[2]
temp_mcep_0th = temp_mcep[:, 0]
temp_mcep = temp_mcep[:, 1:]
sta_mcep = static_delta(temp_mcep)
cvmcep_wopow = np.array(mcepconvert(sta_mcep, init_W, init_jmean, init_meanX, init_covXX, 
                                   init_covXY, init_covYX, init_covYY, fitted_source, fitted_target))
cvmcep = np.c_[temp_mcep_0th, cvmcep_wopow]
cv_mceps.append(cvmcep)
temp_mcep = src_mceplist[0]
temp_mcep_0th = temp_mcep[:, 0]
temp_mcep = temp_mcep[:, 1:]
sta_mcep = static_delta(temp_mcep)
cvmcep_wopow = np.array(mcepconvert(sta_mcep, init_W, init_jmean, init_meanX, init_covXX, 
                                   init_covXY, init_covYX, init_covYY, fitted_source, fitted_target))


In [None]:
cvmcep = np.c_[temp_mcep_0th, cvmcep_wopow]
cvgvstats = gv.estimate(cv_mceps)
cvmcep_wGV = gv.postfilter(cvmcep,targvstats,cvgvstats=cvgvstats)

In [None]:
temp_sm = pysptk.mc2sp(cvmcep, 0.42, 1024)
wav = pw.synthesize(cvf0, temp_sm, src_aplist[0], 16000, frame_period=5)
sf.write("temp.wav", wav, 16000)