In [83]:
import parselmouth
import pickle
import os
import numpy as np
from scipy.signal import decimate
import torch
import plla_tisvs.data as data
import plla_tisvs.model as model
import plla_tisvs.utils as utils
import plla_tisvs.testx as testx
import json
from plla_tisvs.estimate_alignment import optimal_alignment_path, compute_phoneme_onsets
from plla_tisvs.preprocessing_input import Custom_data_set

In [84]:
# ---- Inputs for the alignment run ------------------------------------
dict_path = "./plla_tisvs/dicts"
model_path = "./plla_tisvs/trained_models/" + "JOINT3"
phoneme_dict_path = "cmu_word2cmu_phoneme_extra.pickle"
audio_paths = ["E:/Speech_data_set/alignment_test/rolling_in_the_deep.wav"]
transcript_paths = ["E:/Speech_data_set/alignment_test/rolling_in_the_deep.txt"]

# Parse the first audio/transcript pair into model inputs plus the
# phoneme and word lists used by later cells.
data_parser = Custom_data_set(dict_path, phoneme_dict_path)
first_audio, first_transcript = audio_paths[0], transcript_paths[0]
audio, phoneme_idx, phoneme_list_full, word_list = data_parser.parse(first_audio, first_transcript)

# Disabled workaround kept from the original notebook
# ("remove this if it starts causing error"):
# audio = torch.unsqueeze(audio, 0)
# audio = audio.tile((1, 2, 1))
# print(audio.shape)

# ---- Model ------------------------------------------------------------
# GPU selection is disabled; inference is pinned to the CPU.
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = 'cpu'
print("Device:", device)
target = 'vocals'

model_to_test = testx.load_model(target, model_path, device)
model_to_test.return_alphas = True
model_to_test.eval()

# ---- Model configuration (<model_path>/<target>.json) -----------------
config_file = os.path.join(model_path, target + '.json')
with open(config_file, 'r') as stream:
    config = json.load(stream)
model_args = config['args']
samplerate, text_units = model_args['samplerate'], model_args['text_units']
nfft, nhop = model_args['nfft'], model_args['nhop']

# ---- Inference and alignment ------------------------------------------
# The model call is unpacked into three values; gradients are not needed.
with torch.no_grad():
    vocals_estimate, alphas, scores = model_to_test((audio, phoneme_idx))

optimal_path_scores = optimal_alignment_path(scores, mode='max_numpy', init=200)

# Onsets are derived from the alignment path using the model's hop length
# and sample rate (presumably in seconds -- TODO confirm).
phoneme_onsets = compute_phoneme_onsets(optimal_path_scores, hop_length=nhop, sampling_rate=samplerate)

Device: cpu


In [85]:
# Decode the first entry of phoneme_idx back into phoneme symbols via the parser.
phoneme_list = data_parser.get_phonemes(phoneme_idx[0])
# One less than the number of onsets; used as the loop bound when printing
# (onset[i], onset[i + 1]) pairs so that i + 1 stays in range.
length_of_list = len(phoneme_onsets) - 1

In [86]:
# Show the decoded phoneme sequence.
print(phoneme_list)

['$', 'W', '>', 'IY', '>', 'K', '>', 'UH', '>', 'D', '>', 'HH', '>', 'AE', '>', 'V', '>', 'HH', '>', 'AE', '>', 'D', '>', 'IH', '>', 'T', '>', 'AO', '>', 'L', '>', 'R', '>', 'OW', '>', 'L', '>', 'IH', '>', 'NG', '>', 'IH', '>', 'N', '>', 'DH', '>', 'AH', '>', 'D', '>', 'IY', '>', 'P', '>', 'Y', '>', 'UW', '>', 'HH', '>', 'AE', '>', 'D', '>', 'M', '>', 'AY', '>', 'HH', '>', 'AA', '>', 'R', '>', 'T', '>', 'IH', '>', 'N', '>', 'S', '>', 'AY', '>', 'D', '>', 'AH', '>', 'V', '>', 'Y', '>', 'AO', '>', 'R', '>', 'HH', '>', 'AE', '>', 'N', '>', 'D', '>', 'Z', '>', 'AH', '>', 'N', '>', 'D', '>', 'Y', '>', 'UW', '>', 'P', '>', 'L', '>', 'EY', '>', 'D', '>', 'IH', '>', 'T', '>', 'T', '>', 'UW', '>', 'DH', '>', 'AH', '>', 'B', '>', 'IY', '>', 'T', '$']


In [87]:
# Print each phoneme (skipping index 0) with its onset and the onset that follows it.
for idx in range(1, length_of_list):
    onset, next_onset = phoneme_onsets[idx], phoneme_onsets[idx + 1]
    print(phoneme_list[idx], '\t', onset, next_onset)
    

W 	 0.016 0.112
> 	 0.112 0.128
IY 	 0.128 0.352
> 	 0.352 0.368
K 	 0.368 0.4
> 	 0.4 0.416
UH 	 0.416 0.848
> 	 0.848 0.864
D 	 0.864 0.912
> 	 0.912 0.928
HH 	 0.928 0.944
> 	 0.944 0.96
AE 	 0.96 1.024
> 	 1.024 1.04
V 	 1.04 1.056
> 	 1.056 1.072
HH 	 1.072 1.216
> 	 1.216 1.232
AE 	 1.232 1.28
> 	 1.28 1.296
D 	 1.296 1.312
> 	 1.312 1.344
IH 	 1.344 1.36
> 	 1.36 1.408
T 	 1.408 1.424
> 	 1.424 1.52
AO 	 1.52 5.616
> 	 5.616 5.632
L 	 5.632 5.648
> 	 5.648 5.664
R 	 5.664 5.84
> 	 5.84 5.888
OW 	 5.888 5.952
> 	 5.952 5.984
L 	 5.984 6.336
> 	 6.336 6.448
IH 	 6.448 6.608
> 	 6.608 6.624
NG 	 6.624 6.64
> 	 6.64 6.688
IH 	 6.688 9.28
> 	 9.28 9.296
N 	 9.296 9.408
> 	 9.408 9.424
DH 	 9.424 9.44
> 	 9.44 9.456
AH 	 9.456 9.504
> 	 9.504 9.52
D 	 9.52 9.712
> 	 9.712 9.728
IY 	 9.728 9.744
> 	 9.744 9.76
P 	 9.76 9.776
> 	 9.776 9.792
Y 	 9.792 9.808
> 	 9.808 9.824
UW 	 9.824 9.84
> 	 9.84 9.856
HH 	 9.856 9.872
> 	 9.872 9.888
AE 	 9.888 9.904
> 	 9.904 9.92
D 	 9.92 9.936
> 	 

In [88]:
import os
import textgrids

In [98]:
# Paths handed to the Montreal Forced Aligner command line.
data_set_path = "E:/Speech_data_set/alignment_test"
output_path = "E:/Speech_data_set/alignment_test/i_dont_love_you"
dict_path = "C:/Users/evansamaa/Desktop/jali_sing/util/mfa_english_dict.txt"

# Command layout: mfa align <corpus dir> <dictionary> <acoustic model> <output dir>.
# NOTE(review): the arguments are joined with plain spaces and not quoted,
# so a path containing a space would be split by the shell.
command_context = " ".join(["mfa", "align", data_set_path, dict_path, "english", output_path])

print(command_context)
os.system(command_context)

mfa align E:/Speech_data_set/alignment_test C:/Users/evansamaa/Desktop/jali_sing/util/mfa_english_dict.txt english E:/Speech_data_set/alignment_test/i_dont_love_you


0

In [90]:
# Load a TextGrid from the aligner output directory.
# NOTE(review): the file name (child_in_time_1_for_mfa) does not match the
# rolling_in_the_deep inputs used earlier -- presumably left from another run; confirm.
grid = textgrids.TextGrid(output_path + "/child_in_time_1_for_mfa.TextGrid")

In [91]:
# print(grid.items())
# Show every interval stored in the "phones" tier, one per line.
phones_tier = grid["phones"]
num_of_intervals = len(phones_tier)
for idx in range(num_of_intervals):
    print(phones_tier[idx])

<Interval text="" xmin=0.0 xmax=0.55>
<Interval text="S" xmin=0.55 xmax=0.83>
<Interval text="W" xmin=0.83 xmax=0.95>
<Interval text="IY1" xmin=0.95 xmax=1.22>
<Interval text="T" xmin=1.22 xmax=1.37>
<Interval text="CH" xmin=1.37 xmax=1.55>
<Interval text="AY1" xmin=1.55 xmax=1.93>
<Interval text="L" xmin=1.93 xmax=2.0>
<Interval text="D" xmin=2.0 xmax=2.07>
<Interval text="IH0" xmin=2.07 xmax=2.1>
<Interval text="N" xmin=2.1 xmax=2.18>
<Interval text="T" xmin=2.18 xmax=2.31>
<Interval text="AY1" xmin=2.31 xmax=2.85>
<Interval text="M" xmin=2.85 xmax=2.88>
<Interval text="Y" xmin=2.88 xmax=2.91>
<Interval text="UW1" xmin=2.91 xmax=2.94>
<Interval text="L" xmin=2.94 xmax=2.97>
<Interval text="S" xmin=2.97 xmax=3.0>
<Interval text="IY1" xmin=3.0 xmax=4.39>
<Interval text="" xmin=4.39 xmax=4.89>
<Interval text="DH" xmin=4.89 xmax=4.99>
<Interval text="AH1" xmin=4.99 xmax=5.96>
<Interval text="" xmin=5.96 xmax=5.99>
<Interval text="L" xmin=5.99 xmax=6.28>
<Interval text="AY1" xmin=6.28 xma

In [92]:
# Show the whole "words" tier of the loaded TextGrid.
print(grid["words"])

[<Interval text="" xmin=0.0 xmax=0.55>, <Interval text="sweet" xmin=0.55 xmax=1.37>, <Interval text="child" xmin=1.37 xmax=2.07>, <Interval text="in" xmin=2.07 xmax=2.18>, <Interval text="time" xmin=2.18 xmax=2.88>, <Interval text="you'll" xmin=2.88 xmax=2.97>, <Interval text="see" xmin=2.97 xmax=4.39>, <Interval text="" xmin=4.39 xmax=4.89>, <Interval text="the" xmin=4.89 xmax=5.96>, <Interval text="" xmin=5.96 xmax=5.99>, <Interval text="line" xmin=5.99 xmax=8.41>, <Interval text="" xmin=8.41 xmax=9.05>, <Interval text="the" xmin=9.05 xmax=9.18>, <Interval text="" xmin=9.18 xmax=9.21>, <Interval text="line" xmin=9.21 xmax=9.87>, <Interval text="that's" xmin=9.87 xmax=10.09>, <Interval text="drawn" xmin=10.09 xmax=10.7>, <Interval text="between" xmin=10.7 xmax=12.31>, <Interval text="" xmin=12.31 xmax=13.23>, <Interval text="good" xmin=13.23 xmax=13.62>, <Interval text="and" xmin=13.62 xmax=15.6>, <Interval text="bad" xmin=15.6 xmax=16.28>, <Interval text="" xmin=16.28 xmax=16.77>, <I

In [93]:
def compute_word_alignment(phoneme_onsets, phoneme_list_full):
    """Collect [start, end] onset pairs for each word boundary.

    The function walks ``phoneme_onsets`` together with the token list
    ``['EOW'] + phoneme_list_full``. Whenever the current token is ``"EOW"``
    it records ``[word_start, current onset]`` and then moves ``word_start``
    to a following onset. The first recorded pair comes from the prepended
    ``'EOW'`` sentinel and is dropped from the result.

    Args:
        phoneme_onsets: indexable sequence of onset values (must be non-empty).
        phoneme_list_full: list of tokens in which ``"EOW"`` marks the end of
            a word (exact token layout is defined by the caller -- TODO confirm).

    Returns:
        A list of ``[start, end]`` pairs, one per ``"EOW"`` reached after the
        sentinel.

    Raises:
        IndexError: if ``phoneme_onsets`` is empty, or if the token list is
            exhausted before the onsets are.
    """
    word_start = phoneme_onsets[0]
    tokens = ['EOW'] + phoneme_list_full
    last_onset = len(phoneme_onsets) - 1

    spans = []
    tok_pos = 0    # position in tokens
    onset_pos = 0  # position in phoneme_onsets
    while onset_pos < len(phoneme_onsets):
        if tokens[tok_pos] != "EOW":
            # Ordinary token: advance both cursors together.
            tok_pos += 1
            onset_pos += 1
            continue

        spans.append([word_start, phoneme_onsets[onset_pos]])
        if onset_pos == last_onset:
            break

        # NOTE(review): this compares an onset value with the string "<".
        # If onsets are numeric the test is always true and the second
        # stepping rule never runs -- confirm whether a token was meant.
        if phoneme_onsets[min(onset_pos + 1, last_onset)] != "<":
            tok_step, onset_step = 2, 1
        else:
            tok_step, onset_step = 3, 2

        word_start = phoneme_onsets[min(onset_pos + onset_step, last_onset)]
        tok_pos += tok_step
        onset_pos += onset_step

    # Drop the pair produced by the prepended sentinel.
    return spans[1:]

In [94]:
# Derive per-word [start, end] pairs from the phoneme onsets.
word_durations = compute_word_alignment(phoneme_onsets, phoneme_list_full)
# NOTE(review): this prints phoneme_list, not the word_durations computed above -- confirm intent.
print(phoneme_list)

['$', 'W', '>', 'IY', '>', 'K', '>', 'UH', '>', 'D', '>', 'HH', '>', 'AE', '>', 'V', '>', 'HH', '>', 'AE', '>', 'D', '>', 'IH', '>', 'T', '>', 'AO', '>', 'L', '>', 'R', '>', 'OW', '>', 'L', '>', 'IH', '>', 'NG', '>', 'IH', '>', 'N', '>', 'DH', '>', 'AH', '>', 'D', '>', 'IY', '>', 'P', '>', 'Y', '>', 'UW', '>', 'HH', '>', 'AE', '>', 'D', '>', 'M', '>', 'AY', '>', 'HH', '>', 'AA', '>', 'R', '>', 'T', '>', 'IH', '>', 'N', '>', 'S', '>', 'AY', '>', 'D', '>', 'AH', '>', 'V', '>', 'Y', '>', 'AO', '>', 'R', '>', 'HH', '>', 'AE', '>', 'N', '>', 'D', '>', 'Z', '>', 'AH', '>', 'N', '>', 'D', '>', 'Y', '>', 'UW', '>', 'P', '>', 'L', '>', 'EY', '>', 'D', '>', 'IH', '>', 'T', '>', 'T', '>', 'UW', '>', 'DH', '>', 'AH', '>', 'B', '>', 'IY', '>', 'T', '$']


In [96]:
# Inputs from earlier cells:
#   phoneme_list    -- all phoneme symbols
#   phoneme_onsets  -- all onsets
#   word_list / word_durations -- words and their [start, end] pairs

new_grid = textgrids.TextGrid()  # start from an empty TextGrid
new_grid.xmin = 0
new_grid.xmax = phoneme_onsets[-1]

# "phones" tier: one interval per phoneme from index 1 up to the second-to-last
# onset; the ">" separator is written as an empty label.
new_grid["phones"] = [
    textgrids.Interval(
        "" if phoneme_list[idx] == ">" else phoneme_list[idx],
        phoneme_onsets[idx],
        phoneme_onsets[idx + 1],
    )
    for idx in range(1, len(phoneme_onsets) - 1)
]

# "words" tier: one interval per word, using the matching [start, end] pair.
new_grid["words"] = [
    textgrids.Interval(word, word_durations[idx][0], word_durations[idx][1])
    for idx, word in enumerate(word_list)
]

new_grid.write(output_path + "/rolling_in_the_deep.TextGrid")

In [36]:
import norbert
import soundfile as sf

def istft(X, rate=44100, n_fft=4096, n_hopsize=1024):
    """Invert a scaled STFT back to a time-domain signal.

    ``X`` is divided by ``n_fft / 2`` and passed to ``scipy.signal.istft``
    with ``nperseg=n_fft``, ``noverlap=n_fft - n_hopsize`` and
    ``boundary=True``.

    Args:
        X: complex STFT array laid out as ``scipy.signal.istft`` expects.
        rate: sampling rate forwarded as ``fs``; it only affects the time
            vector, which is discarded here.
        n_fft: segment length used for the forward transform.
        n_hopsize: hop between successive segments.

    Returns:
        The reconstructed signal (second value returned by
        ``scipy.signal.istft``).
    """
    # Imported locally because the notebook only does
    # `from scipy.signal import decimate`, which does not bind the name
    # `scipy`; without this the call below raises NameError on a fresh kernel.
    import scipy.signal

    _, audio = scipy.signal.istft(
        X / (n_fft / 2),
        rate,
        nperseg=n_fft,
        noverlap=n_fft - n_hopsize,
        boundary=True
    )
    return audio

In [25]:
from plla_tisvs.preprocessing_input import Custom_data_set
from plla_tisvs import testx
import numpy as np

# ---- Inputs for the vocal-separation run ------------------------------
audio_path_file = "E:/MASC/voice_seperation_test/child_in_time_raw.wav"
transcript_path = "E:/MASC/voice_seperation_test/child_in_time_raw.txt"
dict_path = "./plla_tisvs/dicts"
model_path = './plla_tisvs/trained_models/{}'.format("JOINT3")
phoneme_dict_path = "cmu_word2cmu_phoneme_extra.pickle"
softmask = True  # Wiener-filter setting for the separation step
niter = 2        # number of Wiener-filter iterations
try:
    data_parser = Custom_data_set(dict_path, phoneme_dict_path)
except Exception:
    # Retry one directory up (covers running the notebook from a sub-folder).
    # `except Exception` replaces the bare `except:` so KeyboardInterrupt
    # and SystemExit are no longer swallowed.
    dict_path = "." + dict_path
    data_parser = Custom_data_set(dict_path, phoneme_dict_path)
audio, phoneme_idx, phoneme_list_full, word_list = data_parser.parse(audio_path_file,
                                                                          transcript_path)


In [42]:
device = 'cpu'
target = 'vocals'

# Load the model; if loading from model_path fails, retry one directory up.
try:
    model_to_test = testx.load_model(target, model_path, device)
except Exception:
    # `except Exception` replaces the bare `except:` so KeyboardInterrupt
    # and SystemExit are no longer swallowed.
    model_path = "." + model_path
    model_to_test = testx.load_model(target, model_path, device)
model_to_test.eval()
model_to_test.return_alphas = True

# Inference only: skip autograd bookkeeping, matching the alignment cell.
with torch.no_grad():
    out = model_to_test((audio, phoneme_idx))

# out[0] is used as the source estimate and out[1] as the alphas.
alphas = out[1].cpu().detach().numpy()
Vj = out[0].cpu().detach().numpy()

In [48]:
# Model output is (nb_frames, nb_samples, nb_channels, nb_bins).
# Drop the sample dimension, stack the single source, and reorder to
# (nb_frames, nb_bins, nb_channels, nb_sources).
# source_names += [target]
V = np.transpose(np.array([Vj[:, 0, ...]]), (1, 3, 2, 0))

# Mixture STFT from the model's own STFT layer.
X = model_to_test.stft(audio).detach().cpu().numpy()
# The last axis holds real and imaginary parts; combine into complex values.
X = X[..., 0] + X[..., 1] * 1j
# Take the first sample and reverse its three axes.
X = X[0].transpose(2, 1, 0)

# Append a residual source so the estimates account for the whole mixture.
V = norbert.residual_model(V, X, 1)

# Use the notebook's `softmask` setting instead of a hard-coded True, so it
# is honoured in the same way as `niter`.
Y = norbert.wiener(V, X.astype(np.complex128), niter,
                   use_softmask=softmask)

# Invert each named source back to the time domain (index j selects the source).
estimates = {}
for j, name in enumerate(["vocals"]):
    audio_hat = istft(
        Y[..., j].T,
        n_fft=model_to_test.stft.n_fft,
        n_hopsize=model_to_test.stft.n_hop
    )
    estimates[name] = audio_hat.T

In [49]:
# Write the separated vocals next to the input file; [:-4] strips a 4-character extension such as ".wav".
# NOTE(review): the 16000 Hz sample rate is hard-coded -- presumably matches the model config's samplerate; confirm.
sf.write(audio_path_file[:-4] + "_vocals.wav", estimates['vocals'], 16000)