In [1]:
import parselmouth
import pickle
import os
import numpy as np
from scipy.signal import decimate
import torch
import plla_tisvs.data as data
import plla_tisvs.model as model
import plla_tisvs.utils as utils
import plla_tisvs.testx as testx
import json
from plla_tisvs.estimate_alignment import optimal_alignment_path, compute_phoneme_onsets
from plla_tisvs.preprocessing_input import Custom_data_set

In [2]:
# --- Configuration -----------------------------------------------------------
# Phoneme dictionary location and the directory of the trained model to use.
dict_path = "./plla_tisvs/dicts"
phoneme_dict_path = "cmu_word2cmu_phoneme_extra.pickle"
model_path = './plla_tisvs/trained_models/{}'.format("JOINT3")

# Recording and matching transcript. Only the first entry of each list is used
# by the parse call below.
# NOTE(review): absolute paths on an external volume -- they need editing
# before this cell can run on another machine.
audio_paths = ["/Volumes/EVAN_DISK/ten_videos/Child_in_time/Child_in_time_1/audio.wav"]
transcript_paths = ["/Volumes/EVAN_DISK/ten_videos/Child_in_time/Child_in_time_1/audio.txt"]

# Inference is pinned to the CPU. To pick a GPU when one is available, use
#   torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = 'cpu'
target = 'vocals'

# --- Parse the audio / transcript pair ---------------------------------------
data_parser = Custom_data_set(dict_path, phoneme_dict_path)
audio, phoneme_idx = data_parser.parse(audio_paths[0], transcript_paths[0])

# --- Load the trained model --------------------------------------------------
print("Device:", device)
model_to_test = testx.load_model(target, model_path, device)
# Presumably what makes the forward pass below return `alphas` (and `scores`)
# in addition to the vocals estimate -- confirm in plla_tisvs.model.
model_to_test.return_alphas = True
model_to_test.eval()

# --- Read the settings stored next to the model ------------------------------
config_path = os.path.join(model_path, target + '.json')
with open(config_path, 'r') as config_file:
    config = json.load(config_file)

model_args = config['args']
samplerate = model_args['samplerate']
text_units = model_args['text_units']
nfft = model_args['nfft']
nhop = model_args['nhop']

# --- Run the model and derive phoneme onsets ---------------------------------
# Gradients are not needed for inference.
with torch.no_grad():
    vocals_estimate, alphas, scores = model_to_test((audio, phoneme_idx))

# NOTE(review): the meaning of init=200 is defined in
# plla_tisvs.estimate_alignment and is not visible here -- TODO document it.
optimal_path_scores = optimal_alignment_path(scores, mode='max_numpy', init=200)

# The model's hop length and sample rate are passed along so the helper can
# express onsets in time units (presumably seconds -- verify against the helper).
phoneme_onsets = compute_phoneme_onsets(optimal_path_scores, hop_length=nhop, sampling_rate=samplerate)

(2, 717507)
Device: cpu


  normalized, onesided, return_complex)


In [10]:
# Upper bound for the printing loop further down: each printed row reads
# phoneme_onsets[i] and phoneme_onsets[i + 1], so the index must stop early.
length_of_list = len(phoneme_onsets) - 1

# Turn the first index sequence back into its phoneme symbols.
# NOTE(review): index 0 is presumably the batch dimension -- confirm against
# Custom_data_set.parse.
phoneme_list = data_parser.get_phonemes(phoneme_idx[0])

In [11]:
# Show the full symbol sequence returned by data_parser.get_phonemes so it can
# be checked by eye against the transcript.
print(phoneme_list)

['$', 'S', '>', 'W', '>', 'IY', '>', 'T', '>', 'CH', '>', 'AY', '>', 'L', '>', 'D', '>', 'IH', '>', 'N', '>', 'T', '>', 'AY', '>', 'M', '>', 'Y', '>', 'UW', '>', 'L', '>', 'S', '>', 'IY', '>', 'DH', '>', 'AH', '>', 'L', '>', 'AY', '>', 'N', '>', 'DH', '>', 'AH', '>', 'L', '>', 'AY', '>', 'N', '>', 'DH', '>', 'AE', '>', 'T', '>', 'S', '>', 'D', '>', 'R', '>', 'AO', '>', 'N', '>', 'B', '>', 'IH', '>', 'T', '>', 'W', '>', 'IY', '>', 'N', '>', 'G', '>', 'UH', '>', 'D', '>', 'AH', '>', 'N', '>', 'D', '>', 'B', '>', 'AE', '>', 'D', '>', 'S', '>', 'IY', '>', 'DH', '>', 'AH', '>', 'B', '>', 'L', '>', 'AY', '>', 'N', '>', 'D', '>', 'M', '>', 'AE', '>', 'N', '>', 'SH', '>', 'UW', '>', 'T', '>', 'IH', '>', 'NG', '>', 'AE', '>', 'T', '>', 'DH', '>', 'AH', '>', 'W', '>', 'ER', '>', 'L', '>', 'D', '>', 'B', '>', 'UH', '>', 'L', '>', 'AH', '>', 'T', '>', 'S', '>', 'F', '>', 'L', '>', 'AY', '>', 'IH', '>', 'NG', '>', 'OW', '>', 'T', '>', 'EY', '>', 'K', '>', 'IH', '>', 'NG', '>', 'T', '>', 'OW', '>', 

In [14]:
# Print one row per symbol: the symbol, its onset, and the next symbol's onset
# (used here as the end of the current one). Index 0 is skipped, and the final
# symbol is left out because there is no following onset to close it.
for idx in range(1, length_of_list):
    symbol = phoneme_list[idx]
    start, end = phoneme_onsets[idx], phoneme_onsets[idx + 1]
    print(symbol, '\t', start, end)

S 	 1.728 1.744
> 	 1.744 1.76
W 	 1.76 1.776
> 	 1.776 1.936
IY 	 1.936 4.976
> 	 4.976 7.168
T 	 7.168 7.184
> 	 7.184 7.2
CH 	 7.2 7.632
> 	 7.632 8.08
AY 	 8.08 8.128
> 	 8.128 8.16
L 	 8.16 10.144
> 	 10.144 12.08
D 	 12.08 12.096
> 	 12.096 12.912
IH 	 12.912 13.44
> 	 13.44 13.456
N 	 13.456 13.744
> 	 13.744 13.76
T 	 13.76 13.904
> 	 13.904 14.112
AY 	 14.112 14.64
> 	 14.64 16.368
M 	 16.368 16.384
> 	 16.384 18.4
Y 	 18.4 18.992
> 	 18.992 19.456
UW 	 19.456 21.472
> 	 21.472 23.376
L 	 23.376 23.392
> 	 23.392 23.488
S 	 23.488 23.872
> 	 23.872 23.888
IY 	 23.888 24.0
> 	 24.0 24.016
DH 	 24.016 24.032
> 	 24.032 24.128
AH 	 24.128 24.672
> 	 24.672 24.832
L 	 24.832 24.976
> 	 24.976 24.992
AY 	 24.992 25.264
> 	 25.264 25.296
N 	 25.296 25.328
> 	 25.328 25.344
DH 	 25.344 25.36
> 	 25.36 25.6
AH 	 25.6 27.424
> 	 27.424 27.44
L 	 27.44 27.6
> 	 27.6 28.768
AY 	 28.768 28.784
> 	 28.784 28.8
N 	 28.8 28.816
> 	 28.816 28.832
DH 	 28.832 28.848
> 	 28.848 28.912
AE 	 28.9