In [1]:
import parselmouth
import pickle
import os
import numpy as np
from scipy.signal import decimate
import torch
import plla_tisvs.data as data
import plla_tisvs.model as model
import plla_tisvs.utils as utils
import plla_tisvs.testx as testx
import json
from plla_tisvs.estimate_alignment import optimal_alignment_path, compute_phoneme_onsets
from plla_tisvs.preprocessing_input import Custom_data_set

In [2]:
# ---------------------------------------------------------------------------
# Configuration: dictionary locations, trained model, and the input recording
# ---------------------------------------------------------------------------
model_name = "JOINT3"
model_path = './plla_tisvs/trained_models/{}'.format(model_name)
dict_path = "./plla_tisvs/dicts"
phoneme_dict_path = "cmu_word2cmu_phoneme_extra.pickle"
audio_paths = ["/Volumes/EVAN_DISK/ten_videos/Child_in_time/Child_in_time_1/audio.wav"]
transcript_paths = ["/Volumes/EVAN_DISK/ten_videos/Child_in_time/Child_in_time_1/audio.txt"]
target = 'vocals'

# ---------------------------------------------------------------------------
# Parse the first audio/transcript pair into model inputs
# ---------------------------------------------------------------------------
data_parser = Custom_data_set(dict_path, phoneme_dict_path)
first_audio_path, first_transcript_path = audio_paths[0], transcript_paths[0]
audio, phoneme_idx = data_parser.parse(first_audio_path, first_transcript_path)

# ---------------------------------------------------------------------------
# Load the trained model on the CPU
# ---------------------------------------------------------------------------
# GPU selection is intentionally disabled; the original alternative was:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = 'cpu'
print("Device:", device)

model_to_test = testx.load_model(target, model_path, device)
# The forward call below unpacks three values (estimate, alphas, scores);
# presumably this flag is what makes the model return the extra two -- TODO confirm.
model_to_test.return_alphas = True
model_to_test.eval()

# ---------------------------------------------------------------------------
# Read the training configuration stored next to the model weights
# ---------------------------------------------------------------------------
config_file = os.path.join(model_path, target + '.json')
with open(config_file, 'r') as stream:
    config = json.load(stream)

model_args = config['args']
samplerate = model_args['samplerate']
text_units = model_args['text_units']
nfft = model_args['nfft']
nhop = model_args['nhop']

# ---------------------------------------------------------------------------
# Inference (no gradients) followed by alignment of the resulting scores
# ---------------------------------------------------------------------------
model_inputs = (audio, phoneme_idx)
with torch.no_grad():
    vocals_estimate, alphas, scores = model_to_test(model_inputs)

# `mode` and `init` are passed through unchanged; their meaning is defined by
# optimal_alignment_path and is not visible from this notebook.
optimal_path_scores = optimal_alignment_path(scores, mode='max_numpy', init=200)

# Onsets are derived from the alignment path using the model's hop size and
# sample rate taken from the config loaded above.
phoneme_onsets = compute_phoneme_onsets(
    optimal_path_scores,
    hop_length=nhop,
    sampling_rate=samplerate,
)

(2, 717507)
Device: cpu


  normalized, onesided, return_complex)


In [4]:
# Look up the phonemes for the first entry of the parsed phoneme indices.
first_phoneme_sequence = phoneme_idx[0]
phoneme_list = data_parser.get_phonemes(first_phoneme_sequence)

In [5]:
# Show the phoneme sequence that was fed to the model.
phoneme_list_text = str(phoneme_list)
print(phoneme_list_text)

['$', 'S', '>', 'W', '>', 'IY', '>', 'T', '>', 'CH', '>', 'AY', '>', 'L', '>', 'D', '>', 'IH', '>', 'N', '>', 'T', '>', 'AY', '>', 'M', '>', 'Y', '>', 'UW', '>', 'L', '>', 'S', '>', 'IY', '>', 'DH', '>', 'AH', '>', 'L', '>', 'AY', '>', 'N', '>', 'DH', '>', 'AH', '>', 'L', '>', 'AY', '>', 'N', '>', 'DH', '>', 'AE', '>', 'T', '>', 'S', '>', 'D', '>', 'R', '>', 'AO', '>', 'N', '>', 'B', '>', 'IH', '>', 'T', '>', 'W', '>', 'IY', '>', 'N', '>', 'G', '>', 'UH', '>', 'D', '>', 'AH', '>', 'N', '>', 'D', '>', 'B', '>', 'AE', '>', 'D', '>', 'S', '>', 'IY', '>', 'DH', '>', 'AH', '>', 'B', '>', 'L', '>', 'AY', '>', 'N', '>', 'D', '>', 'M', '>', 'AE', '>', 'N', '>', 'SH', '>', 'UW', '>', 'T', '>', 'IH', '>', 'NG', '>', 'AE', '>', 'T', '>', 'DH', '>', 'AH', '>', 'W', '>', 'ER', '>', 'L', '>', 'D', '>', 'B', '>', 'UH', '>', 'L', '>', 'AH', '>', 'T', '>', 'S', '>', 'F', '>', 'L', '>', 'AY', '>', 'IH', '>', 'NG', '>', 'OW', '>', 'T', '>', 'EY', '>', 'K', '>', 'IH', '>', 'NG', '>', 'T', '>', 'OW', '>', 