In [6]:
from util.SongDataStructure import Minimal_song_data_structure
from matplotlib import pyplot as plt
import numpy as np
from scipy.interpolate import interp1d
import json

In [7]:
from models.vowel_modification_detector import vowel_mod_detector
# Instantiate the detector once for the whole notebook. Later cells call
# `vowel_mod(samples)` on a slice of raw audio samples and unpack the result
# as a pair `(fine_output, coarse_output)`; only the coarse output is used
# further on.
# NOTE(review): presumably construction loads model weights - confirm whether
# any path/device configuration is expected here.
vowel_mod = vowel_mod_detector()

In [8]:
import librosa

# Input block

In [112]:
# ---------------------------------------------------------------------------
# Inputs for this run
# ---------------------------------------------------------------------------
# viseme_list[k] is the name of the k-th viseme; viseme_interval[k] is that
# viseme's list of [time, value] key-frames. Later cells read entry [1] as the
# start and entry [-1] as the end of the audio window analysed for the viseme.
# NOTE(review): times look like seconds into the clip - confirm.
viseme_list = ['Ah_pointer', 'Ah_pointer', 'Ah_pointer']
viseme_interval = [
    [[-0.04454296135109409, 0], [0.0754570386489059, 8],
     [0.9172593342532398, 6.0], [1.3178600994546845, 0]],
    [[1.5561014036241034, 0], [1.6761014036241033, 8],
     [2.3056231203369095, 6.0], [2.6354636925745116, 0]],
    [[3.0884255822893834, 0], [3.2084255822893835, 8],
     [3.6622668199195463, 6.0], [3.9335472324629337, 0]],
]

# All three files share one directory and one clip stem; derive them from a
# single place so switching clips cannot leave the paths out of sync.
DATASET_DIR = "C:/Users/evansamaa/Desktop/vowel_model_dataset"
CLIP_NAME = "A_I_A_I_A_I"
audio_path = DATASET_DIR + "/" + CLIP_NAME + ".wav"
textgrid_path = DATASET_DIR + "/" + CLIP_NAME + ".TextGrid"
out_path = DATASET_DIR + "/" + CLIP_NAME + ".json"

# Largest gap (in the same time units as viseme_interval) across which two
# detected intervals of the same vowel class are merged during smoothing.
spike_width = 0.5

In [113]:
# Viseme name -> index of its cardinal vowel class, five-class variant.
vowel2Cardinal5 = {
    "Ah_pointer": 0,
    "Aa_pointer": 1,
    "Eh_pointer": 1,
    "Ee_pointer": 2,
    "Ih_pointer": 2,
    "Oo_pointer": 3,
    "Oh_pointer": 3,
    "Uh_pointer": 0,
    "U_pointer": 4,
    "Eu_pointer": 4,
}

# Viseme name -> index of its cardinal vowel class, three-class variant.
# This is the table the detection loop uses to look up a viseme's own class.
vowel2Cardinal3 = {
    "Ah_pointer": 0,
    "Aa_pointer": 1,
    "Eh_pointer": 1,
    "Ee_pointer": 1,
    "Ih_pointer": 1,
    "Oo_pointer": 2,
    "Oh_pointer": 2,
    "Uh_pointer": 0,
    "U_pointer": 2,
    "Eu_pointer": 2,
}

# control_direction_matrix_coarse[original_class][detected_class] is a flat
# list made of one or more consecutive triples:
#     slider entry 0, slider entry 1, [range_low, range_high]
# The two slider entries are stored as a pair next to every generated
# key-frame, and range_high scales the peak value of the generated curve.
# NOTE(review): the two strings presumably name the rig node and attribute -
# confirm against the consumer of the exported file.
control_direction_matrix_coarse = {
    0: {
        1: ["Dimple", "Dimple", [0, 7]],
        2: ["Pucker", "Pucker", [0, 4]],
    },
    1: {
        0: ["Pucker", "Pucker", [0, 1]],
        2: ["Pucker", "Pucker", [0, 4]],
    },
    2: {
        0: ["self", "Lip Pucker", [0, -3]],
        1: ["self", "Lip Pucker", [0, -6],
            "Dimple", "Dimple", [0, 7]],
    },
}
# Load the clip: the audio file plus its TextGrid annotation. The second
# positional argument is passed as an empty string; its meaning is not visible
# in this notebook. The detection loop samples the waveform through
# `lyric.sound_arr_interp(times)`.
# NOTE(review): presumably `times` are seconds into the clip - confirm.
lyric = Minimal_song_data_structure(audio_path, "", txt_grid_path=textgrid_path)

In [126]:
# ---------------------------------------------------------------------------
# Build the vowel-modification curves.
#
# For every viseme the detector is run on the viseme's audio window, the
# window is split into intervals labelled with the coarse vowel class that the
# audio sounds like, and for every interval whose class differs from the
# viseme's own class four key-frames are emitted per slider triple:
# start (0), end (0), peak, and a decayed point at 75 % of the interval.
#
# Outputs (index-aligned, one entry per key-frame):
#   modification_sliders  - [slider entry 0, slider entry 1]
#   modification_ctrl_pts - [time, value]
# ---------------------------------------------------------------------------
modification_ctrl_pts = []
modification_sliders = []
dt = 0.01  # not read in this cell; kept because other cells may rely on it

# A coarse class only counts as "heard" when its score exceeds this value.
PEAK_THRESHOLD = 0.7
# The running interval must span at least this much time before a change of
# class is accepted; earlier changes are ignored.
MIN_INTERVAL_LENGTH = 0.2

# iterate through the vowels in the list
for i in range(0, len(viseme_list)):
    ##################################################################
    ###################### get the audio signal ######################
    ##################################################################
    # The analysed window runs from the viseme's second key-frame to its last.
    # NOTE(review): the 1/44100 step assumes 44.1 kHz audio and times in
    # seconds - confirm against Minimal_song_data_structure.
    window_start = viseme_interval[i][1][0]
    window_end = viseme_interval[i][-1][0]
    vowel_mod_out, vowel_mod_out_coarse = vowel_mod(
        lyric.sound_arr_interp(np.arange(window_start, window_end, 1.0/44100.0)))
    # One time stamp per detector output frame (rows of vowel_mod_out_coarse).
    xs = np.linspace(window_start, window_end, vowel_mod_out_coarse.shape[0])
    coarse_vowel_sounds_like_interp = interp1d(xs, vowel_mod_out_coarse, axis=0)

    # what the original sound was
    original_vowel_shape = vowel2Cardinal3[viseme_list[i]]
    # Per frame: the strongest class scoring above the threshold, or the
    # viseme's own class when nothing is above it. (Filling the score matrix
    # with the class *index* before argmax, as was done previously, only gave
    # the right answer when the original class was 0.)
    above_threshold = vowel_mod_out_coarse > PEAK_THRESHOLD
    only_peaks = np.where(above_threshold, vowel_mod_out_coarse, 0.0)
    vowel_sounds_like = np.where(np.any(above_threshold, axis=1),
                                 np.argmax(only_peaks, axis=1),
                                 original_vowel_shape)

    ##################################################################
    ### obtain the intervals of which cardinal vowels are dominant ###
    ##################################################################
    # cardinal_intervals holds inclusive [first_frame, last_frame] index pairs
    # into xs; cardinal_list holds the matching class labels.
    cardinal_list = []
    cardinal_intervals = []
    current_interval_start = 0
    current_vowel = original_vowel_shape
    last_frame = vowel_sounds_like.shape[0] - 1
    for t in range(0, vowel_sounds_like.shape[0]):
        if vowel_sounds_like[t] == current_vowel:
            if t == last_frame:
                cardinal_list.append(current_vowel)
                cardinal_intervals.append([current_interval_start, t])
        # `t > 0` keeps xs[t-1] from wrapping around to the last frame when
        # the very first frame already differs from the original class.
        elif t > 0 and xs[t-1] - xs[current_interval_start] >= MIN_INTERVAL_LENGTH:
            cardinal_list.append(current_vowel)
            cardinal_intervals.append([current_interval_start, t-1])
            current_interval_start = t
            current_vowel = vowel_sounds_like[t]
    # NOTE(review): when the final frame differs from current_vowel the
    # trailing interval is never recorded - confirm this is acceptable.

    ###########################################################################
    ######### optionally additional smoothing are added to this here ##########
    ###########################################################################
    # Merge an interval with a following interval of the same class when the
    # gap is at most spike_width - either the direct neighbour, or the one
    # after a short return to the original class. Every interval that is not
    # merged is kept unchanged (previously such intervals were dropped
    # whenever at least two more intervals followed).
    cardinal_list_new = []
    cardinal_intervals_new = []
    j = 0
    while j < len(cardinal_list):
        step = 1
        merged_end = cardinal_intervals[j][1]
        if (j + 1 < len(cardinal_list)
                and cardinal_list[j] == cardinal_list[j+1]
                and xs[cardinal_intervals[j+1][0]] - xs[cardinal_intervals[j][1]] <= spike_width):
            merged_end = cardinal_intervals[j+1][1]
            step = 2
        elif (j + 2 < len(cardinal_list)
                and cardinal_list[j] == cardinal_list[j+2]
                and xs[cardinal_intervals[j+2][0]] - xs[cardinal_intervals[j][1]] <= spike_width
                and cardinal_list[j+1] == original_vowel_shape):
            merged_end = cardinal_intervals[j+2][1]
            step = 3
        cardinal_list_new.append(cardinal_list[j])
        cardinal_intervals_new.append([cardinal_intervals[j][0], merged_end])
        j = j + step
    cardinal_list = cardinal_list_new
    cardinal_intervals = cardinal_intervals_new

    # now set pucker/stretch values based on the detected sound
    for c in range(0, len(cardinal_list)):
        # Nothing to modify when the audio matches the viseme. Class 3 is
        # skipped as well; control_direction_matrix_coarse has no entry for it.
        if original_vowel_shape == cardinal_list[c] or cardinal_list[c] == 3:
            continue

        interval_start = xs[cardinal_intervals[c][0]]
        interval_end = xs[cardinal_intervals[c][1]]
        # Highest score of the detected class inside the interval.
        max_prob = coarse_vowel_sounds_like_interp(
            xs[cardinal_intervals[c][0]:cardinal_intervals[c][1]+1])[:, cardinal_list[c]].max()
        slider_ct_pts = control_direction_matrix_coarse[original_vowel_shape][cardinal_list[c]]

        # the start of this curve should be earlier, e.g. at 75% of the previous interval
        if c == 0:
            # There is no previous interval, so clamp to the viseme's first
            # key-frame instead of indexing cardinal_intervals[-1]. The first
            # interval normally carries the original class and is skipped
            # above, so this branch is a safeguard.
            start_candidate = interval_start - 0.14
            start = max(start_candidate, viseme_interval[i][0][0])
        else:
            previous_start = xs[cardinal_intervals[c-1][0]]
            previous_end = xs[cardinal_intervals[c-1][1]]
            start_candidate = (previous_end - previous_start) * 0.6 + previous_start
            start_candidate = min(start_candidate, previous_end - 0.12)
            start = max(start_candidate, previous_start)

        # The curve returns to zero at the end of the interval when it is the
        # last one, otherwise at the end of the current viseme.
        if c == len(cardinal_list) - 1:
            end = interval_end
        else:
            end = viseme_interval[i][-1][0]

        end_p75 = (interval_end - interval_start) * 0.75 + interval_start

        for s in range(0, len(slider_ct_pts) // 3):
            slider = [slider_ct_pts[0 + 3*s], slider_ct_pts[1 + 3*s]]
            slider_range = slider_ct_pts[2 + 3*s]
            peak_value = max_prob * (slider_range[1])

            # Values are converted to built-in floats so that json.dumps does
            # not fail on NumPy scalar types.
            # add a starting keyframe and ending keyframe
            modification_sliders.append(list(slider))
            modification_ctrl_pts.append([float(start), 0])
            modification_sliders.append(list(slider))
            modification_ctrl_pts.append([float(end), 0])
            # add the peaks in the middle with the decay
            # NOTE(review): the peak is placed 0.13 before the interval start,
            # which can precede the start key-frame - confirm intended.
            modification_sliders.append(list(slider))
            modification_ctrl_pts.append([float(interval_start - 0.13), float(peak_value)])
            modification_sliders.append(list(slider))
            modification_ctrl_pts.append([float(end_p75), float(peak_value * 0.75)])
        

In [127]:
# Collect every animation channel in one dictionary and save it to out_path.
output = {
    "viseme": [viseme_list, viseme_interval],
    # Channels that are currently switched off:
    # "brow": [brow_movement, brow_ctrl_points, finer_brow_raise_ctrl_points, finer_brow_furrow_ctrl_points],
    # "blink": [eye_movement, eye_ctrl_points],
    "vowel_mod": [modification_sliders, modification_ctrl_pts],
    "jaw": [[0, 6]],
    "lip": [[0, 6]],
}
jsonoutput = json.dumps(output)
# NOTE(review): jsonoutput is already JSON text, so encoding it a second time
# stores one quoted JSON string in the file; a reader has to decode twice
# (json.load followed by json.loads). Kept as is because existing consumers
# may depend on it - confirm before switching to a single json.dump(output, f).
with open(out_path, 'w') as outfile:
    outfile.write(json.dumps(jsonoutput))