In [1]:
VOICED = set(['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'EY', 'IH', 'IY', 'OW', 'OY', 'UH', 'UW', "ER"])
CMU2VISEME = {"AA":"Ah", "AO":"Ah", "AY":"Ah", "AW":"Ah","AE":"Aa",
              "EY":"Aa","UH":"Uh", "UW":"U","IH": "Ih","IY": "Ih","EH": "Eh","HH": "Eh","UH": "Eh","AH": "Eh",
              "ER": "Eh","OW":"Oo","OY":"Oh","R":"R","D":"LNTD","T": "LNTD","L":"LNTD","N":"LNTD","NG":"LNTD",
              "F":"FV","V":"FV","B":"BP","M":"M","P":"BP","CH":"ShChZh","SH":"ShChZh","ZH":"ShChZh",
              "S": "SZ", "Z": "SZ","DH":"Th", "TH":"Th","G":"GK", "K":"GK","Y":"Y","JH":"J","W":"W",}
VOWELS_SLIDERS_JALI = set(['Ih_pointer', 'Ee_pointer', 'Eh_pointer', 'Aa_pointer', 'U_pointer', 'Uh_pointer'
                           , 'Oo_pointer', 'Oh_pointer', 'Schwa_pointer', 'Eu_pointer', "Ah_pointer"])
CONSONANTS_SLIDERS_JALI = set(["M_pointer", "BP_pointer", "JY_pointer", "Th_pointer", "ShChZh_pointer", "SZ_pointer", "GK_pointer", "LNTD_pointer", "R_pointer", "W_pointer", "FV_pointer"])
CONSONANTS_SLIDERS_NOJAW_JALI = set(["Ya_pointer", "Ja_pointer", "Ra_pointer", "FVa_pointer", "LNTDa_pointer", "Ma_pointer", "BPa_pointer", "Wa_pointer", "Tha_pointer", "GKa_pointer"])
JALI_SLIDERS_SET = set.union(VOWELS_SLIDERS_JALI, CONSONANTS_SLIDERS_JALI, CONSONANTS_SLIDERS_NOJAW_JALI)

In [2]:
class CMU_phonemes_dicts():
    def __init__(self):
        self.vocabs = set(['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G',
                  'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH',
                  'UW', 'V', 'W', 'Y', 'Z', 'ZH'])
        self.vowels = set(['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER', 'EY', 
                  'IH', 'IY', 'OW', 'OY', 'UH', 'UW', ])
        self.voiced = set(['M', 'N']).union(self.vowels)
        self.consonants = set(['B', 'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L', 'M', 'N', 'NG', 
                              'P', 'R', 'S', 'SH', 'T', 'TH', 'V', 'W', 'Y', 'Z', 'ZH'])
        self.consonants_no_jaw = self.consonants
        self.lip_closer = set(["B", "F", "M", "P", "S", "V"])
        self.lip_rounder = set(["B", "F", "M", "P", "V"])
        self.nasal_obtruents = set(['L', 'N', 'NG', 'T', 'D', 'G', 'K', 'F', 'V', 'M', 'B', 'P'])
        self.fricative = set(["S", "Z", "ZH", "SH", "CH", "F", "V", 'TH'])
        self.plosive = set(["P", "B", "D", "T", "K", "G"])
        self.lip_heavy = set(["W", "OW", "UW", "S", "Z", "Y", "JH", "OY"])
        self.sibilant = set(["S", "Z", "SH", "CH", "ZH"])
class JALI_visemes_dicts():
     def __init__(self):
        self.vowels = set(['Ih_pointer', 'Ee_pointer', 'Eh_pointer', 'Aa_pointer', 'U_pointer', 'Uh_pointer'
                           , 'Oo_pointer', 'Oh_pointer', 'Schwa_pointer', 'Eu_pointer', "Ah_pointer"])
        self.voiced = set(['Ih_pointer', 'Ee_pointer', 'Eh_pointer', 'Aa_pointer', 'U_pointer', 'Uh_pointer'
                           , 'Oo_pointer', 'Oh_pointer', 'Schwa_pointer', 'Eu_pointer', "Ah_pointer", "LNTD_pointer", "LNTDa_pointer"])
        self.consonants_no_jaw = set(["Ya_pointer", "Ja_pointer", "Ra_pointer", "FVa_pointer", "LNTDa_pointer", "Ma_pointer", "BPa_pointer", "Wa_pointer", "Tha_pointer", "GKa_pointer"])
        self.consonants = set(["M_pointer", "BP_pointer", "JY_pointer", "Th_pointer", "ShChZh_pointer", "SZ_pointer", "GK_pointer", "LNTD_pointer", "R_pointer", "W_pointer", "FV_pointer"]) 
        self.lip_closer = set(["M_pointer", "BP_pointer", "FV_pointer", "SZ_pointer"])
        self.lip_rounder = set(["M_pointer", "BP_pointer", "FV_pointer"])
        self.vocabs = self.consonants.union(self.vowels).union(self.consonants_no_jaw)
        self.sibilant = set(["SZ_pointer", "ShChZh_pointer"])
        self.nasal_obtruents = set(["LNTD_pointer", "GK_pointer", "FV_pointer", "M_pointer", "BP_pointer"])
        self.fricative = set(["FV_pointer", "SZ_pointer", "ShChZh_pointer", "Th_pointer"])
        self.plosive = set(["BP_pointer", "LNTDa_pointer", "GK_pointer"])
        self.lip_heavy = set(["Oh_pointer", "W_pointer", "Wa_pointer", "U_pointer", "SZ_pointer", "JY_pointer",
                             "Ya_pointer", "Ja_pointer"])
        self.lip_rounder_to_no_jaw_dict = {"M_pointer":"Ma_pointer", "BP_pointer":"BPa_pointer", "FV_pointer":"FVa_pointer"}
cmu_sets = CMU_phonemes_dicts()
jali_sets = JALI_visemes_dicts()

In [3]:
def generate_basic_viseme_curve(start, end, value, sustain=1, decay = 0.75, onset=0.1, offset=0):
    if end - start < 0.1:
        end = start + 0.1
    interval = []
    interval.append([start-onset, 0])
    # second point is when the belting starts 
    interval.append([start, 1 * value])
    # third point emphasizes decay, it happens 75% down the interval
    if sustain < 1:
        interval.append([start + sustain * (end - start), decay * value])
        # last point is where the furrowing ends
        interval.append([end+offset, 0])
    elif sustain == 1:
        interval.append([end, value])
        # last point is where the furrowing ends
        interval.append([end+offset, 0])
    return interval
def get_kth_neighbour(input_list, i, k):
    if i+k < 0 or i+k >= len(input_list):
        return None
    return input_list[i+k]

In [4]:
from util.SongDataStructure import *
from util.pitch_interval_estimation import *
import numpy as np
import json
from matplotlib import pyplot as plt
import os
from scipy.interpolate import interp1d

In [5]:
# load file for Jali
dir = "E:/MASC/Structured_data/rolling_in_the_deep_adele"
file_name_template = "audio"
lyric = Minimal_song_data_structure(os.path.join(dir, file_name_template+".wav"), os.path.join(dir, file_name_template+".txt"),
                                                                                             os.path.join(dir, "audio_full.TextGrid"))

PraatError: Cannot open file “E:\MASC\Structured_data\rolling_in_the_deep_adele\audio.wav”.
Sound not read from sound file “E:\MASC\Structured_data\rolling_in_the_deep_adele\audio.wav”.

In [6]:
dir = "E:/MASC/Structured_data/let_her_go"
file_name_template = "vocal_audio_1"
lyric = Minimal_song_data_structure(os.path.join(dir, file_name_template+".wav"), os.path.join(dir, file_name_template+".txt"),
                                   os.path.join(dir, "vocal_audio_1_full.TextGrid"))

In [7]:
dir = "E:/MASC/Structured_data/my_way_frank_sinatra"
file_name_template = "audio_1"
lyric = Minimal_song_data_structure(os.path.join(dir, file_name_template+".wav"), os.path.join(dir, file_name_template+".txt"),
                                   os.path.join(dir, "audio_1__full.TextGrid"))

In [62]:
dir = "C:/Users/evansamaa/Desktop/vowel_model_dataset"
file_name_template = "A_I_A_I_A_I"
lyric = Minimal_song_data_structure(os.path.join(dir, file_name_template+".wav"), os.path.join(dir, file_name_template+".txt"),
                                   os.path.join(dir, "A_I_A_I_A_I.TextGrid"))

In [63]:
# break the thing into sentence structures
sentences = []
current_sentence = []
for i in range(0, len(lyric.phoneme_list)):
    if lyric.phoneme_list[i] == "EOS_tag":
        sentences.append(current_sentence)
        current_sentence = []
    else:
        current_sentence.append(i)
        if i == len(lyric.phoneme_list) - 1:
            sentences.append(current_sentence)
sentences = sentences[1:] 
if len(sentences) == 0:
    sentences = [list(range(0, len(lyric.phoneme_list)))]
# sentence stores the indexes

In [64]:
phoneme_list = lyric.phoneme_list
phoneme_interval = lyric.phoneme_intervals

In [65]:
phoneme_list

['>', 'AY', '>', 'AY', '>', 'AY', '>']

In [66]:
# using tanh function to get a smooth curve
def Viseme_A(peak=None, lowest=None):
    if not peak is None:
        total = np.log((1+peak)/(1-peak))/2
        b = np.log((1+lowest)/(1-lowest))/2
        a = total-b
    else:
        peak = 0.99
        lowest = 0.6
        total = np.log((1+peak)/(1-peak))/2
        b = np.log((1+lowest)/(1-lowest))/2
        a = total-b
    def fn(val, val_max, val_min, max_val = 10):
        val = (val - val_min) / (val_max - val_min)
        print(val)
#         return (lowest + val * ((peak - lowest)))*max_val
        return np.tanh((val)*a+b) * max_val
#         return (np.exp(val * 8)/np.exp(8) * (peak-lowest) + lowest) * max_val
    return fn
viseme_A = Viseme_A()
# plt.plot(np.arange(0, 1, 0.01), viseme_A(np.arange(0, 1, 0.01), 1, 0))

In [67]:
SIBLANT_JALI = set(["SZ", "ShChZh"])
VOWELS_JALI = set(['Ih', 'Ee', 'Eh', 'Aa', 'U', 'Uh', 'Oo', 'Oh', 'Schwa', 'Eu', "Ah"])
NASAL_OBSTRUENTS_JALI = set(["LNTD", "GK", "FV", "M", "BP"])

# animate only vowels and see how it goes
viseme_list = []
viseme_intervals = []
pure_phoneme_list = []
max_activation = 8
# threshold_slope = 200 # for formants
threshold_slope = 80 # for pitch


for i in range(0, len(phoneme_list)):
    print(phoneme_list[i])
    if phoneme_list[i] != "EOS_tag" and phoneme_list[i] != ">":
        onset = 0.12
        offset = 0.12
        if CMU2VISEME[phoneme_list[i]] in VOWELS_JALI or CMU2VISEME[phoneme_list[i]] in SIBLANT_JALI:
            viseme_jali = CMU2VISEME[phoneme_list[i]] + "_pointer"
        else :
            if CMU2VISEME[phoneme_list[i]] in NASAL_OBSTRUENTS_JALI and phoneme_interval[i][1] - phoneme_interval[i][0] > 1/20:
                viseme_jali = CMU2VISEME[phoneme_list[i]] + "_pointer"
            else:
                viseme_jali = CMU2VISEME[phoneme_list[i]] + "a_pointer"
        if viseme_jali in jali_sets.lip_heavy:
            onset = 0.16
            offset = 0.16
        start = phoneme_interval[i][0]
        end = phoneme_interval[i][1]
        if (end - start) <= 0.1:
            value = 6
            sustain = 0.75
            decay = 0.75
        elif (end - start) <= 0.3:
            value = 6
            sustain = 0.75
            decay = 0.75
        else:
            value = 8
            sustain = 0.75
            decay = 0.75
        if phoneme_list[i] in cmu_sets.lip_closer:
            value = 10
        viseme_curve = generate_basic_viseme_curve(start, end, value, sustain=sustain, decay=decay, onset=onset, offset=offset)
        viseme_list.append(viseme_jali)
        pure_phoneme_list.append(phoneme_list[i])
        viseme_intervals.append(viseme_curve)
    else:
        continue
    

>
AY
>
AY
>
AY
>


In [68]:
# enforce co-articulation rules of consonants
viseme_list_final = []
viseme_intervals_final = []
i = 0;

while i < len(viseme_list):
    increment = 1
    i_next = min(i + 1, len(viseme_list)-1)
    if (viseme_list[i_next] == viseme_list[i] and viseme_intervals[i][-1][0] >= viseme_intervals[i_next][0][0]):
        # remove repeated vowels or consonants
        viseme_list_final.append(viseme_list[i_next])
        int_curr = viseme_intervals[i]
        int_next = viseme_intervals[i_next]
        viseme_interval = [int_curr[0], [int_curr[1][0], max(int_curr[1][1], int_next[1][1])], 
                           [int_next[2][0], max(int_curr[2][1], int_next[2][1])], int_next[3]]
        viseme_intervals_final.append(viseme_interval)
        if viseme_list[i_next] in jali_sets.lip_rounder:
            viseme_list_final.append(jali_sets.lip_rounder_no_jaw_dict[viseme_list[i_next]])
            viseme_intervals_final.append(viseme_interval)
        increment = 2
    elif viseme_list[i] in jali_sets.lip_heavy:
        # if the viseme is a lip-heavy viseme, the it is voice simutaneously as nearby labial dental and bilabials 
        current_interval = viseme_intervals[i] 
        if not get_kth_neighbour(viseme_list, i, -1) is None:
            if current_interval[0][0] <= viseme_intervals[i-1][-1][0] - lyric.dt and viseme_intervals[i-1][-1][0] in jali_sets.lip_rounder:
                current_interval[0][0] = viseme_intervals[i-1][0][0]
                current_interval[1][0] = viseme_intervals[i-1][1][0]
        if not get_kth_neighbour(viseme_list, i, +1) is None:
            if current_interval[-1][0] <= viseme_intervals[i+1][0][0] - lyric.dt and viseme_intervals[i+1][-1][0] in jali_sets.lip_rounder:
                current_interval[2][0] = viseme_intervals[i+1][0][0]
                current_interval[3][0] = viseme_intervals[i+1][1][0]
        viseme_list_final.append(viseme_list[i])
        viseme_intervals_final.append(current_interval)
        if viseme_list[i] in jali_sets.lip_rounder:
            viseme_list_final.append(jali_sets.lip_rounder_to_no_jaw_dict[viseme_list[i]])
            viseme_intervals_final.append(current_interval)
    else:
        viseme_list_final.append(viseme_list[i])
        viseme_intervals_final.append(viseme_intervals[i])
        if viseme_list[i] in jali_sets.lip_rounder:
            viseme_list_final.append(jali_sets.lip_rounder_to_no_jaw_dict[viseme_list[i]])
            viseme_intervals_final.append(viseme_intervals[i])
    i = i + increment

In [69]:
# pass 3
# set this up
prev_slider_dict = {}
for i in range(0, len(list(jali_sets.vocabs))):
    prev_slider_dict[list(jali_sets.vocabs)[i]] = -1
viseme_list_final_final = []
viseme_intervals_final_final = []
i = 0  
while i < len(viseme_list_final):
    increment = 1
    prev_viseme = viseme_list_final[i]
    # if the previous instance of the current viseme is not -1
    if prev_slider_dict[viseme_list_final[i]] != -1:
        current_interval = viseme_intervals_final[i]
        prev_interval = viseme_intervals_final_final[prev_slider_dict[viseme_list_final[i]]]
        if (current_interval[1][0] >= prev_interval[2][0] and current_interval[0][0] <= prev_interval[3][0]):
            interval = prev_interval[:-1] + current_interval[1:]
            viseme_intervals_final_final[prev_slider_dict[viseme_list_final[i]]] = interval
        elif (current_interval[1][0] <= prev_interval[2][0]):
            interval = prev_interval[0:-2] + current_interval[1:]
            viseme_intervals_final_final[prev_slider_dict[viseme_list_final[i]]] = interval
        else:
            viseme_list_final_final.append(viseme_list_final[i])
            viseme_intervals_final_final.append(viseme_intervals_final[i])
                
    else:        
        viseme_list_final_final.append(viseme_list_final[i])
        viseme_intervals_final_final.append(viseme_intervals_final[i])
        
    prev_slider_dict[viseme_list_final[i]] = len(viseme_list_final_final) - 1
    i = i + increment

In [70]:
lyric.compute_self_vibrato_intervals()
vib_ctrl_pts = []
for k in lyric.vibrato_intervals:
    if len(k) > 0:
        for m in k:
            vib_ctrl_pts.append(m)


In [71]:
print([viseme_list_final_final, viseme_intervals_final_final])

[['Ah_pointer', 'Ah_pointer', 'Ah_pointer'], [[[-0.04454296135109409, 0], [0.0754570386489059, 8], [0.9172593342532398, 6.0], [1.3178600994546845, 0]], [[1.5561014036241034, 0], [1.6761014036241033, 8], [2.3056231203369095, 6.0], [2.6354636925745116, 0]], [[3.0884255822893834, 0], [3.2084255822893835, 8], [3.6622668199195463, 6.0], [3.9335472324629337, 0]]]]


In [58]:
output ={"viseme":[viseme_list_final_final, viseme_intervals_final_final],
#         "brow":[brow_movement, brow_ctrl_points, finer_brow_raise_ctrl_points, finer_brow_furrow_ctrl_points],
#         "blink":[eye_movement, eye_ctrl_points],
        "vowel_mod": vowel_mod_arr
        "jaw":[[0, 6]],
        "lip":[[0, 6]], 
        "vib":vib_ctrl_pts}
jsonoutput = json.dumps(output)
with open(os.path.join(dir, file_name_template+'_raw_jali.json'), 'w') as outfile:
    json.dump(jsonoutput, outfile)

SyntaxError: invalid syntax (Temp/ipykernel_10260/418142628.py, line 5)