In [1]:
from util.sound_processing import *

In [2]:
import numpy as np
import json
from matplotlib import pyplot as plt
import os
from scipy.interpolate import interp1d

# Phoneme labels that this notebook treats as vowels when scanning
# lyric.phoneme_list ("ER" and "N" are listed alongside the vowel symbols).
VOWELS = {
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'EY',
    'IH', 'IY', 'OW', 'OY', 'UH', 'UW', "ER", "N",
}

In [3]:
# Folder that holds the song's data and the base name shared by its files.
# Both values are reused by later cells to build the
# '<file_name_template>_animation_data.json' output path.
# NOTE(review): `dir` shadows the Python builtin of the same name, and the path
# is an absolute, machine-specific location; the recorded run of this cell
# failed with FileNotFoundError on '<dir>/temp'.
dir = "E:/Structured_data/rolling_in_the_deep_adele"
file_name_template = "audio"
# Builds the `lyric` object every later cell reads from.
# Presumably provided by the star-import of util.sound_processing - TODO confirm.
lyric = combine_lyric_alignment_textgrids(dir, file_name_template)

FileNotFoundError: [Errno 2] No such file or directory: 'E:/Structured_data/rolling_in_the_deep_adele/temp'

In [None]:
# Run the three analysis passes on `lyric` before any animation data is derived.
# NOTE(review): later cells read lyric.pitch_intervals, lyric.pitch_slopes and the
# (coarse_)voice_quality_* attributes; presumably these calls populate them -
# confirm against util.sound_processing.
lyric.compute_self_pitch_intervals()
lyric.compute_self_vibrato_intervals()
lyric.compute_self_singing_style_intervals()

In [None]:
def generate_animation_ctrl_pts(start, end, value, sustain=1, decay = 0.75, onset=0.1, offset=0):
    """Build the four [time, value] control points of one animation curve.

    The curve rises from 0 to ``value``, holds or decays, then returns to 0.

    Args:
        start: time at which the curve reaches ``value``.
        end: time at which the movement ends.
        value: peak value of the curve.
        sustain: fraction of ``end - start`` after which the decay point is
            placed. Any value >= 1 means "hold ``value`` until ``end``".
        decay: fraction of ``value`` used at the decay point (only read when
            ``sustain < 1``).
        onset: lead-in time; the curve leaves 0 at ``start - onset``.
        offset: release time; the curve is back at 0 at ``end + offset``.

    Returns:
        A list of four ``[time, value]`` pairs in chronological order
        (assuming ``start <= end`` and non-negative ``onset``/``offset``).
    """
    ctrl_pts = [
        # rest position just before the movement begins
        [start - onset, 0],
        # peak is reached exactly at `start`
        [start, value],
    ]
    if sustain < 1:
        # partial sustain: drop to `decay * value` part-way through the span
        ctrl_pts.append([start + sustain * (end - start), decay * value])
    else:
        # full sustain. Previously only `sustain == 1` was handled, so a value
        # above 1 produced a curve with no hold point and no return to zero.
        ctrl_pts.append([end, value])
    # release back to rest
    ctrl_pts.append([end + offset, 0])
    return ctrl_pts

## 2. Compute coarse intervals from fine ones

In [15]:
# Fine-grained voice-quality data: one list of labels and one parallel list of
# [start, end] intervals per phoneme.
intervals = lyric.voice_quality_intervals
traits = lyric.voice_quality_lists
def compute_coarse_intervals(traits, interval):
    """Merge consecutive fine intervals that carry the same trait label.

    Args:
        traits: list with one entry per phoneme; each entry is the list of
            trait labels of that phoneme's fine intervals.
        interval: list parallel to ``traits``; each entry is the list of
            ``[start, end]`` fine intervals of that phoneme. (The parameter
            keeps its original singular name for compatibility.)

    Returns:
        ``(new_traits, new_intervals)``: per phoneme, one label per run of
        identical labels and the ``[start_of_first, end_of_last]`` span of that
        run. Entries with zero or one label are passed through unchanged (the
        same list objects, not copies).
    """
    new_intervals = []
    new_traits = []
    # Use the argument, not the notebook-global `intervals`; zip stops at the
    # shorter of the two lists.
    for trait, fine_interval in zip(traits, interval):
        if len(trait) <= 1:
            new_traits.append(trait)
            new_intervals.append(fine_interval)
            continue
        new_trait = []
        new_interval = []
        run_start = 0  # index of the first fine interval of the current run
        # k runs one past the end so that the final run is always flushed
        for k in range(1, len(trait) + 1):
            if k < len(trait) and trait[k] == trait[run_start]:
                continue
            new_trait.append(trait[run_start])
            new_interval.append([fine_interval[run_start][0], fine_interval[k - 1][1]])
            # the next run begins where the label changed
            run_start = k
        new_traits.append(new_trait)
        new_intervals.append(new_interval)
    return new_traits, new_intervals
# Pass the fine intervals loaded above; the result names are unchanged.
new_traits, new_interval = compute_coarse_intervals(traits, intervals)

## 1. Compute Eyebrow Movements

In [16]:
# Split the phoneme sequence into sentences: every "EOS_tag" entry closes the
# sentence collected so far. Phonemes after the last "EOS_tag" are not added.
sentences = []
current_sentence = []
for i, phoneme in enumerate(lyric.phoneme_list):
    if phoneme != "EOS_tag":
        current_sentence.append(i)
        continue
    sentences.append(current_sentence)
    current_sentence = []
# the first collected group is discarded
sentences = sentences[1:]
# each sentence is a list of indexes into lyric.phoneme_list

In [44]:
# Version 1 of the brow pass. For every sentence, find the first belted stretch
# of at least 0.4 s and, after it, the first head-voice stretch of at least
# 0.2 s. Belt followed by head gives a "furrow" then a "raise"; belt alone
# gives a "furrow" that lasts until just before the end of the sentence.
brow_movement = []   # "furrow" / "raise" labels
brow_intervals = []  # parallel list of [start, end] times
for sentence in sentences:
    # -1 means "not found yet" for all four trackers
    has_belt_pitch_interval_id = -1
    has_head_pitch_interval_id = -1
    has_belt_word_id = -1
    has_head_word_id = -1
    for phone_id in sentence:
        voice_qualities = lyric.coarse_voice_quality_lists[phone_id]
        voice_intervals = lyric.coarse_voice_quality_intervals[phone_id]
        # look to see if there are any parts that has belting
        for voice_quality_id in range(0, len(voice_qualities)):
            duration = voice_intervals[voice_quality_id][1] - voice_intervals[voice_quality_id][0]
            if (voice_qualities[voice_quality_id] == "belt" and has_belt_word_id < 0 and
                    duration >= 0.4):
                has_belt_word_id = phone_id
                has_belt_pitch_interval_id = voice_quality_id
            elif (voice_qualities[voice_quality_id] == "head" and has_belt_word_id >= 0 and
                    has_head_word_id < 0 and duration >= 0.2):
                has_head_word_id = phone_id
                has_head_pitch_interval_id = voice_quality_id
            if has_belt_word_id >= 0 and has_head_word_id >= 0:
                break
        if has_belt_word_id >= 0 and has_head_word_id >= 0:
            # both found: nothing later in the sentence can change the result
            break

    if has_belt_word_id < 0:
        # no qualifying belt in this sentence (head is only searched after a belt)
        continue

    # The ids above index the *coarse* lists, so the times must be read from
    # the coarse intervals too. Indexing the fine lists with a coarse id
    # selects the wrong interval as soon as any fine intervals were merged.
    belt_interval = lyric.coarse_voice_quality_intervals[has_belt_word_id][has_belt_pitch_interval_id]
    if has_head_word_id >= 0:
        head_interval = lyric.coarse_voice_quality_intervals[has_head_word_id][has_head_pitch_interval_id]
        # furrow from the start of the belt until 0.1 s before head voice starts
        interval = [belt_interval[0], head_interval[0] - 0.1]
        brow_movement.append("furrow")
        brow_intervals.append(interval)

        # raise for the whole head-voice stretch
        interval = [head_interval[0], head_interval[1]]
        brow_movement.append("raise")
        brow_intervals.append(interval)
    else:
        # furrow from the start of the belt until 0.1 s before the sentence ends
        interval = [belt_interval[0], lyric.phoneme_intervals[sentence[-1]][1] - 0.1]
        brow_movement.append("furrow")
        brow_intervals.append(interval)

### 1.1 Write the output to file

In [45]:
# Save the brow labels and their intervals next to the song data.
# NOTE(review): the payload is JSON-encoded twice (json.dumps, then encoded again
# on write), so the file holds a single JSON string whose content is JSON text.
# A reader has to decode twice; kept as-is because consumers may rely on it.
output = {"brow": [brow_movement, brow_intervals]}
jsonoutput = json.dumps(output)
animation_data_path = os.path.join(dir, file_name_template + '_animation_data.json')
with open(animation_data_path, 'w') as outfile:
    outfile.write(json.dumps(jsonoutput))

## 1.b Compute Eyebrow Movements, version 2
This version changes the saved data format (animation control points instead of plain intervals) and uses more information (pitch slopes, plus eye-closure curves).

In [5]:
# Split the phoneme sequence into sentences: every "EOS_tag" entry closes the
# sentence collected so far. Phonemes after the last "EOS_tag" are not added.
sentences = []
current_sentence = []
for i, phoneme in enumerate(lyric.phoneme_list):
    if phoneme != "EOS_tag":
        current_sentence.append(i)
        continue
    sentences.append(current_sentence)
    current_sentence = []
# the first collected group is discarded
sentences = sentences[1:]
# each sentence is a list of indexes into lyric.phoneme_list

In [34]:
# this is mostly for word level i.e. long vowels in a word. It would also be necessary to explore
# sentence level expressions (possibly from studying another song)
# Each entry of the *_ctrl_points lists is one curve made of [time, value] control points,
# i.e. [[[ctrl_pt_1]...[ctrl_pt_k]], [[ctrl_pt_1]...[ctrl_pt_k]]]
brow_movement = []     # "furrow" / "raise" labels
brow_ctrl_points = []  # parallel list of curves
eye_movement = []      # "closure" labels
eye_ctrl_points = []   # parallel list of curves

for i in range(0, len(sentences)):
    sentence = sentences[i]
    # -1 means "not found yet" for every tracker below
    has_belt_pitch_interval_id = -1
    has_belt_word_id = -1
    has_head_pitch_interval_id = -1
    has_head_word_id = -1
    only_has_pitch_interval_id = -1
    only_has_head_word_id = -1

    # first pass: first belt of >= 0.4 s, then the first head voice of >= 0.2 s after it
    for phone_id in sentence:
        voice_qualities = lyric.coarse_voice_quality_lists[phone_id]
        voice_intervals = lyric.coarse_voice_quality_intervals[phone_id]
        # look to see if there are any parts that has belting
        for voice_quality_id in range(0, len(voice_qualities)):
            duration = voice_intervals[voice_quality_id][1] - voice_intervals[voice_quality_id][0]
            if (voice_qualities[voice_quality_id] == "belt" and has_belt_word_id < 0 and
                    duration >= 0.4):
                has_belt_word_id = phone_id
                has_belt_pitch_interval_id = voice_quality_id
            elif (voice_qualities[voice_quality_id] == "head" and has_belt_word_id >= 0 and
                    has_head_word_id < 0 and duration >= 0.2):
                has_head_word_id = phone_id
                has_head_pitch_interval_id = voice_quality_id
            if has_belt_word_id >= 0 and has_head_word_id >= 0:
                break
        if has_belt_word_id >= 0 and has_head_word_id >= 0:
            # both found: nothing later in the sentence can change the result
            break

    # do a second pass looking for brow raises: first pitch segment whose slope is >= 100
    if has_belt_word_id < 0 and has_head_word_id < 0:
        for phone_id in sentence:
            pitch_change_slopes = lyric.pitch_slopes[phone_id]
            for vi in range(0, len(pitch_change_slopes)):
                if pitch_change_slopes[vi] >= 100:
                    only_has_head_word_id = phone_id
                    only_has_pitch_interval_id = vi
                    break
            if only_has_head_word_id >= 0:
                break

    # this section of the code deal with having head voice only segments and the eye
    # brow raising in those
    if only_has_head_word_id >= 0:
        value = 5
        word_pitch_intervals = lyric.pitch_intervals[only_has_head_word_id]
        start = word_pitch_intervals[only_has_pitch_interval_id][0]  # where the steep pitch rise starts
        # lead-in lasts from the start of the word's first pitch interval to `start`
        onset = start - word_pitch_intervals[0][0]
        end = word_pitch_intervals[-1][1]
        # Keyword arguments on purpose: the fifth positional parameter of
        # generate_animation_ctrl_pts is `decay`, not `onset`.
        interval = generate_animation_ctrl_pts(start, end, value, sustain=0.75, onset=onset)
        brow_movement.append("raise")
        brow_ctrl_points.append(interval)

    # this section of the code deal with belting and the related physiological points of the eyes
    if has_belt_word_id >= 0 and has_head_word_id >= 0:
        # The ids index the *coarse* lists, so times are read from the coarse
        # intervals as well (a coarse id is not a valid index into the fine lists).
        belt_start = lyric.coarse_voice_quality_intervals[has_belt_word_id][has_belt_pitch_interval_id][0]
        head_interval = lyric.coarse_voice_quality_intervals[has_head_word_id][has_head_pitch_interval_id]

        # deal with the furrowing related movements: from where belting starts
        # to where head voice starts, i.e. the end of belting
        value = 8
        start = belt_start
        end = head_interval[0]
        interval = generate_animation_ctrl_pts(start, end, value, sustain=0.75, onset=0.1)
        brow_movement.append("furrow")
        brow_ctrl_points.append(interval)

        # deal with the eyebrow raise related movements: the head-voice stretch
        value = 5
        start = head_interval[0]
        end = head_interval[1]
        interval = generate_animation_ctrl_pts(start, end, value, sustain=0.75, onset=0.1)
        brow_movement.append("raise")
        brow_ctrl_points.append(interval)

        # deal with eye opening and closing: held for the whole belted stretch
        value = 10
        start = belt_start
        end = head_interval[0]
        interval = generate_animation_ctrl_pts(start, end, value, sustain=1, onset=0.1)
        eye_movement.append("closure")
        eye_ctrl_points.append(interval)

    elif has_belt_word_id >= 0:
        # if there is belting, but sentence do not end with the use of head voice
        value = 8
        start = lyric.coarse_voice_quality_intervals[has_belt_word_id][has_belt_pitch_interval_id][0]
        # start no later than the beginning of the belted word's first interval
        start = min(start - 0.1, lyric.voice_quality_intervals[has_belt_word_id][0][0])
        end = lyric.phoneme_intervals[sentence[-1]][1]-0.1
        interval = generate_animation_ctrl_pts(start, end, value, sustain=0.75, onset=0.1)
        brow_movement.append("furrow")
        brow_ctrl_points.append(interval)

        value = 10
        end = lyric.phoneme_intervals[sentence[-1]][1]-0.1 # end of this sentence if there is no next sentence
        if i < len(sentences)-1:
            # the end becomes the start of the first vowel of the next sentence;
            # if that sentence has no vowel the end of this sentence is kept
            # (previously index -1, i.e. the last phoneme of the song, was used)
            for phone_id in sentences[i+1]:
                if lyric.phoneme_list[phone_id] in VOWELS:
                    end = lyric.phoneme_intervals[phone_id][0]
                    break
        # NOTE(review): the original call passed (1, 0.05, 0.1) positionally, which binds
        # sustain=1, decay=0.05, onset=0.1; decay is not read when sustain is 1.
        # Confirm whether 0.05 was meant to be the onset.
        interval = generate_animation_ctrl_pts(start, end, value, sustain=1, decay=0.05, onset=0.1)
        eye_movement.append("closure")
        eye_ctrl_points.append(interval)
        

In [35]:
# Save the brow and eye curves next to the song data.
# NOTE(review): the payload is JSON-encoded twice (json.dumps, then encoded again
# on write), so the file holds a single JSON string whose content is JSON text.
# A reader has to decode twice; kept as-is because consumers may rely on it.
output = {
    "brow": [brow_movement, brow_ctrl_points],
    "blink": [eye_movement, eye_ctrl_points],
}
jsonoutput = json.dumps(output)
animation_data_path = os.path.join(dir, file_name_template + '_animation_data.json')
with open(animation_data_path, 'w') as outfile:
    outfile.write(json.dumps(jsonoutput))

['furrow', 'raise', 'furrow'] [[[1.3849886621315188, 0], [1.484988662131519, 8], [3.79129404053288, 6.0], [4.5600625, 0]], [[6.464988662131519, 0], [6.594988662131519, 5], [8.64998866213152, 3.75], [9.33498866213152, 0]], [[11.16498866213152, 0], [11.26498866213152, 8], [13.69480966553288, 6.0], [14.50475, 0]]]


## 2. Bootleg Jali

In [4]:
from util.CMU2JALI import *
# Phoneme labels that the viseme pass accepts; entries of the phoneme list
# that are not in this set are skipped.
CMU_VOCABULARY = {
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH',
    'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K',
    'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH',
    'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH',
}

In [5]:
# Short aliases for the phoneme labels and their parallel [start, end] intervals.
# NOTE(review): the recorded run raised NameError because `lyric` was not defined
# in that kernel session; the cell that builds `lyric` has to run first.
phoneme_list = lyric.phoneme_list
phoneme_interval = lyric.phoneme_intervals

NameError: name 'lyric' is not defined

In [13]:
viseme_list = []       # one viseme name per kept phoneme
viseme_intervals = []  # parallel list of control-point curves

# placeholders for the co-articulation pass below; not read yet
prev_vowel = "Uh"
next_vowel = ""
# pass 1: one curve per phoneme that is part of CMU_VOCABULARY
for i in range(0, len(phoneme_list)):
    if phoneme_list[i] not in CMU_VOCABULARY:
        continue
    viseme_jali = CMU2VISEME[phoneme_list[i]]+"_pointer"
    start = phoneme_interval[i][0]
    end = phoneme_interval[i][1]
    if (end - start) <= 0.4:
        # short phoneme
        value = 5
        sustain = 0.75
        decay = 0.75
    else:
        # long phoneme: stronger, held longer, decays less
        value = 8
        sustain = 0.9
        decay = 0.95

    if viseme_jali not in VOWELS_JALI:
        # visemes outside VOWELS_JALI always get this fixed value
        value = 9
    # Pass the sustain chosen above. The call used to hard-code sustain=0.75,
    # which silently discarded the 0.9 selected for long phonemes.
    viseme_curve = generate_animation_ctrl_pts(start, end, value, sustain=sustain, decay=decay, onset=0.12, offset=0.12)
    viseme_list.append(viseme_jali)
    viseme_intervals.append(viseme_curve)
# pass 2 enforcing co-articulation
# TODO: not implemented yet
    

In [14]:
# Save the viseme names and their curves next to the song data.
# NOTE(review): the payload is JSON-encoded twice (json.dumps, then encoded again
# on write), so the file holds a single JSON string whose content is JSON text.
# A reader has to decode twice; kept as-is because consumers may rely on it.
output = {"viseme": [viseme_list, viseme_intervals]}
jsonoutput = json.dumps(output)
animation_data_path = os.path.join(dir, file_name_template + '_animation_data.json')
with open(animation_data_path, 'w') as outfile:
    outfile.write(json.dumps(jsonoutput))

# Studying eyebrow movements using facial landmarking

In [67]:
from util.facial_landmarking import *

In [None]:
# Video file name(s) and the folder(s) that contain them; only the first entry
# of each list is processed below.
# NOTE(review): the folder is an absolute, machine-specific path.
video_title = ["video.mp4"]
video_path = ["E:/facial_data_analysis_videos/1"]
# Presumably provided by the star-import of util.facial_landmarking - TODO confirm.
# save_annotated_video=True is passed through to the helper.
extract_landmarks_media_pipe(video_title[0],
                         video_path[0], save_annotated_video=True)