In [27]:
!pip install soundfile
!pip install pandas
!pip install scipy
!pip install librosa
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from pathlib import Path
import pandas as pd
import numpy as np
import soundfile as sf
import scipy
from scipy import fft
import functools
from torch import nn
import librosa
import math
import torchaudio 
import math
import yaml
torch.__version__


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


'1.9.0+cu102'

In [28]:
# data_dir should be structured as follows:
# - speaker_id_1/
# - speaker_id_1/sample_id1.wav
# - speaker_id_1/sample_id1.csv
# - speaker_id_1/sample_id2.wav
# - speaker_id_1/sample_id2.wav
# - speaker_id_2/sample_id1.wav
# ..
class VisemeDataset(Dataset):
    def __init__(self, data_dir, audio_transform, viseme_transform):
        self.viseme_transform= viseme_transform
        self.audio_transform = audio_transform
        self.audio_files = []
        self.visemes = []
        self.processed = {}
        for file in list(Path(data_dir).rglob("*.wav")):
            self.audio_files.append(file)
            self.visemes.append(str(file).replace("wav", "csv"))

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        if idx not in self.processed:
            fft, num_samples, mask = self.audio_transform(self.audio_files[idx])
            fft = torch.tensor(fft, dtype=torch.float)
            viseme_filename = self.visemes[idx]
            visemes = torch.tensor(self.viseme_transform(viseme_filename).values.astype(np.float32))       
            #assert(visemes.shape[0] == fft.shape[0])
            self.processed[idx] =fft, torch.tensor(mask), visemes, viseme_filename
        return self.processed[idx]

class SeparableConv1d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, bias=False):
        super(SeparableConv1d, self).__init__()
        self.depthwise = nn.Conv1d(in_channels, in_channels, kernel_size=kernel_size, 
                               groups=in_channels, bias=bias, padding=1)
        self.pointwise = nn.Conv1d(in_channels, out_channels, 
                               kernel_size=1, bias=bias)
    def forward(self, x):
        out = self.depthwise(x)
        out = self.pointwise(out)
        return out
    
class Conv1dModel(nn.Module):
    def __init__(self, seq_length=633, n_ffts=2457, num_viseme=4, ks=256):
        super(Conv1dModel, self).__init__()
        self.conv = nn.Sequential(
            SeparableConv1d(seq_length,seq_length,ks),
            nn.ReLU(),
            SeparableConv1d(seq_length,seq_length,ks),
            nn.ReLU(),
        )
        self.attention = nn.MultiheadAttention(1951, 1, batch_first=True)
        #self.linear_relu_stack2 = nn.Sequential(
        #    SeparableConv1d(seq_length,seq_length,ks),
        #    #nn.Linear(seq_length, seq_length),
        #    nn.ReLU(),7
        #)
        self.linear_out = nn.Linear(1951, 11)

    def forward(self, x):
        o1 = self.conv(x)
        attn_output, attn_output_weights = self.attention(o1, o1, o1)
        attn_output = attn_output.tile((4,1,1,1)).transpose(0,1)
        o1 = self.linear_out(attn_output)        
        return o1

class BiLSTMModel(nn.Module):
    def __init__(self, input_dim=None, hidden_size=512, num_viseme=4):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(input_dim, hidden_size, 1, bidirectional=True)
        self.attention = nn.MultiheadAttention(hidden_size*2, 1, batch_first=True)       
        self.l1 = torch.nn.Linear(hidden_size*2, 1)
        self.l2 = torch.nn.Linear(hidden_size*2, 1)
        self.l3 = torch.nn.Linear(hidden_size*2, 1)
        self.l4 = torch.nn.Linear(hidden_size*2, 1)

    def forward(self, x):
        out_f, _ = self.lstm(x)
        #out_f = out_f[:,:,:self.embed_dim] + out_f[:,:,self.embed_dim:]
        attn_output, attn_output_weights = self.attention(out_f, out_f, out_f)
        return torch.stack([
            self.l1(attn_output),
            self.l2(attn_output),
            self.l3(attn_output),
            self.l4(attn_output)
        ],dim=1)




In [29]:
def preprocess_viseme(csv, pad_len_in_secs=None, target_framerate=None, blendshapes=None):
    csv = pd.read_csv(csv)
    
    # first, drop every nth row to reduce effective framerate    
    csv = csv.iloc[::int(59.97 / target_framerate)]
    
    pad_len = int(pad_len_in_secs * target_framerate)
        
    if(csv.shape[0] < pad_len):
        pad = pd.DataFrame(0, index=[i for i in range(pad_len - csv.shape[0])], columns=csv.columns)
        pad.pad(inplace=True)
        csv = pd.concat([csv, pad])
    else:
        csv = csv.iloc[:pad_len]
        #print("Visemes exceeded max length, truncate?")
    columns = list(csv.columns)
    columns.remove("Timecode")
    
    return csv[blendshapes] if blendshapes is not None else csv



In [30]:
def pad_audio(audio, sample_rate, pad_len_in_secs):
    # left-pad the audio so we have the left context when starting at the initial viseme
    pad_len_in_samples = pad_len_in_secs * resample_to 
    if len(audio.shape) > 1:
        audio = audio[0]
    if audio.shape[0] < pad_len_in_samples:
        audio = np.pad(audio, (0, (pad_len_in_secs * resample_to) - audio.shape[0]), constant_values=0.001)
    elif audio.shape[0] > pad_len_in_samples:
        audio = audio[:pad_len_in_samples]
    #audio = np.hstack([np.zeros((win_length*2,)), audio])
    return audio

def load_and_pad_audio(filepath, resample_to, pad_len_in_secs):
    audio, rate = sf.read(filepath)
    audio = librosa.resample(audio, rate, resample_to)
    audio = pad_audio(audio, resample_to, pad_len_in_secs)
    return audio

\---S1---/\---S2---/\---S3---/
WWWWWWWWWWWWWWWWWWWWWWWWWWWWWW
____________\---V1---/
_____________AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

             \---S1---/\---S2---/\---S3---/
             WWWWWWWWWWWWWWWWWWWWWWWWWWWWWW
______________________\---V2---/
_____________AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA


A = audio sample
V1, V2, etc = viseme 1, viseme 2, etc (aka "frame", but not in the MFCC sense)
W = a window of audio samples that will be used as input (aka "frame" in the MFCC sense. to avoid confusion, we refer to this as the "window" and the viseme as the "frame")
S1, S2, S3 = STFT of a window 
num_frames == num_windows

audio_bins_per_window = the number of S per W
samples_per_bin = the length of each S (i.e. number of As)
bin_hop_length = the number of A between S1 and S2 (in the picture above, hop_length == len(S1) == len(S2) == samples_per_bin

in practice, we calculate as such:
          /-----------V2--------------\
/--------------V1------------\
\---S1---/\---S2---/\---S3---/\---S4---/\---S5---/\---S6---/
_______________AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

then at V1, we take {S1,S2,S3}, at V2 we take {S4,S5,S6}, etc

# STFT hop length should equal to length of a single viseme frame, in samples

In [31]:
def stft(filepath, 
         frame_len, 
         window_len, 
         stft_frames_per_window,
         resample_to, 
         pad_len_in_secs, 
         n_mels):
    
    audio = load_and_pad_audio(filepath, pad_len_in_secs, resample_to)
    #wdw = np.hanning(audio_window_in_samples)
    
    num_frames = audio.shape[0] // frame_len
    # TODO - mask for padding 
    # [1] * actual_seq_length + [0] * (padded_seq_length - actual_seq_length)
    
    # take the STFT of the entire audio file 
    # with a window size equivalent to the audio_window_in_samples / audio_bins_per_window
    n_fft = int(window_len / stft_frames_per_window)

    transformed = librosa.stft(audio, 
                               n_fft=n_fft,
                               win_length=n_fft,
                               hop_length=n_fft)
    
    melfb = librosa.filters.mel(resample_to, n_fft, n_mels=n_mels)    
    mels = np.dot(melfb, np.abs(transformed))
    
    log_mels = np.log(mels)
    
    output = mels_to_mfccs(log_mels, padded_seq_length, stft_frames_per_window, n_mels)

    return output, audio.shape[0], 



In [32]:
# let num_stft_frames be the number of STFT frames per audio sample 
# num_stft_frames == (audio_len / n_fft)
# (assuming FFT window length == hop length == n_ffts)
# coeffs will be B x num_stft_frames x num_mels
# every 
# if we've calculated the FFT size correctly, the hop length will be half 
# so stft_frames_per_window should be odd

def coeffs_to_windows(coeffs, num_frames, stft_frames_per_window, n_mels):
    
    output = np.zeros((num_frames, stft_frames_per_window*n_mels))
    
    # the number of frames to hop is just half the number of STFT frames per window
    hop_len = int(stft_frames_per_window / 2)
    
    for i in range(num_frames):
        c = coeffs[:, (i*hop_len):(i*hop_len)+stft_frames_per_window]
        # padding
        if(c.shape[1] < stft_frames_per_window):
            c = np.pad(c, [(0,0), (0, stft_frames_per_window - c.shape[1])], constant_values=0)
        
        output[i, :] = np.reshape(c, stft_frames_per_window*n_mels)
    return output


# featurize audio in exactly the same way as TensorflowTTS
# this enables mels from the synthesis step to be reused for viseme prediction

def tftts_mels(filepath, 
               frame_len=None,
               window_len=None,
               stft_frames_per_window=None,
               resample_to=None, 
               config=None):
    assert(resample_to==config["sampling_rate"])
    
    audio = load_and_pad_audio(filepath, resample_to, pad_len_in_secs)
     
    # this is (mostly) copied verbatim from https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/bin/preprocess.py @ 4a7d584 
    # except the hop length must be the size of the window (in samples) divided by the number of STFT frames per window
    if config["hop_size"] != window_len // stft_frames_per_window:
        print(config["hop_size"])
        assert(config["hop_size"] == window_len // stft_frames_per_window)
    D = librosa.stft(
        audio,
        n_fft=tftts_config["fft_size"],
        hop_length=config["hop_size"],
        win_length=config["win_length"] if "win_length" in config else config["fft_size"],
        window=config["window"],
        pad_mode="reflect",
    )
    
    S, _ = librosa.magphase(D)  # (#bins, #frames)
    fmin = 0 if config["fmin"] is None else config["fmin"]
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=config["sampling_rate"],
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    log_mels = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T  # (#frames, #bins)
    #print(log_mels.shape)
    # coeffs = fft.dct(log_mels.T) <-- where did this come from?
    coeffs = log_mels.T
    #print(coeffs.shape)
    num_frames = (resample_to * pad_len_in_secs) // frame_len

    output = coeffs_to_windows(coeffs, num_frames,  stft_frames_per_window, config["num_mels"])

    return output, audio.shape[0], [1] * log_mels.shape[0]


In [33]:
# the blendshapes that will used as input labels (and predictions)
blendshapes = ["MouthClose", "MouthFunnel", "MouthPucker", "JawOpen"]

# source framerate for raw viseme label input
framerate=59.97 

# actual framerate to use for viseme labels. 
# raw labels will be resampled/transformed (either averaged or simply dropped).
target_framerate = framerate / 2 

# the sample rate that audio will be resampled to
resample_to = 22050

# the duration of a viseme frame is (1 / target_framerate) seconds
frame_len = math.ceil(resample_to * (1 / target_framerate))

# all audio will be padded to the following size
pad_len_in_secs = 10

# the raw input for each viseme frame will be an audio window of size X 
# the middle sample of the viseme frame is aligned with the middle sample of the audio window
# this means, at the nominal "anchor sample" of the viseme frame, there will be 
# X/2 samples to the left and X/2 samples to the right
# let's use 0.5 seconds
window_len = (0.5 * resample_to) 

# this raw audio input will then be transformed into a number of STFT frames/coefficients
# each viseme frame will have this number of STFT frames, which will be the actual input at each timestep
# Since audio windows overlap, we won't want to waste cycles repeatedly computing the STFT across the whole audio sequence
# So in practice, we pre-calculate the STFT for the whole sequence, then just sub-sample the coefficients at each timestep
# when assigning STFT frames, the hop length will then just be half this value
stft_frames_per_window = 33

seq_length = math.ceil((pad_len_in_secs * resample_to) / frame_len)

hop_size = int(window_len // stft_frames_per_window)
hop_size

334

In [34]:
from tensorflow_tts.inference import AutoConfig
batch_size = 20

process_audio = None

USE_TFTTS_CONFIG=True

#config={"num_mels":80, "sampling_rate":22050,"fmin":80,"fmax":6000, "window":"hann", "fft_size":512, "hop_size":hop_size})
    
if USE_TFTTS_CONFIG:
    with open("/mnt/hdd_2tb/home/hydroxide/projects/TensorFlowTTS/preprocess/baker_preprocess.yaml", "r") as f:
        tftts_config = yaml.safe_load(f)
        tftts_config["hop_size"] = hop_size
        process_audio = functools.partial(tftts_mels, 
                                          frame_len=frame_len, 
                                          window_len=window_len,
                                          stft_frames_per_window=stft_frames_per_window, 
                                          resample_to=resample_to, 
                                          config=tftts_config)
else:
    num_mels=39
    process_audio = functools.partial(stft, 
                                        viseme_frame_len_in_samples=viseme_frame_len_in_samples, # this refers to the size of the viseme/audio window,
        audio_window_in_samples=audio_window_in_samples, # TODO - update these
        stft_frames_per_window=stft_frames_per_window,
        resample_to=resample_to, 
        pad_len_in_secs=pad_len_in_secs,
        n_mels=num_mels)
    
process_viseme = functools.partial(preprocess_viseme, 
                                   pad_len_in_secs=pad_len_in_secs, 
                                   blendshapes=blendshapes, 
                                   target_framerate=target_framerate)

training_data = VisemeDataset("./data/training/speaker_1/", 
                              process_audio, \
                              process_viseme)
test_data = VisemeDataset("./data/test/speaker_1/", 
                              process_audio, \
                              process_viseme)
def collate_samples(feat_tuples):
    return feat_tuples
    padded = torch.nn.utils.rnn.pad_sequence([f[0] for f in feat_tuples], batch_first=True, padding_value=0.0)
    #mask = torch.stack([feat_tuples[i][1] for i in range(len(feat_tuples))])
    labels = torch.nn.utils.rnn.pad_sequence([f[2] for f in feat_tuples], batch_first=True, padding_value=0.0)
    viseme_filenames = [feat_tuples[i][3] for i in range(len(feat_tuples))]
    
    return padded, labels,viseme_filenames
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

tftts_config

Using cuda device


{'sampling_rate': 22050,
 'fft_size': 512,
 'hop_size': 334,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 60,
 'trim_frame_size': 2048,
 'trim_hop_size': 512,
 'format': 'npy'}

In [35]:
input_dim = int(window_len / tftts_config["hop_size"] * tftts_config["num_mels"])
#print(stft_frames_per_window)
#print(tftts_config["hop_size"])
print(input_dim)
model = BiLSTMModel(hidden_size=512, input_dim=input_dim).to(device)

learning_rate = 0.0001
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
num_steps = 100000
print_loss_every = 50
eval_every = 500

batch = iter(train_dataloader)
accum_loss = 0

for t in range(num_steps):
    optimizer.zero_grad()
    train_features, train_mask, train_labels, train_files = next(batch, (None,None,None,None))
    #print(train_features.shape)
    #print(train_labels.shape)

    if train_features is None:
        batch = iter(train_dataloader)
        train_features, train_mask , train_labels, _ = next(batch)
    
    #train_mask = torch.unsqueeze(train_mask, 2)
    
    x = train_features.to(device)     
    
    y = train_labels.to(device) #* train_mask.to(device)
    
    preds = model(x) #* train_mask.to(device)
    
    #print(preds.shape)
    
    preds = torch.transpose(preds, 1,3).squeeze()
    #print(f"x {x.shape} preds {preds.shape} y {y.shape}")    
    #loss = torch.nn.functional.cross_entropy(preds, y)
    loss = torch.nn.functional.huber_loss(preds, y)
    #for i in range(preds.shape[1] - 1):
    #    loss += torch.nn.functional.cosine_embedding_loss(preds[:,i,:], preds[:,i+1,:], (torch.ones(preds.shape[0])).to(device))
    accum_loss += loss.item()
    if t % print_loss_every == 0:
        print(f"Step {t} Avg loss: {accum_loss / print_loss_every}")
        accum_loss = 0
    loss.backward()
    optimizer.step()
    
    
    
    if t > 0 and t % eval_every == 0:
        accum_loss = 0
        for test_features, test_mask, test_labels, _ in iter(test_dataloader):
            x = test_features.to(device)
            #x = torch.transpose(x, 1, 2)
            #x = torch.unsqueeze(x, dim=3)
            y = test_labels.to(device)
            preds = model(x)
            preds = torch.transpose(preds, 1,3)
            accum_loss += torch.nn.functional.mse_loss(preds, y).item()
            #accum_loss = torch.nn.functional.cross_entropy(preds, y)

        print(f"Test loss {accum_loss}")
        accum_loss = 0
    
#pred_probab = nn.Softmax(dim=1)(logits)
#y_pred = pred_probab.argmax(1)
#print(f"Predicted class: {y_pred}")

2640
Step 0 Avg loss: 0.00014744590036571025




Step 50 Avg loss: 0.01248262720182538
Step 100 Avg loss: 0.006710680797696113
Step 150 Avg loss: 0.0066580833867192265
Step 200 Avg loss: 0.00656532071530819
Step 250 Avg loss: 0.0064372603921219705
Step 300 Avg loss: 0.00647843457525596
Step 350 Avg loss: 0.006739921793341637
Step 400 Avg loss: 0.006546479612588882
Step 450 Avg loss: 0.0062484141043387354
Step 500 Avg loss: 0.006679508625529707




Test loss 0.025016657076776028
Step 550 Avg loss: 0.00644154991954565
Step 600 Avg loss: 0.006579419514164329
Step 650 Avg loss: 0.006532560531049967
Step 700 Avg loss: 0.006511105103418231
Step 750 Avg loss: 0.0065396463684737685
Step 800 Avg loss: 0.006689157695509493
Step 850 Avg loss: 0.006584917176514864
Step 900 Avg loss: 0.006538921017199755
Step 950 Avg loss: 0.006255963309668005
Step 1000 Avg loss: 0.006411707543302328
Test loss 0.02570384554564953
Step 1050 Avg loss: 0.00629792955936864
Step 1100 Avg loss: 0.0063398181553930046
Step 1150 Avg loss: 0.006382776042446494
Step 1200 Avg loss: 0.006392291933298111
Step 1250 Avg loss: 0.006244062534533441
Step 1300 Avg loss: 0.006384277283214033
Step 1350 Avg loss: 0.006148095363751054
Step 1400 Avg loss: 0.00653546198271215
Step 1450 Avg loss: 0.006418077489361167
Step 1500 Avg loss: 0.006615049056708813
Test loss 0.02088141767308116
Step 1550 Avg loss: 0.006351466856431216
Step 1600 Avg loss: 0.006736885532736778
Step 1650 Avg los

Step 9850 Avg loss: 0.0061470425873994825
Step 9900 Avg loss: 0.0059511422412469985
Step 9950 Avg loss: 0.006058933325111866
Step 10000 Avg loss: 0.005997133399359882
Test loss 0.03272348362952471
Step 10050 Avg loss: 0.006137542147189379
Step 10100 Avg loss: 0.005961675113067031
Step 10150 Avg loss: 0.005978057575412095
Step 10200 Avg loss: 0.0063346672896295786
Step 10250 Avg loss: 0.006146139418706298
Step 10300 Avg loss: 0.006047660261392593
Step 10350 Avg loss: 0.0060468278173357246
Step 10400 Avg loss: 0.006061419700272381
Step 10450 Avg loss: 0.006066484423354268
Step 10500 Avg loss: 0.0063798254728317265
Test loss 0.021921115461736917
Step 10550 Avg loss: 0.006094900658354163
Step 10600 Avg loss: 0.005985372327268123
Step 10650 Avg loss: 0.005904308834578842
Step 10700 Avg loss: 0.006312094386667013
Step 10750 Avg loss: 0.005930282201152295
Step 10800 Avg loss: 0.005987819293513894
Step 10850 Avg loss: 0.006143526223022491
Step 10900 Avg loss: 0.006132646608166397
Step 10950 Av

KeyboardInterrupt: 

In [None]:
#test_features, test_mask, test_labels, test_files = next(iter(test_dataloader))
#test_features, test_mask, test_labels, test_files = next(iter(train_dataloader))
##test_features = torch.transpose(test_features, 1, 2)
test_features = train_features
export_y_batch = model(test_features.to(device)) 
export_y = export_y_batch[0,:,:]
print(export_y.shape)
header = "Timecode,BlendShapeCount,eyeBlinkRight,eyeLookDownRight,eyeLookInRight,eyeLookOutRight,eyeLookUpRight,eyeSquintRight,eyeWideRight,eyeBlinkLeft,eyeLookDownLeft,eyeLookInLeft,eyeLookOutLeft,eyeLookUpLeft,eyeSquintLeft,eyeWideLeft,jawForward,jawRight,jawLeft,jawOpen,mouthClose,mouthFunnel,mouthPucker,mouthRight,mouthLeft,mouthSmileRight,mouthSmileLeft,mouthFrownRight,mouthFrownLeft,mouthDimpleRight,mouthDimpleLeft,mouthStretchRight,mouthStretchLeft,mouthRollLower,mouthRollUpper,mouthShrugLower,mouthShrugUpper,mouthPressRight,mouthPressLeft,mouthLowerDownRight,mouthLowerDownLeft,mouthUpperUpRight,mouthUpperUpLeft,browDownRight,browDownLeft,browInnerUp,browOuterUpRight,browOuterUpLeft,cheekPuff,cheekSquintRight,cheekSquintLeft,noseSneerRight,noseSneerLeft,tongueOut,HeadYaw,HeadPitch,HeadRoll,LeftEyeYaw,LeftEyePitch,LeftEyeRoll,RightEyeYaw,RightEyePitch,RightEyeRoll".split(',')
selected_output_indices = [header.index(x[0].lower() + x[1:]) for x in blendshapes]
num_visemes = len(blendshapes)
with open("output.csv", "w") as outfile:
    outfile.write(",".join(header) + "\n")
    timer_ms = 0
    print(export_y)
    for t in range(export_y.shape[1]):
        output = [str(0)] * len(header)
        second = str(int(timer_ms // 1000)).zfill(2)
        frame = (timer_ms % 1000) * target_framerate / 1000
        output[0] = f"00:00:{second}:{frame}"
        for viseme in range(num_visemes): 
            output[selected_output_indices[viseme]] = str(export_y[viseme,t,:].item())
        timer_ms += (1 / target_framerate) * 1000
        outfile.write(",".join(output) + "\n")

In [None]:
train_files[0]

In [None]:
new_header = "Timecode,BlendShapeCount,EyeBlinkLeft,EyeLookDownLeft,EyeLookInLeft,EyeLookOutLeft,EyeLookUpLeft,EyeSquintLeft,EyeWideLeft,EyeBlinkRight,EyeLookDownRight,EyeLookInRight,EyeLookOutRight,EyeLookUpRight,EyeSquintRight,EyeWideRight,JawForward,JawRight,JawLeft,JawOpen,MouthClose,MouthFunnel,MouthPucker,MouthRight,MouthLeft,MouthSmileLeft,MouthSmileRight,MouthFrownLeft,MouthFrownRight,MouthDimpleLeft,MouthDimpleRight,MouthStretchLeft,MouthStretchRight,MouthRollLower,MouthRollUpper,MouthShrugLower,MouthShrugUpper,MouthPressLeft,MouthPressRight,MouthLowerDownLeft,MouthLowerDownRight,MouthUpperUpLeft,MouthUpperUpRight,BrowDownLeft,BrowDownRight,BrowInnerUp,BrowOuterUpLeft,BrowOuterUpRight,CheekPuff,CheekSquintLeft,CheekSquintRight,NoseSneerLeft,NoseSneerRight,TongueOut,HeadYaw,HeadPitch,HeadRoll,LeftEyeYaw,LeftEyePitch,LeftEyeRoll,RightEyeYaw,RightEyePitch,RightEyeRoll"
remap = {h:(h[0].lower() + h[1:]) if h not in ["Timecode","BlendShapeCount","HeadYaw","HeadPitch","HeadRoll","LeftEyeYaw","LeftEyePitch","LeftEyeRoll","RightEyeYaw","RightEyePitch","RightEyeRoll"]  else h for h in new_header.split(",") }
for oh in remap.values():
    if oh not in header:
        print(oh)

def new_to_old(csv_df):
    return csv_df.rename(columns=remap)
df = preprocess_viseme("data/training/speaker_1/20210824_1/61.csv", pad_len_in_secs=pad_len_in_secs, 
                                   resample_to=target_framerate)
#print(df)
df = new_to_old(df)
cols = [c for c in df.columns if c not in ["jawOpen", "mouthClose", "Timecode"]]
df[cols] = 0
df = df[df["Timecode"] != 0]
df.to_csv("original.csv", index=False)
#new_to_old(df)[header].to_csv("original.csv", index=False)
df

In [36]:
torch.save(model, "bilstm.torch")

In [None]:
for item in ["MouthClose", "MouthFunnel", "MouthPucker", "JawOpen"]:
    print(new_header.split(",").index(item))


In [38]:
model = torch.load("bilstm.torch",map_location=torch.device('cpu'))
import torch.onnx 

# set the model to inference mode 
model.eval() 

# Let's create a dummy input tensor  
dummy_input = torch.randn(1, 119, 2640, requires_grad=True)  

# Export the model   
torch.onnx.export(model,         # model being run 
     dummy_input,       # model input (or a tuple for multiple inputs) 
     "bilstm.onnx",       # where to save the model  
     export_params=True,  # store the trained parameter weights inside the model file 
     opset_version=10,    # the ONNX version to export the model to 
     do_constant_folding=True,  # whether to execute constant folding for optimization 
     #input_names = ['modelInput'],   # the model's input names 
     #output_names = ['modelOutput'], # the model's output names 
     dynamic_axes={'modelInput' : {0 : 'batch_size'},    # variable length axes 
    'modelOutput' : {0 : 'batch_size'}}
                 ) 


  "or define the initial states (h0/c0) as inputs of the model. ")


In [39]:
import onnx
from onnx_tf.backend import prepare
import onnxruntime
model_onnx = onnx.load('bilstm.onnx')
tf_rep = prepare(model_onnx)
tf_rep.export_graph('./tf_model')



2021-09-19 00:50:56.130563: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 00:50:56.490301: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 00:50:56.491554: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 00:50:56.494104: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor






2021-09-19 00:51:02.395646: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.






INFO:tensorflow:Assets written to: ./tf_model/assets


INFO:tensorflow:Assets written to: ./tf_model/assets


In [40]:
import tensorflow_model_optimization as tfmot

quant_aware_model = tfmot.quantization.keras.quantize_model(tf_rep)
quant_aware_model.summary()

ValueError: `to_quantize` can only be a `tf.keras.Model` instance. Use the `quantize_annotate_layer` API to handle individual layers.You passed an instance of type: TensorflowRep.

In [41]:
import tensorflow as tf


converter = tf.lite.TFLiteConverter.from_saved_model("./tf_model")
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                       tf.lite.OpsSet.SELECT_TF_OPS]
converter.allow_custom_ops=False
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.experimental_enable_resource_variables = True

converter.experimental_new_converter =True
tflite_model = converter.convert()

# Save the model
with open("bilstm.tflite", 'wb') as f:
    f.write(tflite_model)

2021-09-19 00:51:24.084527: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:351] Ignored output_format.
2021-09-19 00:51:24.084557: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:354] Ignored drop_control_dependency.
2021-09-19 00:51:24.084563: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:360] Ignored change_concat_input_ranges.
2021-09-19 00:51:24.085352: I tensorflow/cc/saved_model/reader.cc:38] Reading SavedModel from: ./tf_model
2021-09-19 00:51:24.098702: I tensorflow/cc/saved_model/reader.cc:90] Reading meta graph with tags { serve }
2021-09-19 00:51:24.098737: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: ./tf_model
2021-09-19 00:51:24.106997: I tensorflow/cc/saved_model/loader.cc:211] Restoring SavedModel bundle.
2021-09-19 00:51:24.148006: I tensorflow/cc/saved_model/loader.cc:195] Running initialization op on SavedModel bundle at path: ./tf_model
2021-09-19 00:51:24.1

In [42]:
interpreter = tf.lite.Interpreter(model_path="bilstm.tflite")
interpreter.allocate_tensors()
    
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
input_shape = input_details[0]['shape']
output_details

INFO: Created TensorFlow Lite delegate for select TF ops.
2021-09-19 00:51:47.989188: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 00:51:47.989570: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 00:51:47.989887: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 00:51:47.990275: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-19 00:51:47.990593: I tensorflow/stream_executor/cuda/

[{'name': 'StatefulPartitionedCall:0',
  'index': 919,
  'shape': array([  1,   4, 119,   1], dtype=int32),
  'shape_signature': array([  1,   4, 119,   1], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}}]

In [None]:
csv = pd.read_csv("data/training/speaker_1/20210824_1/61.csv")
columns = [x for x in list(csv.columns) if "Eye" not in x]
columns.remove("Timecode")
columns.remove("BlendShapeCount")
csv[columns].var().sort_values()
#df = preprocess_viseme("data/training/speaker_1/20210824_1/61.csv", pad_len_in_secs=pad_len_in_secs, 
#                                   resample_to=target_framerate, blendshapes=["MouthClose","MouthFunnel"])
#df.shape
#[df.iloc[0]["EyeLookInLeft"]]
    #csv[columns] = pd.np.digitize(csv[columns], np.linspace(0,1,11))
    
    #split = csv["Timecode"].str.split(':')
    #minute = split.str[1].astype(int)
    #second = split.str[2].astype(int)
    #frame = split.str[3].astype(float)
    #minute -= minute[0]
    #ms
    #step = minute * 60 + second
    #csv["step"] = step
    #return csv.drop_duplicates(["step"])[["step", "MouthClose","MouthFunnel","MouthPucker","JawOpen"]]
    
# if we want to use softmax across each blendshape as a one-hot
    #return np.reshape(vals, (vals.shape[0], vals.shape[1], 1))
    #one_hot = np.zeros((vals.shape[0], vals.shape[1], 11, 1))
    #oh = np.eye(11)
    #for row in range(vals.shape[0]):
    #    for t in range(vals.shape[1]):
    #        one_hot[row, :, :, 0] = np.eye(11)[int(vals[row,t])-1]
    #return one_hot