RNN model to transform a sequence of phones:

sil sil sil h h h eh eh eh eh eh eh l l l oh oh oh oh oh oh

to a corresponding sequence of blendshape weights:

[ [0.2,0.0,0.9,0.1], [0.03,0.0,0.0,0.6], ... ]

The phone sequence (alignments) can be generated by forced alignment (for training data) or a TTS model (for inference data).


In [1]:
from src.dataset import VisemeAlignmentDataset


import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from collections import OrderedDict
import pandas as pd
import math

import numpy as np
import soundfile as sf
import scipy
import functools
from torch import nn
import math
import math
import yaml
from tensorflow_tts.inference import AutoConfig
import json

import torch.onnx 
import onnx
from onnx_tf.backend import prepare
import onnxruntime

from src.phones import load_symbols, combine_related_phones
from src.config import VisemeModelConfiguration

torch.__version__

device = 'cuda' if torch.cuda.is_available() else 'cpu'

print('Using {} device'.format(device))

config = VisemeModelConfiguration(batch_size=20)
config.save("output/viseme_model.json")
config.model_config

symbol_ids = load_symbols("/home/hydroxide/projects/polyvox/polyvox_framework/assets/symbol_ids.txt")
symbol_ids_copy, pad_sym = combine_related_phones(symbol_ids)


 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Using cpu device


In [2]:
from src.viseme import preprocess_viseme 
from src.alignments import preprocess_alignments

from decimal import Decimal
import math

training_data = VisemeAlignmentDataset(
    "./data/training/", 
    functools.partial(
        preprocess_viseme, 
        blendshapes=config.sourceKeys, 
    ), 
    functools.partial(
        preprocess_alignments, 
        phone_ids=symbol_ids_copy,
        framerate=30
    ),
    pad_value=len(symbol_ids)+1
)
test_data = VisemeAlignmentDataset(
      "./data/test/", 
        functools.partial(
            preprocess_viseme, 
            blendshapes=config.sourceKeys, 
        ), 
        functools.partial(
            preprocess_alignments, 
            phone_ids=symbol_ids_copy,
            framerate=30
        ),
        pad_value=pad_sym
)

collate_fn=functools.partial(training_data.collate, pad_val=pad_sym)
train_dataloader = DataLoader(training_data, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
batch = iter(train_dataloader)

xs, ys, xlens, ylens, _ = next(batch, (None,None,None,None))
xs



tensor([[56, 56, 56,  ..., 60, 60, 60],
        [56, 56, 56,  ..., 60, 60, 60],
        [56, 56, 56,  ..., 60, 60, 60],
        ...,
        [56, 56, 56,  ..., 60, 60, 60],
        [56, 56, 56,  ..., 60, 60, 60],
        [56, 56, 56,  ..., 60, 60, 60]], dtype=torch.int32)

In [3]:
class BiRNNModel(nn.Module):
    def __init__(self, phone_edim=128, phone_map=None, hdim=512, num_visemes=4):
        super(BiRNNModel, self).__init__()
        
        self.phone_embedding = nn.Embedding(len(phone_map),phone_edim)
        
        self.rnn = torch.nn.LSTM(phone_edim, hdim, 1, bidirectional=True, batch_first=True)
                
        self.proj_out = torch.nn.Sequential(
                torch.nn.Linear(hdim*2, hdim*2),
                torch.nn.ReLU(),
                torch.nn.Linear(hdim*2, num_visemes)
        )
        
        
    def forward(self, phones):
        phone_emb = self.phone_embedding(phones)
        out, (h,c) = self.rnn(phone_emb)
        return self.proj_out(out)

model = BiRNNModel(
    num_visemes=len(config.model_config["targetNames"]),
    hdim=256,
    phone_map=symbol_ids
).to(device)


learning_rate = 0.0001
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)



In [4]:
num_steps = 300000
print_loss_every = 100
eval_every = 1000

accum_loss = 0

for t in range(num_steps):
    optimizer.zero_grad()
        
    xs, ys, xlens, ylens, _ = next(batch, (None,None,None,None,None))
    
    if xs is None:
        batch = iter(train_dataloader)
        xs, ys, xlens, ylens, _ = next(batch, (None,None,None,None, None))

    preds = model(xs.to(device)) 
    # print(preds.size())
    # print(ys.size())
    
    loss = torch.nn.functional.mse_loss(preds, ys.to(device))   
    #loss = torch.nn.functional.huber_loss(preds, ys.to(device))
    loss.backward()
    optimizer.step()
    
    accum_loss += loss.item()
    if t > 0 and t % print_loss_every == 0:
        print(f"Step {t} Avg loss: {accum_loss / print_loss_every}")
        accum_loss = 0

    if t > 0 and t % eval_every == 0:
        accum_loss = 0
        for xs, ys, xlens, ylens, _ in iter(test_dataloader):
            preds = model(xs.to(device))
            accum_loss += torch.nn.functional.mse_loss(preds, ys.to(device)).item()
            #accum_loss += torch.nn.functional.huber_loss(preds, ys.to(device)).item()

        print(f"Test loss {accum_loss}")
        accum_loss = 0
    


Step 100 Avg loss: 290.2702086830139
Step 200 Avg loss: 3.267869622707367
Step 300 Avg loss: 2.018344486951828
Step 400 Avg loss: 1.3594227051734924
Step 500 Avg loss: 0.926131187081337
Step 600 Avg loss: 0.5963400167226791
Step 700 Avg loss: 0.4000048930943012
Step 800 Avg loss: 0.26594354927539826
Step 900 Avg loss: 0.2334109976887703
Step 1000 Avg loss: 0.1545709589868784
Test loss 5.065923064947128
Step 1100 Avg loss: 0.1536829497665167
Step 1200 Avg loss: 0.12930784290656447
Step 1300 Avg loss: 0.1308026854880154
Step 1400 Avg loss: 0.08978305112570524
Step 1500 Avg loss: 0.10377374183386565
Step 1600 Avg loss: 0.10120152022689581
Step 1700 Avg loss: 0.08646522745490075


KeyboardInterrupt: 

In [5]:
torch.save(model, f"output/{config.model_name}.torch")

In [6]:
model = torch.load(f"output/{config.model_name}.torch",map_location=torch.device('cpu'))

model.eval() 

dummy_input = torch.zeros(1, 99, requires_grad=False,dtype=torch.int)

# Export the model   
torch.onnx.export(model,         # model being run 
     dummy_input,      # model input (or a tuple for multiple inputs) 
     f"output/{config.model_name}.onnx",       # where to save the model  
     export_params=True,  # store the trained parameter weights inside the model file 
     opset_version=10,    # the ONNX version to export the model to 
     do_constant_folding=True,  # whether to execute constant folding for optimization 
     #input_names = ['audio_feats', 'text_feats'],   # the model's input names 
     output_names = ['modelOutput'], # the model's output names ,
) 

model_onnx = onnx.load(f'output/{config.model_name}.onnx')

tf_rep = prepare(model_onnx)
tf_rep.export_graph('./output/tf_model')

  "or define the initial states (h0/c0) as inputs of the model. ")
2022-02-07 16:49:08.186280: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-02-07 16:49:08.186372: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: AVINIUM
2022-02-07 16:49:08.186385: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: AVINIUM
2022-02-07 16:49:08.186624: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.57.2
2022-02-07 16:49:08.186726: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.57.2
2022-02-07 16:49:08.186740: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.57.2
2022-02-07 16:49:08.264045: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural N

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor






2022-02-07 16:49:13.068273: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.






INFO:tensorflow:Assets written to: ./output/tf_model/assets


INFO:tensorflow:Assets written to: ./output/tf_model/assets


In [8]:
import tensorflow as tf

converter = tf.lite.TFLiteConverter.from_saved_model("./output/tf_model")
print("Built converter")

converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                       tf.lite.OpsSet.SELECT_TF_OPS]
converter.allow_custom_ops=False
#converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.experimental_enable_resource_variables = True

converter.experimental_new_converter =True
tflite_model = converter.convert()
print("Converted")

# Save the model
outfile=f'./output/{config.model_config["modelPath"]}'
with open(outfile, 'wb') as f:
    f.write(tflite_model)

Built converter
Converted


2022-02-07 16:49:53.980350: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:351] Ignored output_format.
2022-02-07 16:49:53.980388: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:354] Ignored drop_control_dependency.
2022-02-07 16:49:53.980395: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:360] Ignored change_concat_input_ranges.
2022-02-07 16:49:53.980626: I tensorflow/cc/saved_model/reader.cc:38] Reading SavedModel from: ./output/tf_model
2022-02-07 16:49:53.984889: I tensorflow/cc/saved_model/reader.cc:90] Reading meta graph with tags { serve }
2022-02-07 16:49:53.984925: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: ./output/tf_model
2022-02-07 16:49:53.994863: I tensorflow/cc/saved_model/loader.cc:211] Restoring SavedModel bundle.
2022-02-07 16:49:54.024315: I tensorflow/cc/saved_model/loader.cc:195] Running initialization op on SavedModel bundle at path: ./output/tf_model


In [None]:
interpreter = tf.lite.Interpreter(model_path=outfile)
interpreter.allocate_tensors()
    
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
#input_shape = input_details[0]['shape']
#output_details
input_details

In [None]:
csv = pd.read_csv("data/training/speaker_1/20210824_1/61.csv")
columns = [x for x in list(csv.columns) if "Eye" not in x]
columns.remove("Timecode")
columns.remove("BlendShapeCount")
csv[columns].var().sort_values()
#df = preprocess_viseme("data/training/speaker_1/20210824_1/61.csv", pad_len_in_secs=pad_len_in_secs, 
#                                   resample_to=target_framerate, blendshapes=["MouthClose","MouthFunnel"])
#df.shape
#[df.iloc[0]["EyeLookInLeft"]]
    #csv[columns] = pd.np.digitize(csv[columns], np.linspace(0,1,11))
    
    #split = csv["Timecode"].str.split(':')
    #minute = split.str[1].astype(int)
    #second = split.str[2].astype(int)
    #frame = split.str[3].astype(float)
    #minute -= minute[0]
    #ms
    #step = minute * 60 + second
    #csv["step"] = step
    #return csv.drop_duplicates(["step"])[["step", "MouthClose","MouthFunnel","MouthPucker","JawOpen"]]
    
# if we want to use softmax across each blendshape as a one-hot
    #return np.reshape(vals, (vals.shape[0], vals.shape[1], 1))
    #one_hot = np.zeros((vals.shape[0], vals.shape[1], 11, 1))
    #oh = np.eye(11)
    #for row in range(vals.shape[0]):
    #    for t in range(vals.shape[1]):
    #        one_hot[row, :, :, 0] = np.eye(11)[int(vals[row,t])-1]
    #return one_hot