An RNN model that transforms a sequence of phones:

sil sil sil h h h eh eh eh eh eh eh l l l oh oh oh oh oh oh

to a corresponding sequence of blendshape weights:

[ [0.2,0.0,0.9,0.1], [0.03,0.0,0.0,0.6], ... ]

The phone sequence (the alignments) can be produced either by forced alignment (for training data) or by a TTS model (at inference time).


In [1]:
from src.dataset import VisemeAlignmentDataset


import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from collections import OrderedDict
import pandas as pd
import math

import numpy as np
import soundfile as sf
import functools
from torch import nn
import math
import math
import yaml

import torch.onnx 
import onnx
from onnx_tf.backend import prepare

from src.phones import load_symbols, combine_related_phones
from src.config import VisemeModelConfiguration

# Reference the PyTorch version in use (shown only when it is the last cell expression).
torch.__version__

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print('Using {} device'.format(device))

# Build the model configuration and save it alongside the other outputs.
config = VisemeModelConfiguration(batch_size=6)
config.save("output/viseme_model.json")
config.model_config

# Load the raw phonetic symbols, then merge related phones into shared ids.
symbol_ids = load_symbols("/home/hydroxide/projects/polyvox/polyvox_framework/assets/symbol_ids.txt")
symbol_ids_copy, num_syms = combine_related_phones(symbol_ids)

# The highest id is used as the padding id.
pad_id = num_syms - 1

# Dump the merged table, one "<symbol> <id>" pair per line.
with open("output/symbol_ids.txt", "w") as outfile:
    outfile.writelines(
        "%s %d\n" % (symbol, symbol_ids_copy[symbol])
        for symbol in symbol_ids_copy
    )

from src.viseme import preprocess_viseme 
from src.alignments import preprocess_alignments

import math

# Both splits use the same preprocessing, so build the two callables once.
viseme_preprocessor = functools.partial(
    preprocess_viseme,
    blendshapes=config.sourceKeys,
    framerate=30,
)
alignment_preprocessor = functools.partial(
    preprocess_alignments,
    phone_ids=symbol_ids_copy,
    framerate=30,
)

training_data = VisemeAlignmentDataset(
    "./data/training/",
    viseme_preprocessor,
    alignment_preprocessor,
    pad_value=pad_id,
)
test_data = VisemeAlignmentDataset(
    "./data/test/",
    viseme_preprocessor,
    alignment_preprocessor,
    pad_value=pad_id,
)

# Both loaders share the training dataset's collate function, padding with pad_id.
collate_fn = functools.partial(training_data.collate, pad_val=pad_id)
# NOTE(review): shuffle=False also for the training loader - confirm this is intended.
train_dataloader = DataLoader(training_data, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

# Peek at one batch through a throwaway iterator. The fallback tuple must have
# as many entries as there are unpack targets (five), otherwise an empty loader
# raises ValueError instead of yielding Nones.
xs, ys, xlens, ylens, _ = next(iter(train_dataloader), (None,) * 5)

# Fresh iterator for the training loop, so the peek above does not make the
# first epoch skip its first batch.
batch = iter(train_dataloader)

xs

class BiRNNModel(nn.Module):
    """Map a sequence of phone ids to per-step viseme (blendshape) weights.

    Pipeline: phone embedding -> single-layer (bi)LSTM -> two-layer MLP,
    with the final output clamped to the range [0, 1].
    """

    def __init__(self, phone_edim=128, num_phones=None, hdim=512, num_visemes=4,
                 bidirectional=True, padding_idx=None):
        """Build the network.

        Args:
            phone_edim: size of each phone embedding vector.
            num_phones: number of distinct phone ids (embedding rows). Required.
            hdim: LSTM hidden size per direction.
            num_visemes: number of output values predicted per time step.
            bidirectional: run the LSTM in both directions when True.
            padding_idx: optional phone id treated as padding by the embedding
                (zero-initialised vector that receives no gradient). The default
                ``None`` keeps every id trainable.

        Raises:
            TypeError: if ``num_phones`` is not supplied.
        """
        super().__init__()

        # Fail early with a readable message instead of an opaque error from
        # inside nn.Embedding when the vocabulary size is left at its default.
        if num_phones is None:
            raise TypeError(
                "BiRNNModel requires num_phones (the number of distinct phone ids)"
            )

        self.phone_embedding = nn.Embedding(num_phones, phone_edim, padding_idx=padding_idx)

        self.rnn = torch.nn.LSTM(phone_edim, hdim, 1, bidirectional=bidirectional, batch_first=True)

        # A bidirectional LSTM concatenates both directions on the feature axis.
        proj_dim = hdim * 2 if bidirectional else hdim
        self.proj_out = torch.nn.Sequential(
            torch.nn.Linear(proj_dim, proj_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(proj_dim, num_visemes),
        )

    def forward(self, phones):
        """Predict viseme weights for every step of ``phones``.

        Args:
            phones: integer tensor of phone ids, batch first: (batch, seq).

        Returns:
            Tensor of shape (batch, seq, num_visemes) with values in [0, 1].
        """
        phone_emb = self.phone_embedding(phones)
        out, _ = self.rnn(phone_emb)
        return torch.clamp(self.proj_out(out), min=0, max=1)

# One output per configured target name.
num_targets = len(config.model_config["targetNames"])

model = BiRNNModel(
    num_visemes=num_targets,
    hdim=256,
    num_phones=num_syms,
    bidirectional=True,
)
model = model.to(device)

# Adam with a fixed learning rate.
learning_rate = 0.000001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model
print(f"Training dataset length : {len(training_data)}")

 The versions of TensorFlow you are currently using is 2.8.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Using cuda device
ID 20211003_MySlate_25_18 39 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37]
ID 20211003_MySlate_25_19 81 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79]
ID 20211003_MySlate_25_20 51 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]
ID 20211003_MySlate_25_21 54 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52]
ID 20211003_MySlate_25_22 42 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40]
ID 20211003_MySlate_25_23 45 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43]
ID 20211003_MySlate_25_24 72 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55, 58, 61, 64, 67, 70]
ID 20211003_MySlate_25_25 51 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]
ID 20211003_MySlate_25_26 57 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55]
ID 20211003_MySlate_25_27 78 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 

In [2]:
import functools
from src.alignments import preprocess_alignments
from src.dataset import VisemeAlignmentDataset
# Rebuild the training dataset without a pad value and inspect the first item.
viseme_fn = functools.partial(
    preprocess_viseme,
    blendshapes=config.sourceKeys,
    framerate=30,
)
alignment_fn = functools.partial(
    preprocess_alignments,
    phone_ids=symbol_ids_copy,
    framerate=30,
)
ds = VisemeAlignmentDataset("data/training", viseme_fn, alignment_fn, None)

# Sizes of the first two elements of the first dataset item.
print(ds[0][0].size())
ds[0][1].size()

ID 20211003_MySlate_25_18 39 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37]
ID 20211003_MySlate_25_19 81 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79]
ID 20211003_MySlate_25_20 51 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]
ID 20211003_MySlate_25_21 54 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52]
ID 20211003_MySlate_25_22 42 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40]
ID 20211003_MySlate_25_23 45 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43]
ID 20211003_MySlate_25_24 72 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55, 58, 61, 64, 67, 70]
ID 20211003_MySlate_25_25 51 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]
ID 20211003_MySlate_25_26 57 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55]
ID 20211003_MySlate_25_27 78 [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55

torch.Size([39, 4])

In [3]:
def make_pad_mask(lengths, maxlen=None):
    """Build a boolean mask that is True on padded positions.

    Args:
        lengths: 1-D tensor (or list/tuple) holding the valid length of each
            sequence in the batch.
        maxlen: optional width of the mask. Defaults to the longest length in
            ``lengths``; pass it explicitly when the padded batch is wider.

    Returns:
        Bool tensor of shape (batch, maxlen), on the same device as
        ``lengths``, where ``mask[i, t]`` is True iff ``t >= lengths[i]``.
        An empty batch yields an empty mask instead of raising.
    """
    lengths = torch.as_tensor(lengths)

    # max() of an empty tensor raises, so handle the empty batch explicitly.
    if lengths.numel() == 0:
        width = 0 if maxlen is None else maxlen
        return torch.zeros((0, width), dtype=torch.bool, device=lengths.device)

    if maxlen is None:
        maxlen = lengths.max().item()

    positions = torch.arange(0, maxlen, dtype=torch.int64, device=lengths.device)

    # Broadcasting (1, maxlen) against (batch, 1) replaces the explicit repeats.
    return positions.unsqueeze(0) >= lengths.unsqueeze(1)

In [4]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

epoch = 0
num_steps = 300000
log_train_steps = 25

accum_loss = 0
steps_since_log = 0
# Element-wise loss so padded frames can be zeroed before summing.
mse_loss = nn.MSELoss(reduction='none')


def _masked_sse(xs, ys, xlens):
    """Return the summed squared error of the model on one batch, ignoring padded frames."""
    preds = model(xs.to(device))
    loss = mse_loss(preds, ys.to(device))
    # The mask is True on padded positions; keep it on the same device as the loss.
    mask = make_pad_mask(torch.as_tensor(xlens)).to(loss.device)
    loss[mask] = 0
    return loss.sum()


def _evaluate(dataloader):
    """Return the total masked loss over `dataloader` without tracking gradients."""
    total = 0.0
    model.eval()
    with torch.no_grad():
        for xs, ys, xlens, ylens, _ in dataloader:
            total += _masked_sse(xs, ys, xlens).item()
    model.train()
    return total


try:
    for t in range(num_steps):
        batch_items = next(batch, None)

        if batch_items is None:
            # Training iterator exhausted: one epoch is done. Log the test loss
            # in its own variable so the running training average is not
            # clobbered, then start the next pass over the training data.
            writer.add_scalar('Loss/test', _evaluate(test_dataloader), epoch)
            epoch += 1

            batch = iter(train_dataloader)
            batch_items = next(batch, None)
            if batch_items is None:
                raise RuntimeError("train_dataloader yielded no batches")

        xs, ys, xlens, ylens, _ = batch_items

        optimizer.zero_grad()
        loss = _masked_sse(xs, ys, xlens)
        loss.backward()
        optimizer.step()

        accum_loss += loss.item()
        steps_since_log += 1
        if t > 0 and t % log_train_steps == 0:
            # Divide by the number of steps actually accumulated (the first
            # window holds one extra step because t starts at 0).
            writer.add_scalar('Loss/train', accum_loss / steps_since_log, t)
            accum_loss = 0
            steps_since_log = 0
finally:
    # Keep the logged scalars even when training is interrupted from the notebook.
    writer.flush()
        
    


KeyboardInterrupt: 

In [None]:
# Run both preprocessors on a single recording for manual inspection.
example_ctm = "/home/hydroxide/projects/polyvox/viseme_prediction/data/training/NickF4/MySlate_45_Nic1.2_3.ctm"
example_csv = "/home/hydroxide/projects/polyvox/viseme_prediction/data/training/NickF4/MySlate_45_Nic1.2_3.csv"

# NOTE(review): framerate=14 here, while the datasets above use 30 - confirm intended.
preprocess_alignments(example_ctm, phone_ids=symbol_ids_copy, framerate=14)
preprocess_viseme(example_csv)

In [5]:
# Serialise the whole model object (not just its state dict) under the configured name.
checkpoint_path = f"output/{config.model_name}.torch"
torch.save(model, checkpoint_path)
torch.__version__

'1.11.0+cu102'

In [6]:
# Reload the saved model on the CPU and switch it to inference mode.
model = torch.load(
    f"output/{config.model_name}.torch",
    map_location=torch.device('cpu'),
)
model.eval()

# Example input used to trace the graph: one sequence of 181 integer phone ids.
example_phones = torch.zeros(1, 181, dtype=torch.int)

# Export the model to ONNX.
torch.onnx.export(
    model,
    example_phones,
    f"output/{config.model_name}.onnx",
    export_params=True,          # store the trained weights inside the file
    opset_version=10,            # ONNX opset to target
    do_constant_folding=True,    # fold constants at export time
    input_names=['phones'],
    output_names=['modelOutput'],
    dynamic_axes={"phones": [1]},  # axis 1 of the input may vary
)



  "Automatically generated names will be applied to each dynamic axes of input {}".format(key))
  "or define the initial states (h0/c0) as inputs of the model. ")


In [None]:
import tensorflow
tensorflow.__version__

# Load the exported ONNX graph and convert it with the onnx-tf backend.
onnx_path = f'output/{config.model_name}.onnx'
model_onnx = onnx.load(onnx_path)
tf_rep = prepare(model_onnx)

# Write the converted graph to disk for the TFLite converter.
tf_rep.export_graph('./output/tf_model')

In [None]:
# Display the input declarations of the loaded ONNX graph.
model_onnx.graph.input

In [None]:
import tensorflow as tf

converter = tf.lite.TFLiteConverter.from_saved_model("./output/tf_model")
print("Built converter")

# Restrict the conversion to built-in TFLite ops
# (tf.lite.OpsSet.SELECT_TF_OPS is deliberately left out) and forbid custom ops.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]
converter.allow_custom_ops = False

# Converter switches left disabled:
#converter._experimental_lower_tensor_list_ops = False
#converter.optimizations = [tf.lite.Optimize.DEFAULT]
#converter.experimental_enable_resource_variables = True
#converter.experimental_new_quantizer = False
#converter.experimental_new_converter =True

tflite_model = converter.convert()
print("Converted")

# Save the model
#outfile=f'./output/{config.model_config["modelPath"]}'
outfile = './output/bilstm.tflite'
with open(outfile, 'wb') as tflite_file:
    tflite_file.write(tflite_model)

In [None]:
# Load the converted model and run one all-zero input through it.
interpreter = tf.lite.Interpreter(model_path=outfile)

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Resize the first input to a single sequence of 181 steps, then allocate.
phones_index = input_details[0]["index"]
interpreter.resize_tensor_input(phones_index, [1, 181])
interpreter.allocate_tensors()

xs = tf.zeros([1, 181], tf.int32)

# The input index is looked up again after allocation, as in the original flow.
interpreter.set_tensor(input_details[0]['index'], xs)
interpreter.invoke()

input_details

In [None]:
# Rank the non-eye blendshape columns of one recording by variance.
csv = pd.read_csv("data/training/speaker_1/20210824_1/61.csv")

# Drop every column mentioning "Eye", then the two bookkeeping columns
# (remove() raises if either is missing).
columns = [name for name in csv.columns if "Eye" not in name]
for bookkeeping in ("Timecode", "BlendShapeCount"):
    columns.remove(bookkeeping)

csv[columns].var().sort_values()
#df = preprocess_viseme("data/training/speaker_1/20210824_1/61.csv", pad_len_in_secs=pad_len_in_secs, 
#                                   resample_to=target_framerate, blendshapes=["MouthClose","MouthFunnel"])
#df.shape
#[df.iloc[0]["EyeLookInLeft"]]
    #csv[columns] = pd.np.digitize(csv[columns], np.linspace(0,1,11))
    
    #split = csv["Timecode"].str.split(':')
    #minute = split.str[1].astype(int)
    #second = split.str[2].astype(int)
    #frame = split.str[3].astype(float)
    #minute -= minute[0]
    #ms
    #step = minute * 60 + second
    #csv["step"] = step
    #return csv.drop_duplicates(["step"])[["step", "MouthClose","MouthFunnel","MouthPucker","JawOpen"]]
    
# if we want to use softmax across each blendshape as a one-hot
    #return np.reshape(vals, (vals.shape[0], vals.shape[1], 1))
    #one_hot = np.zeros((vals.shape[0], vals.shape[1], 11, 1))
    #oh = np.eye(11)
    #for row in range(vals.shape[0]):
    #    for t in range(vals.shape[1]):
    #        one_hot[row, :, :, 0] = np.eye(11)[int(vals[row,t])-1]
    #return one_hot