In [2]:
!pip install soundfile
!pip install pandas
!pip install scipy
!pip install librosa
import torch
from torch.utils.data import Dataset
from pathlib import Path
import pandas as pd
import numpy as np
import soundfile as sf
import scipy
from scipy import fft
import functools
from torch import nn
import librosa


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [66]:
def stft(filepath, fft_size=348, hop_length = 348, win_length = 348, resample_to=22050):
    wdw = np.hanning(win_length)
    audio, rate = sf.read(filepath)
    audio = librosa.resample(audio, rate, resample_to)
    output = np.zeros((audio.shape[0] // hop_length, int(fft_size / 2 + 1)))
    i = 0
    while i < output.shape[0]:
        j = i*hop_length
        k = j+win_length
        if k <= audio.shape[0]:
            windowed = wdw * audio[j:k]
        elif j >= audio.shape[0]:
            break
        else:
            padded = np.zeros(win_length)
            padded[0:audio.shape[0] - j] = audio[j:]
            windowed = wdw * padded
        output[i,:] = fft.rfft(windowed,n=fft_size)
        i += 1
    return output, audio.shape[0]

def preprocess_viseme(csv, blendshapes=None):
    csv = pd.read_csv(csv)
    split = csv["Timecode"].str.split(':')
    minute = split.str[1].astype(int)
    second = split.str[2].astype(int)
    frame = split.str[3].astype(float)
    #minute -= minute[0]
    #ms
    step = minute * 60 + second
    csv["step"] = step
    #return csv.drop_duplicates(["step"])[["step", "MouthClose","MouthFunnel","MouthPucker","JawOpen"]]
    if blendshapes is None:
        return csv
    return csv[["Timecode","step"] + blendshapes]

# data_dir should be structured as follows:
# - speaker_id_1/
# - speaker_id_1/sample_id1.wav
# - speaker_id_1/sample_id1.csv
# - speaker_id_1/sample_id2.wav
# - speaker_id_1/sample_id2.wav
# - speaker_id_2/sample_id1.wav
# ..
class VisemeDataset(Dataset):
    def __init__(self, data_dir, audio_transform, viseme_transform):
        self.audio_transform = audio_transform
        self.audio_files = []
        self.visemes = []
        for file in list(Path(data_dir).rglob("*.wav")):
            self.audio_files.append(file)
            self.visemes.append(str(file).replace("wav", "csv"))

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        visemes = pd.read_csv(self.visemes[idx])
        fft, _ = self.audio_transform(self.audio_files[idx])
        # return STFT, viseme tensor
        return fft, torch.tensor(self.viseme_transform(visemes))
ds = VisemeDataset("./training_data", functools.partial(stft), preprocess_viseme)


In [63]:
fft_size=348
hop_length = 348
win_length = 348
sample_rate = 22050
data, num_samples = stft("./training_data/speaker_1/1.wav", fft_size=fft_size, hop_length=hop_length, win_length=win_length, resample_to=sample_rate)

blendshapes = ["MouthClose", "MouthFunnel", "MouthPucker", "JawOpen"]
labels = preprocess_viseme("training_data/speaker_1/1.csv", blendshapes)

framerate=59.97 # source framerate for video

# duration (ms) for each audio window
# this also corresponds to length of time for each output viseme
window_size = (win_length / sample_rate) * 1000




15.78231292517007

In [64]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

class SeparableConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, bias=False):
        super(SeparableConv2d, self).__init__()
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, 
                               groups=in_channels, bias=bias, padding=1)
        self.pointwise = nn.Conv2d(in_channels, out_channels, 
                               kernel_size=1, bias=bias)

    def forward(self, x):
        out = self.depthwise(x)
        out = self.pointwise(out)
        return out

class NeuralNetwork(nn.Module):
    def __init__(self, fft_length=1024, num_viseme=4):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            SeparableConv2d(1,1,4),
            #nn.Linear(174, 174),
            nn.ReLU(),
            SeparableConv2d(1,1,2),
            nn.ReLU(),
            #nn.Linear(175, 175),
            nn.ReLU(),
            #nn.Linear(1, fft_length),
            #nn.Linear(fft_length, num_viseme),
        )
        self.linear_out = nn.Linear(175, num_viseme)
        #self.attn = nn.MultiheadAttention(192, 4)

    def forward(self, x):
        return self.linear_out(torch.sum(self.linear_relu_stack(x), 2))
    
model = NeuralNetwork().to(device)

learning_rate = 0.00005
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

for t in range(2000):
    x = torch.Tensor([[stft("training_data/speaker_1/1.wav")[0]]]).to(device)
    y = torch.tensor(labels[blendshapes].values, dtype=torch.float).to(device)
    
    preds = model(x)
    
    loss = torch.nn.functional.mse_loss(preds, y)
    print(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


#pred_probab = nn.Softmax(dim=1)(logits)
#y_pred = pred_probab.argmax(1)
#print(f"Predicted class: {y_pred}")

Using cpu device
tensor(0.0837, grad_fn=<MseLossBackward>)




tensor(0.0824, grad_fn=<MseLossBackward>)
tensor(0.0814, grad_fn=<MseLossBackward>)
tensor(0.0806, grad_fn=<MseLossBackward>)
tensor(0.0800, grad_fn=<MseLossBackward>)
tensor(0.0794, grad_fn=<MseLossBackward>)
tensor(0.0788, grad_fn=<MseLossBackward>)
tensor(0.0783, grad_fn=<MseLossBackward>)
tensor(0.0779, grad_fn=<MseLossBackward>)
tensor(0.0774, grad_fn=<MseLossBackward>)
tensor(0.0770, grad_fn=<MseLossBackward>)
tensor(0.0766, grad_fn=<MseLossBackward>)
tensor(0.0762, grad_fn=<MseLossBackward>)
tensor(0.0758, grad_fn=<MseLossBackward>)
tensor(0.0755, grad_fn=<MseLossBackward>)
tensor(0.0751, grad_fn=<MseLossBackward>)
tensor(0.0748, grad_fn=<MseLossBackward>)
tensor(0.0745, grad_fn=<MseLossBackward>)
tensor(0.0742, grad_fn=<MseLossBackward>)
tensor(0.0738, grad_fn=<MseLossBackward>)
tensor(0.0735, grad_fn=<MseLossBackward>)
tensor(0.0732, grad_fn=<MseLossBackward>)
tensor(0.0729, grad_fn=<MseLossBackward>)
tensor(0.0727, grad_fn=<MseLossBackward>)
tensor(0.0724, grad_fn=<MseLossBac

tensor(0.0439, grad_fn=<MseLossBackward>)
tensor(0.0437, grad_fn=<MseLossBackward>)
tensor(0.0436, grad_fn=<MseLossBackward>)
tensor(0.0435, grad_fn=<MseLossBackward>)
tensor(0.0434, grad_fn=<MseLossBackward>)
tensor(0.0433, grad_fn=<MseLossBackward>)
tensor(0.0432, grad_fn=<MseLossBackward>)
tensor(0.0431, grad_fn=<MseLossBackward>)
tensor(0.0430, grad_fn=<MseLossBackward>)
tensor(0.0428, grad_fn=<MseLossBackward>)
tensor(0.0427, grad_fn=<MseLossBackward>)
tensor(0.0426, grad_fn=<MseLossBackward>)
tensor(0.0425, grad_fn=<MseLossBackward>)
tensor(0.0424, grad_fn=<MseLossBackward>)
tensor(0.0423, grad_fn=<MseLossBackward>)
tensor(0.0422, grad_fn=<MseLossBackward>)
tensor(0.0421, grad_fn=<MseLossBackward>)
tensor(0.0420, grad_fn=<MseLossBackward>)
tensor(0.0419, grad_fn=<MseLossBackward>)
tensor(0.0418, grad_fn=<MseLossBackward>)
tensor(0.0417, grad_fn=<MseLossBackward>)
tensor(0.0416, grad_fn=<MseLossBackward>)
tensor(0.0415, grad_fn=<MseLossBackward>)
tensor(0.0414, grad_fn=<MseLossBac

tensor(0.0287, grad_fn=<MseLossBackward>)
tensor(0.0286, grad_fn=<MseLossBackward>)
tensor(0.0286, grad_fn=<MseLossBackward>)
tensor(0.0285, grad_fn=<MseLossBackward>)
tensor(0.0285, grad_fn=<MseLossBackward>)
tensor(0.0284, grad_fn=<MseLossBackward>)
tensor(0.0284, grad_fn=<MseLossBackward>)
tensor(0.0284, grad_fn=<MseLossBackward>)
tensor(0.0283, grad_fn=<MseLossBackward>)
tensor(0.0283, grad_fn=<MseLossBackward>)
tensor(0.0282, grad_fn=<MseLossBackward>)
tensor(0.0282, grad_fn=<MseLossBackward>)
tensor(0.0281, grad_fn=<MseLossBackward>)
tensor(0.0281, grad_fn=<MseLossBackward>)
tensor(0.0280, grad_fn=<MseLossBackward>)
tensor(0.0280, grad_fn=<MseLossBackward>)
tensor(0.0280, grad_fn=<MseLossBackward>)
tensor(0.0279, grad_fn=<MseLossBackward>)
tensor(0.0279, grad_fn=<MseLossBackward>)
tensor(0.0278, grad_fn=<MseLossBackward>)
tensor(0.0278, grad_fn=<MseLossBackward>)
tensor(0.0277, grad_fn=<MseLossBackward>)
tensor(0.0277, grad_fn=<MseLossBackward>)
tensor(0.0277, grad_fn=<MseLossBac

tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0240, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBackward>)
tensor(0.0239, grad_fn=<MseLossBac

KeyboardInterrupt: 

In [65]:
header = "Timecode,BlendShapeCount,eyeBlinkRight,eyeLookDownRight,eyeLookInRight,eyeLookOutRight,eyeLookUpRight,eyeSquintRight,eyeWideRight,eyeBlinkLeft,eyeLookDownLeft,eyeLookInLeft,eyeLookOutLeft,eyeLookUpLeft,eyeSquintLeft,eyeWideLeft,jawForward,jawRight,jawLeft,jawOpen,mouthClose,mouthFunnel,mouthPucker,mouthRight,mouthLeft,mouthSmileRight,mouthSmileLeft,mouthFrownRight,mouthFrownLeft,mouthDimpleRight,mouthDimpleLeft,mouthStretchRight,mouthStretchLeft,mouthRollLower,mouthRollUpper,mouthShrugLower,mouthShrugUpper,mouthPressRight,mouthPressLeft,mouthLowerDownRight,mouthLowerDownLeft,mouthUpperUpRight,mouthUpperUpLeft,browDownRight,browDownLeft,browInnerUp,browOuterUpRight,browOuterUpLeft,cheekPuff,cheekSquintRight,cheekSquintLeft,noseSneerRight,noseSneerLeft,tongueOut,HeadYaw,HeadPitch,HeadRoll,LeftEyeYaw,LeftEyePitch,LeftEyeRoll,RightEyeYaw,RightEyePitch,RightEyeRoll"
header_indices = header.split(',')
selected_output_indices = [header_indices.index(x[0].lower() + x[1:]) for x in blendshapes]
print(selected_output_indices)
with open("output.csv", "w") as outfile:
    outfile.write(header + "\n")
    timer_ms = 0
    for i in range(y.shape[0]):
        output = [str(0)] * len(header_indices)
        for j in range(y.shape[1]): 
            second = str(int(timer_ms // 1000)).zfill(2)
            frame = (timer_ms % 1000) * framerate / 1000
            output[0] = f"00:00:{second}:{frame}"
            output[selected_output_indices[j]] = str(y[i,j].item())
        timer_ms += window_size
        outfile.write(",".join(output) + "\n")

[20, 21, 22, 19]


In [68]:
columns = {'two':'new_name'}

def new_to_old(csv_df):
    for label in csv_df.columns:
        csv_df
new_to_old(preprocess_viseme("training_data/speaker_1/1.csv"))    
    

Timecode
BlendShapeCount
EyeBlinkLeft
EyeLookDownLeft
EyeLookInLeft
EyeLookOutLeft
EyeLookUpLeft
EyeSquintLeft
EyeWideLeft
EyeBlinkRight
EyeLookDownRight
EyeLookInRight
EyeLookOutRight
EyeLookUpRight
EyeSquintRight
EyeWideRight
JawForward
JawRight
JawLeft
JawOpen
MouthClose
MouthFunnel
MouthPucker
MouthRight
MouthLeft
MouthSmileLeft
MouthSmileRight
MouthFrownLeft
MouthFrownRight
MouthDimpleLeft
MouthDimpleRight
MouthStretchLeft
MouthStretchRight
MouthRollLower
MouthRollUpper
MouthShrugLower
MouthShrugUpper
MouthPressLeft
MouthPressRight
MouthLowerDownLeft
MouthLowerDownRight
MouthUpperUpLeft
MouthUpperUpRight
BrowDownLeft
BrowDownRight
BrowInnerUp
BrowOuterUpLeft
BrowOuterUpRight
CheekPuff
CheekSquintLeft
CheekSquintRight
NoseSneerLeft
NoseSneerRight
TongueOut
HeadYaw
HeadPitch
HeadRoll
LeftEyeYaw
LeftEyePitch
LeftEyeRoll
RightEyeYaw
RightEyePitch
RightEyeRoll
step
