In [None]:
import os
import subprocess
from pathlib import PurePath

import numpy as np
import pandas as pd
from pydub import AudioSegment

In [None]:
# Relative path to the rhubarb.exe binary that is invoked once per audio file below.
rhubarbFilePath = r'.\Rhubarb-Lip-Sync-1.13.0-Windows\Rhubarb-Lip-Sync-1.13.0-Windows\rhubarb.exe'

In [None]:
# Load the CSV describing the test-clean subset; the columns used later are
# subset, reader_id and chapter_id.
csvPath = './dataset/test-clean.csv'
df = pd.read_csv(csvPath)

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [None]:
# Keep only the columns that are needed to build each chapter's directory path.
locatorColumns = ['subset', 'reader_id', 'chapter_id']
new_df = df[locatorColumns]
new_df.head(n=5)

In [None]:
# Convert the selected columns to a NumPy array (one row per CSV record) and
# show the first ten rows.
npdf = new_df.to_numpy()
npdf[:10]

In [None]:
# Collect the path of every .flac file named in the per-chapter transcript files.
# Each row of `npdf` identifies a chapter directory; that directory holds a
# "<reader>-<chapter>.trans.txt" file whose lines begin with a file id.
filePaths = []

for subset, readerid, chapterid in npdf:
    subset = subset.strip()
    folderPath = f'./dataset/{subset}/LibriSpeech/{subset}/{readerid}/{chapterid}/'
    textFile = f'{readerid}-{chapterid}.trans.txt'
    with open(folderPath + textFile, 'r') as f:
        # Iterate the file directly so the transcript is streamed line by line.
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line: it carries no file id, and would
                # otherwise produce a bogus "<folder>\n.flac" entry.
                continue
            filePaths.append(folderPath + tokens[0] + '.flac')

In [None]:
# Show the first ten collected .flac paths.
filePaths[:10]

In [None]:
# Number of .flac paths collected; the conversion loop below uses it to report progress.
total = len(filePaths)
total

In [None]:
# Convert every .flac file to .wav and run Rhubarb on the .wav to write a JSON file
# next to it. Processing stops at the first file for which Rhubarb does not exit with 0.
for i, filePath in enumerate(filePaths):
    file_path = PurePath(filePath)
    # with_suffix swaps only the final extension; str.replace would also rewrite any
    # earlier ".flac" that happens to occur inside a directory name.
    wavFilePath = str(file_path.with_suffix('.wav'))
    jsonFilePath = str(file_path.with_suffix('.json'))

    flac_tmp_audio_data = AudioSegment.from_file(file_path, file_path.suffix[1:])
    # export() returns the open output file object; close it before Rhubarb reads the .wav.
    flac_tmp_audio_data.export(wavFilePath, format="wav").close()

    # Pass the arguments as a list (no shell involved) so that paths containing
    # spaces or shell metacharacters reach Rhubarb unchanged.
    try:
        res = subprocess.run([rhubarbFilePath, '-f', 'json', wavFilePath, '-o', jsonFilePath]).returncode
    except OSError as err:
        # The executable could not be started at all; treat it like a failed run.
        print(f"could not launch {rhubarbFilePath}: {err}")
        res = -1
    if res != 0:
        print(f"{filePath} processing failed")
        break

    # Report whenever another 10 % boundary has been crossed. Integer arithmetic
    # replaces the float-modulo test, which only fired when the percentage was an
    # exact multiple of 10, and the count is 1-based so it matches the files done.
    done = i + 1
    if done * 10 // total != i * 10 // total:
        print(f"{done} ({100 * done / total:.1f} %) files processed")

In [None]:
# Derive the .wav and .json path of every recording from its .flac path.
filePaths_wav, filePaths_json = [], []
for flacPath in filePaths:
    filePaths_wav.append(flacPath.replace('.flac', '.wav'))
    filePaths_json.append(flacPath.replace('.flac', '.json'))

In [None]:
import json
import librosa
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Cut every recording into one clip per mouth cue listed in its JSON file, save the
# clips as "<recording>/<i>.wav", and describe each clip by its 50 MFCC coefficients
# averaged over frames. `features` collects [mfcc_mean, cue_value] pairs.
# NOTE(review): the model cells further down declare input_shape=(40,), so these
# 50-coefficient vectors would not fit them as-is.
features = []
wavCount = len(filePaths_wav)

for idx, (wavPath, jsonPath) in enumerate(zip(filePaths_wav, filePaths_json)):
    # The context manager closes the JSON file; a bare open() left one handle
    # open per recording.
    with open(jsonPath, 'r') as jsonFile:
        timestamps = json.load(jsonFile)['mouthCues']
    audio = AudioSegment.from_wav(wavPath)

    clipDir = wavPath.replace('.wav', '')
    # exist_ok lets the cell be re-run after the clip directories already exist
    # (os.mkdir raised FileExistsError in that case).
    os.makedirs(clipDir, exist_ok=True)

    for i, cue in enumerate(timestamps):
        # pydub slices in milliseconds; the cue times are scaled by 1000
        # (presumably from seconds - confirm against the Rhubarb output format).
        start = cue['start'] * 1000
        end = cue['end'] * 1000
        clipPath = clipDir + '/' + str(i) + '.wav'
        # export() returns the open output file; close it before librosa reads the clip.
        audio[start:end].export(clipPath, format="wav").close()
        data, samplingRate = librosa.load(clipPath)
        mfccs = np.mean(librosa.feature.mfcc(y=data, sr=samplingRate, n_mfcc=50).T, axis=0)
        features.append([mfccs, cue['value']])

    # Report whenever another 5 % boundary has been crossed; integer arithmetic
    # avoids depending on a float percentage being an exact multiple of 5.
    done = idx + 1
    if done * 20 // wavCount != idx * 20 // wavCount:
        print(f"{done} ({100 * done / wavCount:.1f} %) files processed")

In [None]:
# Rebuild `features` with 40 MFCC coefficients per clip (averaged over frames).
# This cell only reads the per-cue clips "<recording>/<i>.wav"; they must already
# exist on disk because nothing is exported here.
features = []
wavCount = len(filePaths_wav)

for idx, (wavPath, jsonPath) in enumerate(zip(filePaths_wav, filePaths_json)):
    # The context manager closes the JSON file; a bare open() left one handle
    # open per recording.
    with open(jsonPath, 'r') as jsonFile:
        timestamps = json.load(jsonFile)['mouthCues']

    # The full recording is not loaded or sliced here: the slices were never used,
    # since the features come from the clip files.
    clipDir = wavPath.replace('.wav', '')
    for i, cue in enumerate(timestamps):
        data, samplingRate = librosa.load(clipDir + '/' + str(i) + '.wav')
        mfccs = np.mean(librosa.feature.mfcc(y=data, sr=samplingRate, n_mfcc=40).T, axis=0)
        features.append([mfccs, cue['value']])

    # Report whenever another 5 % boundary has been crossed; integer arithmetic
    # avoids depending on a float percentage being an exact multiple of 5.
    done = idx + 1
    if done * 20 // wavCount != idx * 20 // wavCount:
        print(f"{done} ({100 * done / wavCount:.1f} %) files processed")

In [None]:
# Number of [mfcc_mean, cue_value] samples collected.
len(features)

In [None]:
# Two-column table: the MFCC vector of each clip and the mouth-cue value it belongs to.
featureColumns = ['features', 'mouthCue']
dataset = pd.DataFrame(data=features, columns=featureColumns)
dataset.head(n=5)

In [None]:
# Persist the table without the index column. Each `features` cell is a NumPy
# array, so the CSV holds its text form rather than separate numeric columns.
# NOTE(review): reading this file back yields strings in `features` - confirm
# whether any consumer has to parse them.
outputCsv = './dataset/extractedFeatures2.csv'
dataset.to_csv(outputCsv, index=False)

In [None]:
# Turn the two table columns into arrays: the stacked MFCC vectors and the labels.
featureRows = dataset['features'].tolist()
labelRows = dataset['mouthCue'].tolist()
X = np.array(featureRows)
y = np.array(labelRows)

In [None]:
# Shape of the feature matrix; the first model layer below is declared with input_shape=(40,).
X.shape

In [None]:
# One-hot encode the mouth-cue labels.
# The dtype is pinned because the default of get_dummies differs between pandas
# versions (uint8 before 2.0, bool from 2.0 on); float32 gives the training code
# the same numeric targets everywhere.
y_onehot = pd.get_dummies(y, dtype=np.float32)
# The column order is the class order of the one-hot vectors; keep it so that
# predicted class indices can be mapped back to mouth-cue values.
mouthCueClasses = list(y_onehot.columns)
y_new = y_onehot.to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# 60 % train / 20 % validation / 20 % test: hold out 40 % first, then split the
# hold-out in half. Both splits use the same fixed random_state.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y_new, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [None]:
# Shapes of the feature splits, shown in the order test, train, validation.
X_test.shape, X_train.shape, X_val.shape

In [None]:
# Shapes of the one-hot label splits, shown in the order train, test, validation.
y_train.shape, y_test.shape, y_val.shape

# Model Creation

In [None]:
import tensorflow as tf
import tensorflow.keras as keras

In [None]:
# Fully connected classifier: one MFCC vector in, one softmax probability per
# mouth-cue class out.
# The input width is read from the training data (as the output width already is)
# instead of being hard-coded to 40, so the model follows whatever n_mfcc the
# features were extracted with.
# NOTE(review): the first Dense layer has no activation, so it is a purely linear
# projection feeding the next layer - confirm this is intended.
hiddenUnits = (512, 256, 128, 64, 32)

model = keras.Sequential([
    # input projection
    keras.layers.Dense(1024, input_shape=(X_train.shape[1],)),
    # ReLU hidden layers of decreasing width
    *[keras.layers.Dense(units, activation='relu') for units in hiddenUnits],
    # output layer
    keras.layers.Dense(y_train.shape[1], activation='softmax'),
])
model.summary()

In [None]:
# One-hot targets, so categorical cross-entropy; Adam optimizer; accuracy is tracked.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Batch size and epoch count used by the model.fit call that follows.
batch_size, epochs = 32, 500

In [None]:
# Train the dense classifier; the validation split is passed as validation_data.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)

In [None]:
# Recurrent variant of the classifier.
#  * LSTM/GRU layers need 3-D input (batch, timesteps, features) while a Dense layer
#    on a flat vector yields 2-D output, so the Reshape presents the 1024-wide
#    projection as a sequence with a single timestep. Without it the stack cannot
#    be built.
#  * Every recurrent layer that feeds another sequence-consuming layer sets
#    return_sequences=True; only the last GRU collapses back to a 2-D tensor.
#  * The output uses softmax (as `model` does): categorical_crossentropy expects a
#    probability distribution, which tanh (range -1..1) does not produce.
#  * The input width is read from the training data rather than hard-coded to 40.
# NOTE(review): with a single timestep the recurrent layers see no temporal context;
# per-frame MFCCs would be needed for that - confirm the intent.
model2 = keras.Sequential([
    # first layer
    keras.layers.Dense(1024, input_shape=(X_train.shape[1],)),
    keras.layers.Reshape((1, 1024)),
    keras.layers.LSTM(512, activation='relu', return_sequences=True),
    keras.layers.LSTM(256, activation='relu', return_sequences=True),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.GRU(64, activation='relu', return_sequences=True),
    keras.layers.GRU(32, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(y_train.shape[1], activation='softmax'),
])

model2.summary()

In [None]:
# Same training setup as the dense model: categorical cross-entropy, Adam, accuracy.
model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Batch size and epoch count used by the model2.fit call that follows.
batch_size, epochs = 32, 200

In [None]:
# Train the recurrent variant; the validation split is passed as validation_data.
model2.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)