In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import librosa
import json
from pydub import AudioSegment
from pathlib import PurePath
import os




In [2]:
# Collect the path of every .flac utterance listed in the chapter transcripts.
df = pd.read_csv('./dataset/test-clean.csv')
new_df = df[['subset', 'reader_id', 'chapter_id']].to_numpy()

filePaths = []

for raw_subset, reader, chapter in new_df:
    split_name = raw_subset.strip()
    chapter_dir = f'./dataset/{split_name}/LibriSpeech/{split_name}/{reader}/{chapter}/'
    transcript_path = chapter_dir + f'{reader}-{chapter}.trans.txt'
    with open(transcript_path, 'r') as transcript:
        # The first space-separated token of each transcript line is the utterance id.
        filePaths.extend(
            chapter_dir + row.split(' ')[0] + '.flac' for row in transcript
        )

In [3]:
# Number of .flac utterance paths collected from the transcript files.
total = len(filePaths)
total

2620

In [4]:
# Derive the companion .wav and .json path for every .flac utterance path.
filePaths_wav = []
filePaths_json = []
for flac_path in filePaths:
    filePaths_wav.append(flac_path.replace('.flac', '.wav'))
    filePaths_json.append(flac_path.replace('.flac', '.json'))

In [5]:
# Build one [mean-MFCC vector, mouth-cue label] pair per mouth-cue segment.
#
# For every utterance the JSON file supplies the list of mouth cues; segment i
# of an utterance is read from '<wav path without .wav>/<i>.wav'.
#
# NOTE(review): the recorded output of this cell contains a line from librosa's
# `filters.mel(...)` call, which looks like the tail of a warning. Presumably
# this is "Empty filters detected in mel frequency basis" caused by the small
# n_fft=256 relative to the default number of mel bands; TODO confirm and
# consider passing an explicit `n_mels`.
features = []
n_files = len(filePaths_wav)

for idx in range(n_files):
    # Context manager closes the JSON file right after parsing; the previous
    # bare open() left one unclosed handle per utterance.
    with open(filePaths_json[idx], 'r') as jsonFile:
        jsonData = json.load(jsonFile)
    timestamps = jsonData['mouthCues']

    # Directory holding the per-cue clips of this utterance (loop-invariant).
    segmentDir = filePaths_wav[idx].replace('.wav', '')
    for i in range(len(timestamps)):
        data, samplingRate = librosa.load(segmentDir + '/' + str(i) + '.wav')
        # Average the MFCC frames over time so each clip yields a fixed 40-value vector.
        mfccs = np.mean(librosa.feature.mfcc(y=data, sr=samplingRate, n_mfcc=40, n_fft=256, hop_length=64).T, axis=0)
        features.append([mfccs, timestamps[i]['value']])

    # Progress line; fires only when the percentage is an exact multiple of 5.
    if(100 * (idx+1)/n_files % 5 == 0):
        print(f"{idx+1} ({100 * (idx+1)/n_files} %) files processed")

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


131 (5.0 %) files processed
262 (10.0 %) files processed
393 (15.0 %) files processed
524 (20.0 %) files processed
655 (25.0 %) files processed
786 (30.0 %) files processed
917 (35.0 %) files processed
1048 (40.0 %) files processed
1179 (45.0 %) files processed
1310 (50.0 %) files processed
1441 (55.0 %) files processed
1572 (60.0 %) files processed
1703 (65.0 %) files processed
1834 (70.0 %) files processed
1965 (75.0 %) files processed
2096 (80.0 %) files processed
2227 (85.0 %) files processed
2358 (90.0 %) files processed
2489 (95.0 %) files processed
2620 (100.0 %) files processed


In [6]:
# One row per mouth-cue segment: 'features' holds the mean-MFCC vector,
# 'mouthCue' holds the cue label taken from the JSON file.
dataset = pd.DataFrame(features, columns=['features', 'mouthCue'])
dataset.head()

Unnamed: 0,features,mouthCue
0,"[-805.66187, 47.363415, -81.06073, 50.632145, ...",X
1,"[-713.58356, 29.172215, -66.59457, 73.78922, -...",B
2,"[-656.05664, 27.86562, -81.64793, 103.50443, -...",A
3,"[-539.1748, 84.53672, -111.58639, 38.384792, -...",C
4,"[-587.18823, 25.168352, -52.31743, 35.61607, -...",B


In [7]:
# Class distribution of the mouth-cue labels before any resampling.
dataset.mouthCue.value_counts()

mouthCue
B    38132
C    26101
A    11711
E     9594
F     8475
X     7862
G     5604
D     3332
H     2784
Name: count, dtype: int64

In [8]:
from imblearn.over_sampling import SMOTE

# Oversample the minority mouth-cue classes with SMOTE.
# random_state is pinned (same seed as the train/test splits) so that the
# synthetic samples, and therefore every downstream result, are reproducible.
#
# NOTE(review): resampling is done on the full dataset, before the
# train/validation/test split. Synthetic points interpolated from real
# neighbours can then land in the evaluation sets, which presumably inflates
# the reported accuracy. TODO: split first and resample only the training part.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_sm, y_sm = smote.fit_resample(dataset['features'].tolist(), dataset['mouthCue'].tolist())

In [9]:
# Convert the resampled labels to a NumPy array.
y_sm = np.array(y_sm)

In [10]:
# Convert the resampled feature vectors to a NumPy array.
X_sm = np.array(X_sm)

In [11]:
# Sanity check: shape of the resampled feature matrix.
X_sm.shape

(343188, 40)

In [12]:
# Class counts of the resampled labels.
pd.DataFrame(y_sm).value_counts()

A    38132
B    38132
C    38132
D    38132
E    38132
F    38132
G    38132
H    38132
X    38132
Name: count, dtype: int64

In [13]:
# Shape of a single stored feature vector (the row labelled 3).
dataset.features[3].shape

(40,)

In [14]:
# Train on the SMOTE-resampled arrays. (The un-resampled alternative would be
# built from dataset.features / dataset.mouthCue instead.)
X, y = X_sm, y_sm

In [15]:
# Sanity check: shape of the label array.
y.shape

(343188,)

In [16]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

In [17]:
# Class names in one-hot column order; used later to decode predicted indices.
one_hot_columns = pd.get_dummies(y).columns
mouthCues = one_hot_columns.tolist()
mouthCues

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'X']

In [18]:
# One-hot encode the labels (one column per mouth-cue class).
y_new = pd.get_dummies(y).to_numpy()

In [19]:
# Sanity check: shape of the one-hot label matrix.
y_new.shape

(343188, 9)

In [20]:
# Sanity check: shape of the feature matrix.
X.shape

(343188, 40)

In [21]:
from sklearn.model_selection import train_test_split

# Stratified 70 / 15 / 15 split: first carve off a 30 % holdout, then halve
# that holdout into validation and test parts.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y_new, test_size=0.3, random_state=42, stratify=y_new
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
)

In [22]:
# Sanity check: feature shapes of the test, train and validation splits.
X_test.shape, X_train.shape, X_val.shape

((51479, 40), (240231, 40), (51478, 40))

In [23]:
# Sanity check: label shapes of the test, train and validation splits.
y_test.shape, y_train.shape, y_val.shape

((51479, 9), (240231, 9), (51478, 9))

In [24]:
# Model dimensions: one output unit per one-hot column, and each feature
# vector is presented to Conv1D as (n_features, 1), i.e. a single channel.
n_features = X_train.shape[1]
num_classes = y_train.shape[1]
inputShape = (n_features, 1)

In [25]:
# X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1], 1))
# X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1], 1))

In [26]:
# Classifier: a Conv1D front-end over the feature axis, two dense layers, a
# GRU applied to a length-1 sequence, a dense funnel and a softmax output.
model = keras.Sequential([
    keras.layers.InputLayer(input_shape=inputShape),
    # Convolution + pooling over the coefficient axis.
    keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dense(256, activation='relu'),
    # Add a time axis of length 1 so the GRU receives a 3-D tensor.
    keras.layers.Reshape((1, 256)),
    keras.layers.GRU(256, return_sequences=True, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='sigmoid'),
    keras.layers.Dense(32, activation='relu'),
    # Drop the length-1 time axis again before the classification layer.
    keras.layers.Reshape((32,)),
    keras.layers.Dense(num_classes, activation='softmax'),
])

model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 38, 32)            128       
                                                                 
 max_pooling1d (MaxPooling1  (None, 19, 32)            0         
 D)                                                              
                                                                 
 batch_normalization (Batch  (None, 19, 32)            128       
 Normalization)                                                  
                                                                 
 flatten (Flatten)           (None, 608)               0         
                                                                 
 dense (Dense)               (None, 512)               311808    
                                                                 
 dense_1 (Dense)             (None, 256)              

In [27]:
# Multi-class setup: one-hot targets with categorical cross-entropy, Adam optimizer.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)




In [35]:
# Training hyper-parameters passed to model.fit.
batch_size, epochs = 64, 15

In [36]:
# Train on the training split and monitor the validation split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x27389f72f50>

In [37]:
# Loss and accuracy on the held-out test split.
model.evaluate(x=X_test, y=y_test)



[0.6686773896217346, 0.808951199054718]

In [38]:
# List the stems (text before the first '.') of the entries in ./saved_models/.
# The directory is only created by the save step, so on a fresh run it may not
# exist yet; show an empty list in that case instead of raising
# FileNotFoundError.
savedModelsDir = './saved_models/'
savedModelStems = (
    [entry.split('.')[0] for entry in os.listdir(savedModelsDir)]
    if os.path.isdir(savedModelsDir)
    else []
)
savedModelStems

['1', '2', '3', '4', '5', '6', '7', '8']

In [39]:
# Save the trained model as ./saved_models/<version>.keras, where <version> is
# one more than the highest numeric version already present.
savedModelsDir = './saved_models/'
# Creates the directory when missing and is a no-op when it already exists.
os.makedirs(savedModelsDir, exist_ok=True)

# Only purely numeric stems count as versions. Other entries (e.g. '.gitkeep',
# 'notes.txt') are ignored; previously they made int() raise ValueError.
existingVersions = [
    int(stem)
    for stem in (entry.split('.')[0] for entry in os.listdir(savedModelsDir))
    if stem.isdecimal()
]
# Numbering starts at 1 whether the directory was empty or has just been
# created (previously a freshly created directory produced version 0).
modelVersion = max(existingVersions, default=0) + 1

model.save(f"./saved_models/{modelVersion}.keras")

In [40]:
# Predict class probabilities: once for a single-sample batch built from the
# first test row, then for the whole test split.
first_sample_batch = np.expand_dims(X_test[0], axis=0)
predictions_temp = model.predict(first_sample_batch)
predictions = model.predict(X_test)



In [41]:
# Decode each prediction row: take the index of its largest value and look up
# the matching class name.
predicted_indices = predictions.argmax(axis=1)
predicted_mouthCues = [mouthCues[class_idx] for class_idx in predicted_indices]