In [1]:
import os
import configparser

# Parse the run settings from configuration.txt (resolved against the working directory).
configFilePath = r'configuration.txt'
configParser = configparser.RawConfigParser()
configParser.read(configFilePath)

# AMD/ROCm overrides stored in the [COMMON] section.
insert_amd_env_vars = configParser.getint('COMMON', 'insert_amd_env_vars')
HSA_OVERRIDE_GFX_VERSION = configParser.get('COMMON', 'HSA_OVERRIDE_GFX_VERSION')
ROCM_PATH = configParser.get('COMMON', 'ROCM_PATH')

# Export the overrides only when the flag is non-zero; this happens before
# TensorFlow is imported further down in this cell.
if insert_amd_env_vars:
    os.environ.update({
        "HSA_OVERRIDE_GFX_VERSION": HSA_OVERRIDE_GFX_VERSION,
        "ROCM_PATH": ROCM_PATH,
    })


import subprocess
import os
import pathlib
import configparser
import sqlite3 as sl
import cv2
from pydub import AudioSegment
import math
import pickle
import shutil
import time
import multiprocessing
from multiprocessing import Process,Queue
import itertools
from threading import Thread
import soundfile as sf
from deepface import DeepFace
import tensorflow as tf
# Turn on memory growth for every GPU that TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)


start_time = time.time()    # wall-clock reference (seconds) for measuring execution time


print("PLEASE EDIT configuration.txt BEFORE EXECUTION")
print(".wav files might be generated in path. The program will automatically delete them. If execuetion stops unexpectedly, please delete them yourself")


# --- Settings read from configuration.txt ---
datasetPathVideo = configParser.get('COMMON', 'datasetPathVideo')
datasetPathAudio = configParser.get('extractAudio', 'datasetPathAudio')
p = configParser.get('extractAudio', 'dbChunk')
# Seconds that delFiles() sleeps before removing temporary audio files.
ttwbdf = configParser.getint('extractAudio', 'time_to_wait_before_deleting_files')
cuda = configParser.getint('COMMON', 'cuda')
cpus = configParser.getint('COMMON', 'cpus')
# TODO: make the database file name ("dataset.db") configurable too
datasetPathDatabase = configParser.get('COMMON', 'datasetPathDatabase') + '/dataset.db'
model_weights_path = configParser.get('finetune_wav2vec2', 'model_weights_path')

print("Video dataset at " + datasetPathVideo)
print("Number of cpus to use for multiprocessing : ", cpus)


# Shared SQLite connection; check_same_thread=False allows use from threads
# other than the one that created it.
con = sl.connect(datasetPathDatabase, check_same_thread=False)
print('------------------- ABOUT TO START --------------------')

# --- Fixed data / model dimensions ---
REQUIRED_SAMPLE_RATE = 16000
FACE_EMBEDDING_SIZE = 2622
AUDIO_MAX_LEN = 246000

# --- Training settings from the [finetune_wav2vec2] section ---
NO_OF_VIDEOS = configParser.getint('finetune_wav2vec2', 'no_of_videos')
BATCH_SIZE = configParser.getint('finetune_wav2vec2', 'batch_size')
save_freq = configParser.getint('finetune_wav2vec2', 'save_freq')
no_of_epochs = configParser.getint('finetune_wav2vec2', 'no_of_epochs')


def read_audio_file(file_path):
    """Read an audio file with soundfile and return its samples.

    Raises:
        ValueError: if the file's sample rate is not REQUIRED_SAMPLE_RATE.
    """
    with open(file_path, "rb") as audio_file:
        samples, sample_rate = sf.read(audio_file)

    # Guard-clause form: return on the expected rate, fail otherwise.
    if sample_rate == REQUIRED_SAMPLE_RATE:
        return samples
    raise ValueError(
        f"sample rate (={sample_rate}) of your files must be {REQUIRED_SAMPLE_RATE}"
    )

 
def extractAudio(row):
    """Extract the audio of one video and return it as a waveform.

    The audio track is written as a mono, 16 kHz, 16-bit PCM WAV file under
    ``datasetPathAudio`` (mirroring the video's location under
    ``datasetPathVideo``). For each target length in ``audio_length_list`` a
    clip of that many seconds is produced: truncated when the source is longer
    than the target, otherwise looped and then truncated. All generated files
    are handed to ONE background ``delFiles`` thread once processing is over.

    Args:
        row: fetched database rows; ``row[0][1]`` is used as the video path.

    Returns:
        The samples read from the clip of the LAST length in
        ``audio_length_list`` (24 s).

    Raises:
        Whatever ``read_audio_file`` raises (for instance when ffmpeg failed to
        produce a clip); ffmpeg exit codes themselves are not checked.
    """
    absPathVideo = row[0][1]   # path of this one video

    # Mirror the video path under the audio root and drop the extension.
    absPathAudio_w = os.path.splitext(
        absPathVideo.replace(datasetPathVideo, datasetPathAudio)
    )[0]
    absPathAudio = absPathAudio_w + "_audio.wav"  # full path to the extracted audio

    # Create the destination directory.
    pathlib.Path(os.path.dirname(absPathAudio)).mkdir(parents=True, exist_ok=True)

    # Commands are passed as argument lists (no shell), so paths containing
    # quotes, spaces or shell metacharacters are handled safely.
    ffmpeg = ["ffmpeg", "-nostats", "-loglevel", "0", "-y"]

    # Everything created below is collected here and deleted by a single
    # delayed thread. Starting one thread per loop iteration (each listing the
    # base audio file) could delete that file while later iterations still
    # needed it.
    filesToDelete = [absPathAudio]
    audio_wave = None

    try:
        # Extract mono 16 kHz PCM audio into absPathAudio.
        subprocess.call(ffmpeg + [
            "-i", absPathVideo,
            "-acodec", "pcm_s16le", "-ab", "160k", "-ac", "1", "-ar", "16000", "-vn",
            absPathAudio,
        ])

        # Original duration in whole seconds (measured on the video file itself).
        audio = AudioSegment.from_file(absPathVideo)
        audio_length_og = math.floor(audio.duration_seconds)

        # NOTE(review): only the waveform of the last length is returned; the
        # shorter clips are still generated to keep the previous file behavior.
        audio_length_list = [6, 12, 24]
        for audio_length in audio_length_list:
            path_var_len_audio = absPathAudio_w + "audio" + str(audio_length) + "s.wav"            # fixed-length clip
            path_var_len_audio_temp = absPathAudio_w + "audio_temp" + str(audio_length) + "s.wav"  # looped intermediate

            # The basename entry is kept from the previous implementation: it
            # targets a same-named file in the working directory, if any.
            filesToDelete += [
                path_var_len_audio,
                os.path.basename(path_var_len_audio),
                path_var_len_audio_temp,
            ]

            if audio_length_og > audio_length:
                # Long enough: truncate the extracted audio directly.
                clip_source = absPathAudio
            else:
                # Too short: loop the audio up to the target length first.
                # ffmpeg options are kept exactly as before.
                twoDigitLenStr = f"{audio_length:02}"
                subprocess.call(ffmpeg + [
                    "-stream_loop", "-1",
                    "-i", absPathAudio,
                    "-t", "00:00:" + twoDigitLenStr + ".000",
                    "-codec:a", "aac", "-f", "wav", "-c", "copy",
                    path_var_len_audio_temp,
                ])
                clip_source = path_var_len_audio_temp

            # Truncate to exactly audio_length seconds.
            subprocess.call(ffmpeg + [
                "-ss", "0", "-t", str(audio_length),
                "-i", clip_source,
                path_var_len_audio,
            ])

            audio_wave = read_audio_file(path_var_len_audio)
    finally:
        # Schedule cleanup even when a step above failed, so temp files do not leak.
        Thread(target=delFiles, args=(filesToDelete,)).start()

    return audio_wave
        
           




            
# Function to delete audio temp files
def delFiles(filesToDelete):
    """Sleep for ttwbdf seconds, then remove each listed file.

    Files that cannot be removed (OSError) are skipped silently.
    """
    time.sleep(ttwbdf)  # grace period before cleanup
    for stale_path in filesToDelete:
        try:
            os.remove(stale_path)
        except OSError:
            continue
        


from random import randint

def get_video(offset, max_retries=100):
    """Fetch one (audio waveform, face embedding) training pair.

    Selects the VIDEO/FACE row at ``offset``, extracts the audio with
    ``extractAudio`` and computes the face embedding with
    ``DeepFace.represent``. On any failure a new random offset in
    ``[1, NO_OF_VIDEOS - 1]`` is tried.

    Args:
        offset: row offset into the VIDEO/FACE join (Python or NumPy integer).
        max_retries: how many additional random offsets to try after the first
            failure. Retrying is a bounded loop rather than unbounded
            recursion, so a persistent failure (e.g. an unreachable database)
            ends with a clear error instead of a RecursionError.

    Returns:
        A tuple ``(audio, embedding)`` of float64 tensors; the audio is cut to
        at most AUDIO_MAX_LEN samples.

    Raises:
        RuntimeError: if every attempt failed; the last error is chained.
    """
    last_error = None
    for _attempt in range(max_retries + 1):
        try:
            # Bound parameter instead of string concatenation. int() is needed
            # because offsets drawn from a NumPy array are NumPy integers,
            # which sqlite3 does not bind as parameters.
            data = con.execute(
                """SELECT V.ID, V.VIDEO_PATH, F.FACE_PATH FROM VIDEO V
                        INNER JOIN FACE F ON F.ID = V.ID
                        LIMIT 1 OFFSET ?""",
                (int(offset),),
            )

            dataGotten = data.fetchall()
            if not dataGotten:
                raise ValueError('No video was fetched')

            audio_wave = extractAudio(dataGotten)

            # dataGotten[0][2] is the FACE_PATH column of the fetched row.
            embedding_objs = DeepFace.represent(dataGotten[0][2], enforce_detection=False)
            return (
                tf.constant(audio_wave[:AUDIO_MAX_LEN], dtype=tf.float64),
                tf.constant(embedding_objs[0]['embedding'], dtype=tf.float64),
            )
        except Exception as e:
            last_error = e
            # Report the cause too; otherwise failures are impossible to diagnose.
            print('Error getting video, retrying ...:' + str(offset) + ' (' + repr(e) + ')')
            offset = randint(1, NO_OF_VIDEOS - 1)

    raise RuntimeError(
        'Could not fetch a video after ' + str(max_retries + 1) + ' attempts'
    ) from last_error


    








2024-03-09 16:09:16.102312: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-09 16:09:16.139096: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


PLEASE EDIT configuration.txt BEFORE EXECUTION
.wav files might be generated in path. The program will automatically delete them. If execuetion stops unexpectedly, please delete them yourself
Video dataset at /media/gamal/Passport/Datasets/VoxCeleb2/Voxceleb2Video
Number of cpus to use for multiprocessing :  8
------------------- ABOUT TO START --------------------


2024-03-09 16:09:16.915781: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-03-09 16:09:16.930459: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-03-09 16:09:16.930567: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [2]:
import numpy as np



def data_generator(n):
    """Yield up to ``n`` training pairs from ``get_video`` in random offset order.

    Offsets are the integers ``1 .. NO_OF_VIDEOS - 2`` (``np.arange`` excludes
    its stop value), shuffled once per call.
    NOTE(review): offsets 0 and NO_OF_VIDEOS - 1 are never drawn here; confirm
    that this is intended.

    Args:
        n: maximum number of samples to yield. It may arrive as a NumPy value
            when called through ``tf.data.Dataset.from_generator``.

    Yields:
        Whatever ``get_video`` returns for each selected offset.
    """
    result_array = np.arange(1, NO_OF_VIDEOS - 1)
    np.random.shuffle(result_array)

    # Only len(result_array) offsets exist. Indexing up to n - 1 raised
    # IndexError whenever n exceeded that (e.g. n == NO_OF_VIDEOS), so the
    # count is capped, and floored at 0 so a negative n yields nothing.
    sample_count = max(0, min(int(n), len(result_array)))

    for offset in result_array[:sample_count]:
        yield get_video(int(offset))



# Smoke test: pull 100 samples through the generator and print each one.
preview_samples = data_generator(100)
for x in preview_samples:
    print(x)

In [3]:
import os

import tensorflow as tf
import tensorflow_hub as hub
from wav2vec2 import Wav2Vec2Config

# Default configuration object from the wav2vec2 package imported above.
config = Wav2Vec2Config()

# Record the TensorFlow version in the cell output.
tf_version = tf.__version__
print("TF version:", tf_version)


TF version: 2.11.0


In [4]:
# wav2vec2 encoder loaded from TF Hub; trainable=True so it is fine-tuned.
WAV2VEC2_HUB_URL = "https://tfhub.dev/vasudevgupta7/wav2vec2/1"
pretrained_layer = hub.KerasLayer(WAV2VEC2_HUB_URL, trainable=True)


2024-03-09 16:09:18.335981: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-03-09 16:09:18.336119: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-03-09 16:09:18.336186: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-03-09 16:09:19.203491: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-03-09 16:09:19.203618: I tensorflow/compiler/xla/stream_executo

In [5]:
# Functional graph: raw audio -> pretrained encoder -> average pooling -> flatten
# -> linear projection to FACE_EMBEDDING_SIZE values.
inputs = tf.keras.Input(shape=(AUDIO_MAX_LEN,))
hidden_states = pretrained_layer(inputs)

pooling_layer = tf.keras.layers.AveragePooling1D(pool_size=50)
pooled_output = pooling_layer(hidden_states)

flatten_layer = tf.keras.layers.Flatten()
flatten_output = flatten_layer(pooled_output)

projection_layer = tf.keras.layers.Dense(FACE_EMBEDDING_SIZE, activation='linear')
outputs = projection_layer(flatten_output)


In [6]:

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Run one random batch through the model as a forward-pass check, then print the summary.
dummy_batch = tf.random.uniform(shape=(BATCH_SIZE, AUDIO_MAX_LEN))
model(dummy_batch)
model.summary()


2024-03-09 16:09:22.911042: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 246000)]          0         
                                                                 
 keras_layer (KerasLayer)    (None, 768, 768)          94371712  
                                                                 
 average_pooling1d (AverageP  (None, 15, 768)          0         
 ooling1D)                                                       
                                                                 
 flatten (Flatten)           (None, 11520)             0         
                                                                 
 dense (Dense)               (None, 2622)              30208062  
                                                                 
Total params: 124,579,774
Trainable params: 124,579,774
Non-trainable params: 0
_______________________________________________

In [7]:
from wav2vec2 import CTCLoss

LEARNING_RATE = 5e-5


# Mean absolute error is the active loss. The CTC loss from the wav2vec2
# package is kept below for reference only:
#loss_fn = CTCLoss(config, (BATCH_SIZE, AUDIO_MAX_LEN), division_factor=BATCH_SIZE)
loss_fn = tf.keras.losses.MeanAbsoluteError()
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)


In [8]:
# Per-sample signature (before batching): one waveform and its target face embedding.
# The shapes are written as real 1-tuples. `(None)` without a trailing comma is
# just `None`, which declares an unknown-rank tensor and gives up all shape checks.
output_signature = (
    tf.TensorSpec(shape=(AUDIO_MAX_LEN,), dtype=tf.float64),        # audio samples
    tf.TensorSpec(shape=(FACE_EMBEDDING_SIZE,), dtype=tf.float64),  # face embedding
)

# NO_OF_VIDEOS is forwarded to data_generator as its `n` argument.
dataset = tf.data.Dataset.from_generator(
    data_generator,
    args=[NO_OF_VIDEOS],
    output_signature=output_signature,
)




In [9]:
# Group samples into batches of BATCH_SIZE, then prefetch with an autotuned buffer.
dataset = dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)



In [10]:
# Number of whole batches in NO_OF_VIDEOS samples; training takes that many batches.
num_train_batches = int(NO_OF_VIDEOS / BATCH_SIZE)
train_dataset = dataset.take(count=num_train_batches)


In [11]:
# Resume from a previous checkpoint when one can be loaded; otherwise continue
# with the model as built above.
try:
    model.load_weights(model_weights_path)
    print('Loaded weights')
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and the reason is shown rather than discarded.
    print('Could not load weights, using random ones.')
    print('Reason: ' + repr(load_error))

Loaded weights


In [12]:
# Attach the optimizer and the loss function to the model.
model.compile(optimizer=optimizer, loss=loss_fn)


In [13]:
# Checkpoint callback: writes weights only to model_weights_path, at the
# frequency given by save_freq from the configuration.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_weights_path,
    save_freq=save_freq,
    save_weights_only=True,
)


In [14]:
# Train for the configured number of epochs, checkpointing through the callback.
history = model.fit(
    train_dataset,
    epochs=no_of_epochs,
    callbacks=[model_checkpoint_callback],
)
history.history

Epoch 1/10


    148/Unknown - 1530s 10s/step - loss: 0.0072Error getting video, retrying ...:847439
    448/Unknown - 5185s 12s/step - loss: 0.0071

KeyboardInterrupt: 