In [1]:
#load libraries
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.layers import Input, LeakyReLU, Dropout, BatchNormalization, TimeDistributed
from tensorflow.keras.layers import Conv2DTranspose, ConvLSTM2D, Conv2D
from tensorflow.keras.optimizers import Adam

import numpy as np
import matplotlib.pyplot as plt
from pydub import AudioSegment
from PIL import Image
import subprocess
import glob
import shutil
import os
import cv2
import threading
import time

from tqdm.notebook import tqdm as log_progress

# Source video whose frames and audio track are turned into the training set.
# NOTE: this is a machine-specific absolute path; point it at your own recording.
DATA_PATH = 'C:/Users/lukec/Videos/Test.mkv'

# Scratch directory for intermediate audio files; wiped and recreated every
# time this cell runs so stale clips from a previous run cannot leak in.
TEMP_PATH = 'temp/'
if os.path.isdir(TEMP_PATH):
    shutil.rmtree(TEMP_PATH)
os.mkdir(TEMP_PATH)

# Directory the pre-processed .npy arrays are saved to / loaded from.
DATASET_PATH = 'Dataset/'

VIDEO_WIDTH = 128
VIDEO_HEIGHT = 128
FPS = 30 #recorded fps of the input data
INPUT_FRAME_COUNT = 60 #frames per training sample (INPUT_FRAME_COUNT/FPS = 2 seconds of audio)

GEN_MORE_SAMPLES = False #USE ONLY IF YOU HAVE A TON OF MEMORY AND PROCESSING POWER!
                         #(creates more training samples by offsetting
                         #each training point by one frame instead of INPUT_FRAME_COUNT)
                         #This means that INPUT_FRAME_COUNT times more training samples will be generated

if GEN_MORE_SAMPLES:
    DATASET_PATH = 'Large Dataset/'

# np.save does not create missing directories, so make sure the chosen
# dataset folder exists before any later cell writes into it.
os.makedirs(DATASET_PATH, exist_ok=True)

In [7]:
#prep data

#strip the audio from the training video using ffmpeg
# The arguments are passed as a list (no shell) so a DATA_PATH containing
# spaces or shell metacharacters reaches ffmpeg intact. "-y" overwrites a
# stale audio.wav instead of stopping at ffmpeg's overwrite prompt, and
# check=True surfaces an ffmpeg failure here rather than later in pydub.
command = ["ffmpeg", "-y", "-i", DATA_PATH, "-ab", "160k", "-ac", "1", "-ar", "44100", "-vn", f"{TEMP_PATH}audio.wav"]
subprocess.run(command, check=True)

#load video frames from training video and dump them to a file
vidcap = cv2.VideoCapture(DATA_PATH)
frames = []
try:
    while True:
        success, image = vidcap.read()
        if not success:
            # End of stream, or the video could not be opened/decoded at all.
            break
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        if image.shape != (VIDEO_HEIGHT, VIDEO_WIDTH):
            # The model input is fixed-size; cv2.resize takes (width, height).
            image = cv2.resize(image, (VIDEO_WIDTH, VIDEO_HEIGHT), interpolation=cv2.INTER_AREA)
        # Add a trailing channel axis: (height, width) -> (height, width, 1).
        frames.append(np.reshape(image, (image.shape[0], image.shape[1], 1)))
finally:
    # Always free the decoder handle, even if a frame conversion raised.
    vidcap.release()

count = len(frames)  # number of frames read (kept for any cell that still refers to it)
if count == 0:
    raise RuntimeError(f"No frames could be read from {DATA_PATH!r}")

frames = np.array(frames)
frames = frames/255  # scale 8-bit pixel values to [0, 1]
os.makedirs(DATASET_PATH, exist_ok=True)  # np.save will not create the folder
np.save(f'{DATASET_PATH}videoframes.npy', frames)

#load audio from file, split into segments the same length as the input frames, save to temp folder
audio = AudioSegment.from_wav(f"{TEMP_PATH}audio.wav")
audio_seg_len = (INPUT_FRAME_COUNT/FPS)  # clip length in seconds

# One clip length is held back from the total duration before dividing.
num_audio_clips = int((audio.duration_seconds-audio_seg_len) / audio_seg_len)

print("Clipping audio...")
for clip_idx in log_progress(range(num_audio_clips)):
    # Build the (offset in seconds, output path) pairs for this clip index:
    # one pair per frame offset when GEN_MORE_SAMPLES is on, otherwise a
    # single un-shifted clip.
    if GEN_MORE_SAMPLES:
        clip_jobs = [(frame_idx*(1/FPS), f'{TEMP_PATH}{clip_idx}_{frame_idx}.wav')
                     for frame_idx in range(INPUT_FRAME_COUNT)]
    else:
        clip_jobs = [(0.0, f'{TEMP_PATH}{clip_idx}.wav')]

    for shift_seconds, wav_path in clip_jobs:
        start_seconds = (clip_idx * audio_seg_len) + shift_seconds
        end_seconds = start_seconds + audio_seg_len
        # The times are computed in seconds and converted to milliseconds
        # for the AudioSegment slice.
        segment = audio[start_seconds*1000:end_seconds*1000]
        segment.export(wav_path, format="wav")

print("Converting audio to spectrograms...")

#run arss on all of those audio files
# Start from an empty cache folder so it only ever holds images from this run.
cache_dir = "cache/"
if os.path.isdir(cache_dir):
    shutil.rmtree(cache_dir)
os.mkdir(cache_dir)

# Flag that arssThreadManager sets to False once all conversions have finished.
running = True

def arssThread(num, second_num=-1):
    """Convert one audio clip in TEMP_PATH to a spectrogram image in cache/.

    Runs the external `arss` tool and blocks until it exits.

    Args:
        num: index of the clip; selects `{TEMP_PATH}{num}.wav`.
        second_num: frame offset within the clip, or -1 (default) when the
            clip has no offset. When given, `{TEMP_PATH}{num}_{second_num}.wav`
            is read instead.

    The output image carries the same index suffix as the input wav
    (`cache/temp_{num}.png` or `cache/temp_{num}_{second_num}.png`), so the
    per-offset clips of one `num` no longer overwrite each other.
    """
    if second_num == -1:
        clip_id = f"{num}"
    else:
        clip_id = f"{num}_{second_num}"

    # -p is derived so that one clip (INPUT_FRAME_COUNT/FPS seconds) spans
    # VIDEO_WIDTH units; -y receives VIDEO_HEIGHT.
    # NOTE(review): presumably -p is pixels-per-second and -y the image height — confirm against arss docs.
    command = f"arss {TEMP_PATH}{clip_id}.wav cache/temp_{clip_id}.png -q -min 27 -max 19912 -p {int(VIDEO_WIDTH/(INPUT_FRAME_COUNT/FPS))} -y {VIDEO_HEIGHT}"
    subprocess.call(command, shell=True)

def _wait_for_thread_slot(max_threads):
    """Block until the global `threads` list holds fewer than `max_threads` entries.

    Finished threads are joined and dropped. The list is rebuilt and assigned
    back in place (slice assignment) instead of popping while iterating, which
    skipped the element following every removed thread.
    """
    while len(threads) >= max_threads:
        still_running = []
        for t in threads:
            if t.is_alive():
                still_running.append(t)
            else:
                t.join()
        threads[:] = still_running
        if len(threads) >= max_threads:
            time.sleep(0.1)


def arssThreadManager():
    """Run arssThread for every audio clip using a bounded pool of threads.

    Starts one thread per clip (or per clip/frame-offset pair when
    GEN_MORE_SAMPLES is set), never keeping more than 20 (70 with
    GEN_MORE_SAMPLES) in the global `threads` list at once. It then waits for
    the remaining threads and finally sets the global `running` flag to False.
    """
    global running

    max_threads = 70 if GEN_MORE_SAMPLES else 20
    try:
        for i in log_progress(range(num_audio_clips)):
            if GEN_MORE_SAMPLES:
                jobs = [(i, j) for j in range(INPUT_FRAME_COUNT)]
            else:
                jobs = [(i,)]

            for job_args in jobs:
                _wait_for_thread_slot(max_threads)
                x = threading.Thread(target=arssThread, args=job_args)
                x.start()
                threads.append(x)

        # Drain: wait until every outstanding conversion has finished.
        _wait_for_thread_slot(1)
    finally:
        # Clear the flag even if something above raised, so code polling
        # `running` cannot spin forever.
        running = False
    
    
spects = [] #spectrogram array
threads = []

manager = threading.Thread(target=arssThreadManager)
manager.start()

# Wait until every arss job has finished before touching the cache folder.
# Reading files while conversions were still running could pick up
# half-written images, and could stop before the last images were read.
# A short join timeout keeps the cell interruptible.
while manager.is_alive():
    manager.join(timeout=0.1)


def _spect_sort_key(path):
    """Return the integer index(es) embedded in a cache file name as a tuple.

    'cache/temp_12.png' -> (12,), 'cache/temp_12_3.png' -> (12, 3).
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    return tuple(int(part) for part in stem.split('_') if part.isdigit())


# Load in clip order (not glob/thread-completion order) so that spects[i]
# is the spectrogram of audio clip i and lines up with the video frames.
spect_files = sorted(glob.glob('cache/*.png'), key=_spect_sort_key)

expected_count = num_audio_clips * (INPUT_FRAME_COUNT if GEN_MORE_SAMPLES else 1)
if len(spect_files) != expected_count:
    print(f"WARNING: expected {expected_count} spectrograms but found {len(spect_files)}; "
          "spectrograms may not line up with the video frames")

for file in spect_files:
    #load temp image and save it to array
    img = cv2.imread(file)
    if img is None:
        # Previously hidden by a bare except; a missing image shifts every
        # later sample, so fail loudly instead.
        raise RuntimeError(f"Could not read spectrogram image {file!r}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # cv2.imread returns BGR channel order
    img = np.reshape(img, (img.shape[0], img.shape[1], 1))
    spects.append(img)

#clean up
shutil.rmtree('cache/')

spects = np.array(spects)
spects = spects/255  # scale 8-bit pixel values to [0, 1]
os.makedirs(DATASET_PATH, exist_ok=True)  # np.save will not create the folder
np.save(f'{DATASET_PATH}spectrograms.npy', spects)
print(spects.shape)
print("Done!")

Clipping audio...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=61.0), HTML(value='')))


Converting audio to spectrograms...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=61.0), HTML(value='')))


(61, 128, 128, 1)
Done!


In [2]:
#load pre-compiled data
# Read back the arrays written by the data-prep cell and report their shapes.
frames = np.load(DATASET_PATH + 'videoframes.npy')
spects = np.load(DATASET_PATH + 'spectrograms.npy')

for loaded in (frames, spects):
    print(loaded.shape)

(3770, 128, 128, 1)
(61, 128, 128, 1)


In [2]:
#make model (version 1)
# Two strided ConvLSTM2D layers reduce a clip of INPUT_FRAME_COUNT frames to a
# single feature map at 1/4 resolution; a stride-4 transposed convolution
# brings it back to full size and a final convolution emits one channel.
model = Sequential()
# Image arrays are (rows, cols) = (height, width), so height comes first.
model.add(ConvLSTM2D(256, input_shape=(INPUT_FRAME_COUNT, VIDEO_HEIGHT, VIDEO_WIDTH, 1), kernel_size=(7,7), strides=(2,2), padding='same', return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(ConvLSTM2D(256, kernel_size=(6,6), strides=(2,2), padding='same'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Conv2DTranspose(128, kernel_size=(5,5), strides=(4,4), padding='same'))
# binary_crossentropy (from_logits=False by default) expects predictions in
# [0, 1]; the targets are spectrogram pixels scaled to [0, 1], so squash the
# output with a sigmoid instead of leaving it linear.
model.add(Conv2D(1, kernel_size=(5,5), padding='same', activation='sigmoid'))

optimizer = Adam(learning_rate=0.0005)
model.compile(loss='binary_crossentropy', optimizer=optimizer)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_lst_m2d (ConvLSTM2D)    (None, 60, 64, 64, 256)   12896256  
_________________________________________________________________
dropout (Dropout)            (None, 60, 64, 64, 256)   0         
_________________________________________________________________
batch_normalization (BatchNo (None, 60, 64, 64, 256)   1024      
_________________________________________________________________
conv_lst_m2d_1 (ConvLSTM2D)  (None, 32, 32, 256)       18875392  
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 32, 256)       0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 32, 32, 256)       1024      
_________________________________________________________________
conv2d_transpose (Conv2DTran (None, 128, 128, 128)     8

In [4]:
#prep data
# Pair each spectrogram with the window of INPUT_FRAME_COUNT video frames it
# was generated from. Consecutive windows start one frame apart when
# GEN_MORE_SAMPLES is set, otherwise INPUT_FRAME_COUNT frames apart.
window_stride = 1 if GEN_MORE_SAMPLES else INPUT_FRAME_COUNT

# Only windows that contain a full INPUT_FRAME_COUNT frames are usable; a
# shorter trailing slice would make X ragged.
if len(frames) < INPUT_FRAME_COUNT:
    full_windows = 0
else:
    full_windows = (len(frames) - INPUT_FRAME_COUNT) // window_stride + 1

# Never index past the end of either array.
num_samples = min(len(spects), full_windows)
if num_samples < len(spects):
    print(f"WARNING: only {num_samples} of {len(spects)} spectrograms have a full window of frames")

X = []
Y = []
for i in range(num_samples):
    start_index = i * window_stride
    X.append(frames[start_index:start_index+INPUT_FRAME_COUNT])
    Y.append(spects[i])

X = np.array(X)
Y = np.array(Y)

print(X.shape)
print(Y.shape)

(61, 60, 128, 128, 1)
(61, 128, 128, 1)


In [None]:
#training
# Fit on the (frame window -> spectrogram) pairs and save the trained model.
hist = model.fit(X, Y, epochs=1, batch_size=1, verbose=1)
model.save("v1.keras")

# Plot the per-epoch training loss. A marker is drawn at every epoch because a
# one-point line (epochs=1) is otherwise invisible.
epoch_losses = hist.history['loss']
fig, ax = plt.subplots()
ax.plot(range(1, len(epoch_losses) + 1), epoch_losses, marker='o')
ax.set(title='Training loss', xlabel='Epoch', ylabel='Binary cross-entropy');

 1/61 [..............................] - ETA: 2:44:33 - loss: 0.5244