In [1]:
# select a GPU
import os
os.environ["CUDA_DEVICE_ORDER/"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
#imports 
import numpy as np
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import librosa
import soundfile as sound

import keras
import tensorflow
from keras.optimizers import SGD

from DCASE2019_network import model_resnet
from DCASE2019_improvised_network import model_resnet_new
from DCASE_training_functions import LR_WarmRestart, MixupGenerator

print("Librosa version = ",librosa.__version__)
print("Pysoundfile version = ",sound.__version__)
print("keras version = ",keras.__version__)
print("tensorflow version = ",tensorflow.__version__)

2024-07-10 17:57:49.603056: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Librosa version =  0.10.1
Pysoundfile version =  0.12.1
keras version =  3.3.3
tensorflow version =  2.16.1


In [3]:
WhichTask = '1a'
# WhichTask = '1b'
#WhichTask = '1c'

if WhichTask =='1a':
    ThisPath = '/work/aistwal/dataset_tau2019/extracted-files/TAU-urban-acoustic-scenes-2019-development/'
    # ThisPath = '../TAU-urban-acoustic-scenes-2019-development/'
    TrainFile = ThisPath + 'evaluation_setup/fold1_train.csv'
    ValFile = ThisPath + 'evaluation_setup/fold1_evaluate.csv'
    sr = 48000
    num_audio_channels = 2
elif WhichTask =='1b':
    ThisPath = '../Task1b/'
    TrainFile = ThisPath + 'evaluation_setup/fold1_train.csv'
    ValFile = ThisPath + 'evaluation_setup/fold1_evaluate.csv'
    sr = 44100
    num_audio_channels = 1
elif WhichTask =='1c':
    ThisPath = '../Task1c/'
    TrainFile = ThisPath + 'evaluation_setup/fold1_train.csv'
    sr = 44100
    num_audio_channels = 1
    
SampleDuration = 10

#log-mel spectrogram parameters
NumFreqBins = 128
NumFFTPoints = 2048
HopLength = int(NumFFTPoints/2)
NumTimeBins = int(np.ceil(SampleDuration*sr/HopLength))

#training parameters
max_lr = 0.1
# batch_size = 32
batch_size = 16
num_epochs = 510
mixup_alpha = 0.4
crop_length = 400

In [4]:
#load filenames and labels
dev_train_df = pd.read_csv(TrainFile,sep='\t', encoding='ASCII')
dev_val_df = pd.read_csv(ValFile,sep='\t', encoding='ASCII')
wavpaths_train = dev_train_df['filename'].tolist()
wavpaths_val = dev_val_df['filename'].tolist()
y_train_labels =  dev_train_df['scene_label'].astype('category').cat.codes.values
y_val_labels =  dev_val_df['scene_label'].astype('category').cat.codes.values

ClassNames = np.unique(dev_train_df['scene_label'])
NumClasses = len(ClassNames)

y_train = keras.utils.to_categorical(y_train_labels, NumClasses)
y_val = keras.utils.to_categorical(y_val_labels, NumClasses)

In [5]:
ClassNames

array(['airport', 'bus', 'metro', 'metro_station', 'park',
       'public_square', 'shopping_mall', 'street_pedestrian',
       'street_traffic', 'tram'], dtype=object)

In [5]:
y_train[0] #One hot encoding the categories or scene classes

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [6]:
# #load wav files and get log-mel spectrograms, deltas, and delta-deltas
# def deltas(X_in):
#     X_out = (X_in[:,:,2:,:]-X_in[:,:,:-2,:])/10.0
#     X_out = X_out[:,:,1:-1,:]+(X_in[:,:,4:,:]-X_in[:,:,:-4,:])/5.0
#     return X_out

LM_train = np.zeros((len(wavpaths_train),NumFreqBins,NumTimeBins,num_audio_channels),'float32')
for i in range(len(wavpaths_train)):
    stereo,fs = sound.read(ThisPath + wavpaths_train[i],stop=SampleDuration*sr)
    for channel in range(num_audio_channels):
        if len(stereo.shape)==1:
            stereo = np.expand_dims(stereo,-1)
        LM_train[i,:,:,channel]= librosa.feature.melspectrogram(y=stereo[:,channel], 
                                       sr=sr,
                                       n_fft=NumFFTPoints,
                                       hop_length=HopLength,
                                       n_mels=NumFreqBins,
                                       fmin=0.0,
                                       fmax=sr/2,
                                       htk=True,
                                       norm=None)

LM_train = np.log(LM_train+1e-8)
# LM_deltas_train = deltas(LM_train)
# LM_deltas_deltas_train = deltas(LM_deltas_train)
# LM_train = np.concatenate((LM_train[:,:,4:-4,:],LM_deltas_train[:,:,2:-2,:],LM_deltas_deltas_train),axis=-1)

In [7]:
LM_train.shape

(9185, 128, 469, 2)

In [13]:
LM_val = np.zeros((len(wavpaths_val),NumFreqBins,NumTimeBins,num_audio_channels),'float32')
for i in range(len(wavpaths_val)):
    stereo,fs = sound.read(ThisPath + wavpaths_val[i],stop=SampleDuration*sr)
    for channel in range(num_audio_channels):
        if len(stereo.shape)==1:
            stereo = np.expand_dims(stereo,-1)
        LM_val[i,:,:,channel]= librosa.feature.melspectrogram(y=stereo[:,channel], 
                                       sr=sr,
                                       n_fft=NumFFTPoints,
                                       hop_length=HopLength,
                                       n_mels=NumFreqBins,
                                       fmin=0.0,
                                       fmax=sr/2,
                                       htk=True,
                                       norm=None)

LM_val = np.log(LM_val+1e-8)
LM_deltas_val = deltas(LM_val)
LM_deltas_deltas_val = deltas(LM_deltas_val)
LM_val = np.concatenate((LM_val[:,:,4:-4,:],LM_deltas_val[:,:,2:-2,:],LM_deltas_deltas_val),axis=-1)

In [14]:
LM_val.shape

(4185, 128, 461, 6)

In [11]:
# #SAVE THE NUMPY ARRAYS SINCE THE PROCESSING OF ALL AUDIOS TAKE TIME
# from numpy import save, load

# save('train.npy', LM_train)
# save('val.npy', LM_val)

In [6]:
# from numpy import save, load
# #Loading the train and val numpy arrays
# LM_train = load('train.npy')
# LM_val = load('val.npy')

# print("Verifying shapes of loaded numpy arrays: ", LM_train.shape, LM_val.shape)

# #Train : (9185, 128, 461, 6), Val : (4185, 128, 461, 6)

Verifying shapes of loaded numpy arrays:  (9185, 128, 461, 6) (4185, 128, 461, 6)


In [15]:
# #create and compile the model
# model = model_resnet(NumClasses,
#                      input_shape =[NumFreqBins,None,3*num_audio_channels], 
#                      num_filters =24,
#                      wd=1e-3)
model = model_resnet_new(NumClasses,
                     input_shape =[NumFreqBins,None,3*num_audio_channels])
model.compile(loss='categorical_crossentropy',
              optimizer =SGD(learning_rate=max_lr,decay=0, momentum=0.9, nesterov=False),
              metrics=['accuracy'])

model.summary()

2024-07-02 13:02:22.508269: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 39317 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:41:00.0, compute capability: 8.6


In [None]:
# dot_img_file = 'asc_resnet_model.png'
# keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True, dpi=64)

In [None]:
# #for implementing warm restarts in learning rate
# class LR_WarmRestartNew(keras.callbacks.Callback):
    
#     def __init__(self,nbatch,initial_lr,min_lr,epochs_restart,Tmult):
#         self.initial_lr = initial_lr
#         self.min_lr = min_lr
#         self.epochs_restart = epochs_restart
#         self.nbatch = nbatch
#         self.currentEP=0
#         self.startEP=0
#         self.Tmult=Tmult
        
#     def on_epoch_begin(self, epoch, logs={}):
#         if epoch+1<self.epochs_restart[0]:
#             self.currentEP = epoch
#         else:
#             self.currentEP = epoch+1
            
#         if np.isin(self.currentEP,self.epochs_restart):
#             self.startEP=self.currentEP
#             self.Tmult=2*self.Tmult
        
#     def on_epoch_end(self, epochs, logs={}):
#         lr = K.get_value(self.model.optimizer.lr)
#         print ('\nLearningRate:{:.6f}'.format(lr))
    
#     def on_batch_begin(self, batch, logs={}):
#         pts = self.currentEP + batch/self.nbatch - self.startEP
#         decay = 1+np.cos(pts/self.Tmult*np.pi)
#         lr = self.min_lr+0.5*(self.initial_lr-self.min_lr)*decay
#         # K.set_value(self.model.optimizer.lr,lr)
#         self.model.optimizer.learning_rate.assign(lr)

In [16]:
#set learning rate schedule
lr_scheduler = LR_WarmRestart(nbatch=np.ceil(LM_train.shape[0]/batch_size), Tmult=2,
                              initial_lr=max_lr, min_lr=max_lr*1e-4,
                              epochs_restart = [3.0, 7.0, 15.0, 31.0, 63.0,127.0,255.0,511.0]) 
callbacks = [lr_scheduler]

#create data generator
TrainDataGen = MixupGenerator(LM_train, 
                              y_train, 
                              batch_size=batch_size,
                              alpha=mixup_alpha,
                              crop_length=crop_length)()

for X, y in TrainDataGen:
    print(type(X),type(y[0]))
    print(X.shape,y[0].shape)
    print(y[0])
    break
type(TrainDataGen)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(16, 128, 400, 6) (10,)
[0.         0.61300043 0.         0.         0.         0.
 0.         0.         0.         0.38699957]


generator

In [17]:
# LM_val

In [18]:
# y_val

In [19]:
#train the model
history = model.fit(x = TrainDataGen,
                    validation_data=(LM_val, y_val),
                    epochs=num_epochs, 
                    verbose=1, 
                    callbacks=callbacks,
                    steps_per_epoch=int(np.ceil(LM_train.shape[0]/batch_size))) 

Epoch 1/2


I0000 00:00:1719918151.175293 1744535 service.cc:145] XLA service 0x7fd990040e30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1719918151.175369 1744535 service.cc:153]   StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6
2024-07-02 13:02:31.416867: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-02 13:02:32.062277: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8906
2024-07-02 13:04:16.837951: W external/local_tsl/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 39.06GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-02 13:04:17.838152: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Tryi

[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.3492 - loss: 50.4292

2024-07-02 13:25:34.077662: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng38{k2=0,k13=2,k14=3,k18=0,k22=0,k23=0} for conv (f32[32,1536,64,58]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,768,66,117]{3,2,1,0}, f32[1536,768,3,3]{3,2,1,0}), window={size=3x3 stride=1x2}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convForward", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...
2024-07-02 13:25:34.093743: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 1.016304104s
Trying algorithm eng38{k2=0,k13=2,k14=3,k18=0,k22=0,k23=0} for conv (f32[32,1536,64,58]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,768,66,117]{3,2,1,0}, f32[1536,768,3,3]{3,2,1,0}), window={size=3x3 stride=1x2}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convForward", backend_config={"oper


LearningRate:0.050142
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1980s[0m 3s/step - accuracy: 0.3493 - loss: 50.3992 - val_accuracy: 0.3622 - val_loss: 13.0864
Epoch 2/2
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5210 - loss: 10.9372
LearningRate:0.000010
[1m575/575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1447s[0m 3s/step - accuracy: 0.5210 - loss: 10.9353 - val_accuracy: 0.6179 - val_loss: 8.5928


In [20]:
model.save('DCASE_' + WhichTask + '_Task_development_1_test.h5')

