In [1]:
import numpy as np
import librosa
import soundfile as sf
from pydub import AudioSegment
import pandas as pd



# Function Definitions

In [2]:
def readAudio(filename):
    """Load an audio file through librosa, resampled to 16 kHz.

    Parameters
    ----------
    filename : path of the audio file handed to ``librosa.load``.

    Returns
    -------
    (samples, sample_rate) exactly as produced by ``librosa.load``.
    """
    samples, sample_rate = librosa.load(filename, sr=16000)
    return samples, sample_rate

#calculate spectrogram
def calc_spec(x):
    """Compute the log-power STFT spectrogram of a signal.

    The STFT uses a 1024-point FFT, a 1024-sample Hann window and a hop of
    512 samples. The squared magnitude is converted to decibels with
    ``librosa.power_to_db`` relative to the spectrogram's own maximum
    (``ref=np.max``).

    Parameters
    ----------
    x : audio samples passed to ``librosa.stft``.

    Returns
    -------
    The dB-scaled power spectrogram returned by ``librosa.power_to_db``.
    """
    n_fft = 1024
    hop_length = 512
    win_length = 1024
    # np.complex256 only exists on platforms with a 128-bit long double, so
    # referencing it raises AttributeError elsewhere. np.clongdouble is the
    # same type wherever complex256 exists and is defined on every platform.
    stft_dtype = np.clongdouble
    magnitude = np.abs(
        librosa.stft(
            x,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window='hann',
            dtype=stft_dtype,
        )
    )
    return librosa.power_to_db(magnitude**2, ref=np.max)

def saveSpectrogram(X, outfilename):
    """Persist a spectrogram array to disk with ``np.save``.

    Parameters
    ----------
    X : array to store.
    outfilename : destination path; its last four characters must be '.npy'.

    Raises
    ------
    AssertionError if ``outfilename`` does not end in '.npy'.
    """
    extension = outfilename[-4:]
    assert extension == '.npy'  # output file name extension should be .npy
    np.save(outfilename, X)

def readSpectrogram(infilename):
    """Load a spectrogram previously stored as a .npy file.

    Parameters
    ----------
    infilename : path handed to ``np.load``.

    Returns
    -------
    Whatever ``np.load`` returns for that file.
    """
    return np.load(infilename)

def conv_to_mono(inFile,outFile):
    """Read a WAV file with pydub, reduce it to one channel and export it as WAV.

    Parameters
    ----------
    inFile : path of the source WAV file.
    outFile : path the single-channel WAV is exported to.
    """
    mono = AudioSegment.from_wav(inFile).set_channels(1)
    mono.export(outFile, format="wav")

def store_to_npy(inFile, outFile):
    """Load an audio file at 16 kHz and store its samples as a .npy file.

    Parameters
    ----------
    inFile : audio file read with ``librosa.load(..., sr=16000)``.
    outFile : destination path; its last four characters must be '.npy'.

    Raises
    ------
    AssertionError if ``outFile`` does not end in '.npy' (checked after the
    audio has been loaded).
    """
    samples, _ = librosa.load(inFile, sr=16000)
    assert '.npy' == outFile[-4:]  # output file name extension should be .npy
    np.save(outFile, samples)

def read_from_npy(inFile):
    """Return the array stored in the .npy file ``inFile`` (via ``np.load``)."""
    return np.load(inFile)

def write_npy_to_wav(x, outFile):
    """Write samples ``x`` to ``outFile`` with soundfile at 16 kHz, subtype PCM_24.

    Parameters
    ----------
    x : sample array passed to ``sf.write`` as the data.
    outFile : destination audio file path.
    """
    sf.write(outFile, x, samplerate=16000, subtype='PCM_24')

def start_time(lamb_1= 1.71111):
    """Draw a gap length from an exponential distribution, rejecting values below 1.

    Samples come from ``np.random.exponential`` with scale ``1/lamb_1`` (global
    NumPy random state); draws are repeated until one is not smaller than 1.

    Parameters
    ----------
    lamb_1 : rate of the exponential distribution.

    Returns
    -------
    The first accepted draw.
    """
    scale = 1/lamb_1
    while True:
        candidate = np.random.exponential(scale)
        if candidate < 1:
            continue  # too short, draw again
        return candidate

def end_time(lamb_2 = 1.13264):
    """Draw a segment length from an exponential distribution, kept within [1, 5].

    Samples come from ``np.random.exponential`` with scale ``1/lamb_2`` (global
    NumPy random state); draws below 1 or above 5 are rejected and redrawn.

    Parameters
    ----------
    lamb_2 : rate of the exponential distribution.

    Returns
    -------
    The first accepted draw.
    """
    scale = 1/lamb_2
    while True:
        candidate = np.random.exponential(scale)
        if candidate < 1 or candidate > 5:
            continue  # outside the accepted range, draw again
        return candidate

def time_to_index(t,sr):
    """Convert a time ``t`` to a sample index at rate ``sr``, rounding up.

    Returns ``int(ceil(t * sr))``.
    """
    sample_position = np.ceil(t*sr)
    return int(sample_position)

def add_noise(input_array, noise_deviation):
    """Return ``input_array`` plus zero-mean Gaussian noise.

    The noise is drawn with ``np.random.normal`` (global NumPy random state),
    standard deviation ``noise_deviation``, in the shape of ``input_array``.
    The input is not modified; a new array is returned.
    """
    return input_array + np.random.normal(
        loc=0, scale=noise_deviation, size=input_array.shape
    )

def convert_to_segments(source_array, dest_dir, name,class_name):
    """Cut one long recording into 10 s clips with randomly placed segments.

    Each clip starts as 160000 zeros (10 s at 16 kHz). Segments of the source
    are copied in at random onsets: the gap before a segment comes from
    ``start_time()`` and its length from ``end_time()``, clipped at 10 s.
    Gaussian noise (std 0.001) is added and the clip is written to
    ``dest_dir + name + <file number> + ".wav"``. One label row
    (filename, onset, offset, class) is recorded per copied segment and all
    rows are written to ``dest_dir + 'labels.csv'``.

    The global NumPy random state is re-seeded with 0 on every call.

    Parameters
    ----------
    source_array : 1-D array of source samples (consumed front to back).
    dest_dir : output directory prefix; it is concatenated as a plain string,
        so it must already end with a path separator.
    name : file name stem for the generated clips and label rows.
    class_name : value written to the 'class' column of every label row.

    Returns
    -------
    Number of clip files written. Generation stops after 599 files with an
    error message if the source has not been exhausted by then.
    """
    sr = 16000
    time_duration = 10
    max_files = 600
    noise_stdd = 0.001
    N = sr*time_duration
    mLen = source_array.shape[0]
    np.random.seed(0)

    # Label rows are collected in a list and turned into a DataFrame once.
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    label_rows = []

    def _write_labels():
        labels = pd.DataFrame(label_rows, columns=['filename','onset','offset','class'])
        labels.to_csv(dest_dir+'labels.csv', index = False)

    source_count = 0
    file_num = 1
    # generating the audio files of a single class
    while source_count < mLen:
        samples_to_write = np.zeros(N)
        t1 = 0
        while t1 < time_duration:
            start = t1 + start_time()
            if start >= time_duration:
                break  # no room left in this clip
            end = start + end_time()
            t1 = end if end < time_duration else time_duration

            n_samples = int(np.ceil((t1 - start) *sr))
            if source_count+n_samples < mLen:
                first = time_to_index(start,sr)
                # One sample fewer than n_samples is copied: both bounds are
                # rounded up, so the full length could run past the clip end.
                samples_to_write[first:first+n_samples-1] = source_array[source_count:(source_count+n_samples-1)]
                label_rows.append({'filename':name+str(file_num),'onset':start,'offset':t1,'class':class_name})
            # The source position advances even when nothing was copied, so
            # the outer loop ends once the source is used up.
            source_count = source_count+n_samples

        samples_to_write = add_noise(samples_to_write, noise_stdd)
        write_npy_to_wav(samples_to_write,dest_dir+name+str(file_num)+".wav")
        file_num = file_num+1
        if file_num >= max_files:
            _write_labels()
            print("Some error has occured, trying to write to too many files.")
            return file_num-1

    _write_labels()
    print("Completed. Number of files  written to -", file_num-1)
    return file_num-1

def convert_to_mixed_segments(source_array1, source_array2, dest_dir, name):
    """Build 10 s clips whose segments are randomly taken from two recordings.

    Works like ``convert_to_segments``, but for every segment a coin flip
    (``np.round(np.random.rand()) == 0``) selects the source:
    ``source_array1`` labelled 'music' or ``source_array2`` labelled 'speech'.
    Each clip starts as 160000 zeros, receives the segments, gets Gaussian
    noise (std 0.001) and is written to
    ``dest_dir + name + <file number> + ".wav"``. One label row
    (filename, onset, offset, class) is recorded per copied segment and all
    rows are written to ``dest_dir + 'labels.csv'``.

    The global NumPy random state is re-seeded with 0 on every call.

    Parameters
    ----------
    source_array1 : 1-D array of samples labelled 'music'.
    source_array2 : 1-D array of samples labelled 'speech'.
    dest_dir : output directory prefix; it is concatenated as a plain string,
        so it must already end with a path separator.
    name : file name stem for the generated clips and label rows.

    Returns
    -------
    Number of clip files written. Generation continues until both sources are
    exhausted and stops after 599 files with an error message otherwise.
    """
    sr = 16000
    time_duration = 10
    max_files = 600
    noise_stdd = 0.001
    N = sr*time_duration
    mLen = source_array1.shape[0]
    sLen = source_array2.shape[0]
    np.random.seed(0)

    # Label rows are collected in a list and turned into a DataFrame once.
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    label_rows = []

    def _write_labels():
        labels = pd.DataFrame(label_rows, columns=['filename','onset','offset','class'])
        labels.to_csv(dest_dir+'labels.csv', index = False)

    source_count1 = 0
    source_count2 = 0
    file_num = 1
    # generating the audio files of music and speech
    while source_count1 < mLen or source_count2 < sLen:
        samples_to_write = np.zeros(N)
        t1 = 0
        while t1 < time_duration:
            start = t1 + start_time()
            if start >= time_duration:
                break  # no room left in this clip
            end = start + end_time()
            t1 = end if end < time_duration else time_duration

            n_samples = int(np.ceil((t1 - start) *sr))
            first = time_to_index(start,sr)
            # One sample fewer than n_samples is copied: both bounds are
            # rounded up, so the full length could run past the clip end.
            if np.round(np.random.rand())==0:
                class_name = 'music'
                if source_count1+n_samples < mLen:
                    samples_to_write[first:first+n_samples-1] = source_array1[source_count1:(source_count1+n_samples-1)]
                    label_rows.append({'filename':name+str(file_num),'onset':start,'offset':t1,'class':class_name})
                # Advances even when nothing was copied, so the outer loop
                # can terminate.
                source_count1 = source_count1+n_samples
            else:
                class_name = 'speech'
                if source_count2+n_samples < sLen:
                    samples_to_write[first:first+n_samples-1] = source_array2[source_count2:(source_count2+n_samples-1)]
                    label_rows.append({'filename':name+str(file_num),'onset':start,'offset':t1,'class':class_name})
                source_count2 = source_count2+n_samples

        samples_to_write = add_noise(samples_to_write, noise_stdd)
        write_npy_to_wav(samples_to_write,dest_dir+name+str(file_num)+".wav")
        file_num = file_num+1
        if file_num >= max_files:
            _write_labels()
            print("Some error has occured, trying to write to too many files.")
            return file_num-1

    _write_labels()
    print("Completed. Number of files written to -", file_num-1)
    return file_num-1

def conv_to_frame(t,H):
    """Map a time ``t`` onto a frame index for a clip of ``H`` frames.

    Returns ``int(ceil(t * H / 10))``; the divisor 10 presumably is the clip
    length in seconds used elsewhere in this notebook (confirm).
    """
    frame_position = np.ceil((t*H)/10)
    return int(frame_position)

def convToLinear(X):
    """Return ``exp(X / 10)`` element-wise.

    NOTE(review): this uses the natural exponential. If ``X`` holds decibel
    values as produced by ``librosa.power_to_db`` (base-10), the exact inverse
    would be ``10 ** (X / 10)``. Confirm which scale is intended before
    changing it; existing stored features depend on the current formula.
    """
    scaled = X/10
    return np.exp(scaled)
    
def stftToLogMel(X):
    """Convert a stored log spectrogram into a log-mel spectrogram.

    Steps, as implemented:
      1. ``exp(X / 10)`` turns the log spectrogram into a linear one.
      2. ``librosa.feature.melspectrogram`` projects it onto the mel scale
         (sr 16000, n_fft 1024, hop 512, Hann window of 1024).
      3. The absolute value is squared and passed to ``librosa.power_to_db``.

    No MFCCs are computed; the result is a dB-scaled mel spectrogram.

    NOTE(review): step 1 uses the natural exponential, while ``calc_spec``
    produces its values with ``librosa.power_to_db`` (base-10 dB). Confirm
    this is intended before changing it; stored training features depend on
    the current formula.

    Parameters
    ----------
    X : log spectrogram array, e.g. as saved by ``wavToSTFT``.

    Returns
    -------
    The array returned by ``librosa.power_to_db``.
    """
    fs = 16000
    n_fft = 1024
    hop_length = 512
    win_length = 1024
    # np.complex256 only exists on platforms with a 128-bit long double, so
    # referencing it raises AttributeError elsewhere. np.clongdouble is the
    # same type wherever complex256 exists and is defined on every platform.
    mel_dtype = np.clongdouble
    X_linear = np.exp(X/10)
    mel_spectrogram = librosa.feature.melspectrogram(
        y=None,
        S=X_linear,
        sr=fs,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window='hann',
        dtype=mel_dtype,
    )
    X_logMel = librosa.power_to_db(np.square(np.abs(mel_spectrogram)))
    return X_logMel

def wavToSTFT(inFile, outFile):
    """Read a WAV file, compute its log-power spectrogram and save it as .npy.

    The STFT parameters live in ``calc_spec``; this function only chains
    ``readAudio`` -> ``calc_spec`` -> ``saveSpectrogram``.

    Parameters
    ----------
    inFile : path of the audio file to read.
    outFile : destination path; ``saveSpectrogram`` requires it to end in '.npy'.
    """
    samples, _ = readAudio(inFile)
    saveSpectrogram(calc_spec(samples), outFile)

def createSTFT(dir_wavs, filenum, dir_specs):
    """Convert numbered WAV files 1..filenum into spectrogram .npy files.

    For each number ``i`` the input is ``dir_wavs + str(i) + '.wav'`` and the
    output is ``dir_specs + str(i) + '.npy'``; both arguments are therefore
    path prefixes that include the file name stem.
    """
    for number in range(1, int(filenum) + 1):
        suffix = str(number)
        wavToSTFT(dir_wavs+suffix+'.wav', dir_specs+suffix+'.npy')

    print("Completed conversion to spectrograms")

def reshape_3D_mat(x):
    """Flatten a 3-D array into a 2-D stack of vectors along its middle axis.

    For every index ``a`` of axis 0 and, inside that, every index ``c`` of
    axis 2, the vector ``x[a, :, c]`` becomes one row of the result. An input
    of shape (A, B, C) therefore yields shape (A*C, B).
    """
    rows = [
        x[outer, :, inner]
        for outer in range(x.shape[0])
        for inner in range(x.shape[2])
    ]
    return np.array(rows)


def _create_labelled_dataset(df, filenum, file, fixed_row=None):
    """Shared implementation of createDataSpeech / createDataMusic / createDataBoth.

    For each file number 1..filenum the spectrogram ``file + str(n) + ".npy"``
    is loaded and converted with ``stftToLogMel``. A (3, H) integer label
    matrix is built for it, where H is the number of feature frames:
    row 0 = music, row 1 = speech, row 2 = neither. Row 2 starts at 1 and is
    cleared wherever a label row marks frames.

    Label rows are consumed in table order: each file takes the next run of
    consecutive rows sharing one 'filename' value. The filename is not
    compared with the file number, so the table must contain the runs in file
    order with at least one row per file (as the segment writers normally
    produce). Once the table is exhausted, remaining files stay unlabelled;
    the earlier code re-applied the last row to them.

    Parameters
    ----------
    df : label table with 'filename', 'onset', 'offset' columns (and 'class'
        when ``fixed_row`` is None).
    filenum : number of spectrogram files to read.
    file : path prefix of the spectrogram files.
    fixed_row : label row to mark for every segment, or None to pick the row
        from the 'class' column ('music' -> 0, anything else -> 1).

    Returns
    -------
    (X, Y) : stacked features and stacked (3, H) label matrices.
    """
    filenames = df['filename'].tolist()
    onsets = df['onset'].tolist()
    offsets = df['offset'].tolist()
    classes = df['class'].tolist() if fixed_row is None else None
    n_rows = len(filenames)

    X = []
    Y = []
    k = 0  # index of the next label row not yet consumed
    for i in range(int(filenum)):
        features = stftToLogMel(read_from_npy(file+str(i+1)+".npy"))
        H = features.shape[1]
        X.append(features)

        buffer = np.zeros((3, H), dtype=int)
        buffer[2, :] = 1  # every frame starts as "neither"

        if k < n_rows:
            current_name = filenames[k]
            while k < n_rows and filenames[k] == current_name:
                first = conv_to_frame(onsets[k], H)
                last = conv_to_frame(offsets[k], H)
                if fixed_row is not None:
                    row = fixed_row
                elif classes[k] == 'music':
                    row = 0
                else:
                    row = 1
                buffer[row, first:last] = 1
                buffer[2, first:last] = 0
                k += 1
        Y.append(buffer)

    return np.array(X), np.array(Y)


def createDataSpeech(df, filenum, file):
    """Build features and labels for clips where every segment is speech.

    All labelled frames are marked in row 1 of the (3, H) label matrix; the
    'class' column of ``df`` is not consulted.

    Returns
    -------
    (X, Y) : stacked log-mel features and label matrices, one per file.
    """
    return _create_labelled_dataset(df, filenum, file, fixed_row=1)


def createDataMusic(df, filenum, file):
    """Build features and labels for clips where every segment is music.

    All labelled frames are marked in row 0 of the (3, H) label matrix; the
    'class' column of ``df`` is not consulted.

    Returns
    -------
    (X, Y) : stacked log-mel features and label matrices, one per file.
    """
    return _create_labelled_dataset(df, filenum, file, fixed_row=0)


def createDataBoth(df, filenum, file):
    """Build features and labels for clips mixing music and speech segments.

    Frames of a segment are marked in row 0 when its 'class' is 'music' and
    in row 1 otherwise.

    Returns
    -------
    (X, Y) : stacked log-mel features and label matrices, one per file.
    """
    return _create_labelled_dataset(df, filenum, file, fixed_row=None)

# main function

In [3]:
if __name__ == "__main__":
    
    # Overview of the staged preprocessing pipeline. The text below is a bare
    # string expression (not a comment), so it has no effect at run time.
    '''The segments of code not used are commented below. The data preprocessing was done in stages, saving the files
    between intermediate stages  so as to reduce computation times and file load times.
    FIRST STAGE - 
    Converting stereo files to mono files.
    
    SECOND STAGE - 
    Storing the information in converted .wav files to .npy files as loading and working with .npy files is faster.
    
    THIRD STAGE - 
    Converting the hour long files to 10s samples and storing as .wav files. Here we also generated the timestamps
    and stored in .csv files.
    
    FOURTH STAGE - 
    Generate the STFTs for all the .wav files and store them as .npy files.
    
    FIFTH STAGE - 
    Load the STFTs, convert to Log Melspectrograms and concatenate to create the test data corresponding to Music Speech 
    and mixed files. Here we also generated the labels using the .csv file stored in stage 3.
    
    SIXTH STAGE - 
    The converted files are finally concatenated to form X_train and Y_train and are stored as .npy files. These files
    are given as input to the model for training.'''
    
    # FIRST and SECOND STAGE (disabled): stereo -> mono conversion and caching
    # of the raw samples as .npy files.
    # conv_to_mono("./Training/wav_files/speech3.wav","./Training/wav_files/speech3_mono.wav")
    # store_to_npy("music1_mono.wav","music1.npy")
    # store_to_npy("./Training/wav_files/speech3_mono.wav","./Training/speech3.npy")
    # music1 = read_from_npy("./Training/music1.npy")
    # speech1 = read_from_npy("./Training/speech3.npy")
    # temp = music1[0:159999]
    # write_npy_to_wav(temp,"music_1.wav")
    
    # Split of each recording: the first 2/3 feeds the single-class clips and
    # the remainder feeds the mixed clips.
    # NOTE(review): the commented print below refers to msmLen / mssLen, which
    # are not defined here (bmLen / bsLen are) - fix before re-enabling.
    # mLen = int(np.ceil(2/3*music1.shape[0]))  # number of samples for only music
    # sLen = int(np.ceil(2/3*speech1.shape[0])) # number of samples for only speech
    # bmLen = music1.shape[0] - mLen            # number of samples of music for both music and speech
    # bsLen = speech1.shape[0] - sLen           # number of samples of speech for both music and speech
    # print(mLen,sLen, msmLen, mssLen)
    # Directory layout: the *_dir values are string prefixes that are joined by
    # plain concatenation, hence the trailing slashes.
    wav_dir = './labels_wav/'
    spectrogram_dir = './labels_spectrogram/'
    music_dir = "music/"
    speech_dir = "speech/"
    both_dir = "both/"
    dirname = spectrogram_dir+speech_dir
    # THIRD STAGE (disabled): cut the recordings into 10 s clips plus labels.csv
    # and remember how many clips were written per category.
    #temp = music1[0:mLen-1]
    #music_file_num = convert_to_segments(temp, wav_dir+music_dir, "music_noisy",'music') 
    #temp = speech1[0:sLen-1]
    #speech_file_num = convert_to_segments(temp, wav_dir+speech_dir,"speech_noisy",'speech')
    
    #temp1 = music1[mLen:]
    #temp2 = speech1[sLen:]
    #both_file_num = convert_to_mixed_segments(temp1,temp2,wav_dir+both_dir,"music+speech_noisy")
    #np.save('./file_counts.npy', np.array([music_file_num, speech_file_num, both_file_num]))
    
    
    # Clip counts saved by the third stage, in the order music, speech, both.
    arr = np.load('./file_counts.npy')
    music_file_num = arr[0]
    speech_file_num = arr[1]
    both_file_num = arr[2]
    
    # FOURTH STAGE (disabled): one spectrogram .npy per clip.
    #createSTFT(wav_dir+music_dir+'/music_noisy', music_file_num, spectrogram_dir+music_dir+'/music_noisy')
    #createSTFT(wav_dir+speech_dir+'/speech_noisy', speech_file_num, spectrogram_dir+speech_dir+'/speech_noisy')
    #createSTFT(wav_dir+both_dir+'/music+speech_noisy', both_file_num, spectrogram_dir+both_dir+'/both_noisy')
    # FIFTH STAGE (disabled, kept as a bare string): build the per-category
    # feature and label arrays and cache them under ./Training/.
    # NOTE(review): the music arrays below are built with createDataSpeech, which
    # marks labelled frames in the speech row (row 1); createDataMusic marks
    # row 0. Confirm which was intended before re-running this stage.
    '''labels_dir_speech = './labels_wav/speech/labels.csv'
    labels_dir_music = './labels_wav/music/labels.csv'
    labels_dir_both = './labels_wav/both/labels.csv'
    df1 = pd.read_csv(labels_dir_speech)
    file = spectrogram_dir+speech_dir+"speech_noisy"
    X_speech, Y_speech = createDataSpeech(df1,speech_file_num, file)
    df2 = pd.read_csv(labels_dir_music)
    file = spectrogram_dir+music_dir+"music_noisy"
    X_music, Y_music = createDataSpeech(df2,music_file_num, file)
    df3 = pd.read_csv(labels_dir_both)
    file = spectrogram_dir+both_dir+"both_noisy"
    X_both, Y_both = createDataBoth(df3, both_file_num, file)
    np.save('./Training/X_speech.npy',X_speech)
    np.save('./Training/Y_speech.npy',Y_speech)
    np.save('./Training/X_music.npy',X_music)
    np.save('./Training/Y_music.npy',Y_music)
    np.save('./Training/X_both.npy',X_both)
    np.save('./Training/Y_both.npy',Y_both)'''
    # SIXTH STAGE (active): load the cached per-category arrays.
    X_speech = np.load('./Training/X_speech.npy')
    Y_speech = np.load('./Training/Y_speech.npy')
    X_music = np.load('./Training/X_music.npy')
    Y_music = np.load('./Training/Y_music.npy')
    X_both = np.load('./Training/X_both.npy')
    Y_both = np.load('./Training/Y_both.npy')
    
    print(X_speech.shape)
    print(X_music.shape)
    print(X_both.shape)
    # Stack the three categories along the file axis, speech first.
    X_train = np.concatenate((X_speech, X_music, X_both), axis=0)
    print(X_train.shape)
    Y_train = np.concatenate((Y_speech, Y_music, Y_both), axis=0)
    # Swap the last two label axes so the three class channels come last.
    Y_train = np.swapaxes(Y_train,2,1)
    print(Y_train.shape)
    # Final training arrays consumed by the model.
    np.save('./Training/X_train.npy',X_train)
    np.save('./Training/Y_train.npy',Y_train)

(497, 128, 313)
(592, 128, 313)
(587, 128, 313)
(1676, 128, 313)
(1676, 313, 3)


# Test Data Creation

In [27]:
def createTestInputSet(test_path):
    """Build the feature array for the ten mock-test spectrograms.

    Loads ``test_path + 'test_sample-' + str(i) + '.npy'`` for i = 0..9,
    passes each through ``convToLinear`` and then ``stftToLogMel``, and stacks
    the results into one array.

    NOTE(review): ``stftToLogMel`` already applies ``exp(X/10)`` itself, so the
    test samples are exponentiated twice, whereas the training path calls
    only ``stftToLogMel``. Confirm that the test files are stored on a
    different scale; otherwise the two paths are inconsistent.
    """
    features = []
    for index in range(10):
        sample_path = test_path+'test_sample-'+str(index)+'.npy'
        linear = convToLinear(np.load(sample_path))
        features.append(stftToLogMel(linear))

    return np.array(features)

In [28]:
# Build the mock-test feature array from the stored spectrograms and cache it.
test_path = './evaluation/mocktest_set/spectrogram/'
x = createTestInputSet(test_path)
print(x.shape)
np.save(file='./evaluation/mocktest_set/xdemo.npy', arr=x)

(10, 128, 313)
