In [9]:
import audio 
import librosa
import numpy as np

In [10]:
%%bash
# Peek at the first few entries of the training wav directory to see how the files are named.
ls /import/c4dm-datasets/SpeakerRecognitionDatasets/ASVSpoof2017/ASVspoof2017_train_dev/wav/train/ | head

T_1000001.wav
T_1000002.wav
T_1000003.wav
T_1000004.wav
T_1000005.wav
T_1000006.wav
T_1000007.wav
T_1000008.wav
T_1000009.wav
T_1000010.wav


In [11]:
# Directory holding the training wav files. The trailing slash matters: file names are appended by plain string concatenation.
base='/import/c4dm-datasets/SpeakerRecognitionDatasets/ASVSpoof2017/ASVspoof2017_train_dev/wav/train/'


In [12]:
# Full path of one training utterance (the first file in the directory listing), used as the example input.
t1=base+'T_1000001.wav'

In [30]:
def update_audio_samples(fs, samples, threshold, remove=0.1, removeFlag=False):
    '''
    Trim or loop a 1-D sample array so that it lasts exactly `threshold` seconds.

    Inputs:
    fs         : sampling rate of `samples` in Hz
    samples    : 1-D array of audio samples (loading is done by the caller)
    threshold  : target duration in seconds; may be fractional
    remove     : duration in seconds to drop from the start when removeFlag is set.
                 Default is 100ms, i.e. 1600 raw samples at 16 kHz.
    removeFlag : when True, discard the first `remove` seconds before resizing

    Output:
    array holding exactly round(threshold * fs) samples. Longer input is cut
    after that many samples; shorter input is repeated from its start until the
    target length is reached. Input that already has the target length is
    returned unchanged.

    Raises:
    ValueError if the signal is empty (possibly after the removal step) while a
    non-zero target length was requested, because there is nothing to repeat.
    '''
    # If removeFlag is set then remove samples from start
    if removeFlag:
        samples = samples[int(remove * fs):]

    # Work in whole samples from here on: a float cannot be used as a slice
    # bound, and comparing integer lengths avoids any division.
    threshold_samples = int(round(threshold * fs))
    available = len(samples)

    if available == threshold_samples:
        return samples

    if available > threshold_samples:
        # just take threshold_samples and ignore the rest
        return samples[:threshold_samples]

    if available == 0:
        raise ValueError('cannot extend an empty signal to %d samples' % threshold_samples)

    # Repeat the signal just often enough to cover the target, then cut to the
    # exact length. -(-a // b) is ceil(a / b) in pure integer arithmetic.
    repeats = -(-threshold_samples // available)
    return np.tile(samples, repeats)[:threshold_samples]

def compute_spectrogram(input_type, filename, fft_size=512, win_size=512, hop_size=160, duration=1):
    '''
    Load an audio file and return its short-time Fourier transform, time-major.

    Inputs:
    input_type : 'mag_spec' gives the log power spectrogram, with the power
                 floored at 1e-7 before the log is taken.
                 'mel_spec' and 'cqt_spec' are not implemented yet: they print a
                 notice and, like any other value, yield the raw complex STFT.
    filename   : path of the audio file; it is loaded with sr=None as float32
    fft_size   : FFT length in samples
    win_size   : analysis window length in samples
    hop_size   : hop between successive frames in samples
    duration   : length in seconds the signal is trimmed/looped to before the
                 STFT; 0 keeps the signal exactly as loaded

    Output:
    2-D array in time x frequency layout (the transpose of the STFT matrix)
    '''
    samples, sr = librosa.load(filename, sr=None, dtype=np.float32)

    # Truncate or append samples based on duration
    if duration != 0:
        samples = update_audio_samples(sr, samples, duration)

    # Take the FFT. The sizes are passed by keyword because recent librosa
    # releases accept only the signal positionally; keywords work on old and
    # new versions alike.
    D = librosa.stft(samples, n_fft=fft_size, hop_length=hop_size, win_length=win_size)

    if input_type == 'mag_spec':  # power magnitude spectrogram
        D = np.log(np.maximum(np.abs(D) ** 2, 1e-7))
    elif input_type == 'mel_spec':
        print('to do for mel spectrogram code')
    elif input_type == 'cqt_spec':
        print('to do for cqt spectrogram code')

    # Return the spectrogram matrix in time x frequency format by taking the transpose
    return np.transpose(D)

In [33]:
# duration=0 skips the trim/pad step, so the spectrogram covers the whole clip as loaded.
s = compute_spectrogram('mag_spec', t1, fft_size=512, win_size=512, hop_size=160, duration=0)

In [34]:
# The result is transposed before it is returned, so the shape reads (time frames, frequency bins).
print(s.shape)

(243, 257)
