In [None]:
import os 
import numpy as np
import torch
import torchaudio
from torchaudio.transforms import Spectrogram
from scipy.io import wavfile
import librosa

# Notebook to process audio into STFT 

We pre-process the binaural RIRs from SoundSpaces 1.0 for faster training. 
We downsample the audio from 44100 Hz to 22050 Hz and compute the Short-Time Fourier Transform (STFT), of which only the magnitude is saved.

## Utils

The code is similar to that of the NAF (Neural Acoustic Fields) repository: https://github.com/aluo-x/Learning_Neural_Acoustic_Fields/blob/master/data_loading/data_maker.ipynb

In [None]:
def load_audio(path_name, use_torch=True, resample=False, resample_rate=22050, clip=True):
    """Load an audio file as a NumPy array, optionally resampled and clipped.

    Args:
        path_name: Path of the audio file to read.
        use_torch: If True, read the file with ``torchaudio.load``; otherwise read
            it with ``scipy.io.wavfile.read`` and transpose the samples so that
            time is the last axis.
        resample: If True, resample from the file's own sample rate to
            ``resample_rate`` with librosa. Audio shorter than 0.1 s is
            zero-padded to 0.1 s (at the original rate) before resampling.
        resample_rate: Target sample rate in Hz; only used when ``resample`` is True.
        clip: If True, clip the returned samples to [-1.0, 1.0]. On the
            ``wavfile`` path the raw samples are also clipped right after reading.

    Returns:
        np.ndarray with time on the last axis.

    Raises:
        AssertionError: If ``resample`` is True and the file contains no samples.
    """
    if use_torch:
        waveform, sr_loaded = torchaudio.load(path_name)
        wave_data_loaded = waveform.numpy()
    else:
        sr_loaded, samples = wavfile.read(path_name)
        # NOTE(review): wavfile.read returns raw integer samples for PCM files;
        # clipping to [-1, 1] is only meaningful for floating-point WAVs.
        # Presumably the RIRs are stored as float -- confirm.
        if clip:
            samples = np.clip(samples, -1.0, 1.0)
        wave_data_loaded = samples.T

    if not resample:
        resampled_wave = wave_data_loaded
    else:
        # Time is the last axis for both loaders; indexing with -1 (instead of 1)
        # also works for mono files, which wavfile returns as 1-D arrays.
        num_samples = wave_data_loaded.shape[-1]
        assert num_samples != 0, f"No audio samples found in {path_name!r}"

        min_samples = int(sr_loaded * 0.1)
        if num_samples < min_samples:
            # Keyword arguments are required: `size` is keyword-only in recent
            # librosa releases, so a positional call raises TypeError there.
            wave_data_loaded = librosa.util.fix_length(data=wave_data_loaded, size=min_samples)
        resampled_wave = librosa.resample(wave_data_loaded, orig_sr=sr_loaded, target_sr=resample_rate)

    return np.clip(resampled_wave, -1.0, 1.0) if clip else resampled_wave

In [None]:
class get_spec():
    """Turn waveforms into magnitude STFT spectrograms.

    The waveform is zero-padded at the end before the transform, and only the
    absolute value of the complex STFT is returned.
    """

    def __init__(self, fft_size=512):
        self.n_fft = fft_size
        # Hop of a quarter of the FFT size.
        self.hop = fft_size // 4
        # power=None asks torchaudio for the complex-valued spectrogram.
        self.spec_transform = Spectrogram(power=None, n_fft=self.n_fft, hop_length=self.hop)

    def transform(self, audio_data_prepad):
        """Return the STFT magnitude of ``audio_data_prepad`` (time on the last axis).

        The input is zero-padded by ``n_fft // 2`` samples, and further up to
        4410 samples if it is still shorter than that.
        """
        n_samples = audio_data_prepad.shape[-1]
        # Padding twice with zeros is the same as padding once to the larger size.
        target_len = max(n_samples + self.n_fft // 2, 4410)
        padded = librosa.util.fix_length(data=audio_data_prepad, size=target_len)

        complex_stft = self.spec_transform(torch.from_numpy(padded))
        return np.abs(complex_stft.numpy())

## Processing

In [None]:
# Scene to process; used below as the root directory for both the input RIRs
# and the saved magnitudes.
scene = "office_4"

In [None]:
# Sub-directory names under `binaural_rirs` to process
# (presumably receiver headings in degrees -- confirm).
orientations = [str(angle) for angle in range(0, 360, 90)]
# Magnitude-STFT helper with the default FFT size.
spec_getter = get_spec()

In [None]:

# For every orientation, convert each binaural RIR into a magnitude STFT and
# save it as <receiver-source>.npy under binaural_magnitudes_sr22050/<orientation>.
for rot in orientations:
    print('Processing orientation', rot)
    input_path_rirs = os.path.join(scene, 'binaural_rirs', rot)

    # Keep only .wav files so stray directory entries (hidden files, notes, ...)
    # are not turned into bogus "<stem>.wav" paths.
    files_rot = sorted(f for f in os.listdir(input_path_rirs) if f.lower().endswith('.wav'))
    # splitext keeps any extra dots in the name; split('.')[0] would truncate it.
    r_s_indexes = [os.path.splitext(f)[0] for f in files_rot]  # receiver-source format

    output_path_real = os.path.join(scene, 'binaural_magnitudes_sr22050', rot)
    os.makedirs(output_path_real, exist_ok=True)

    for ff_count, (audio_file, r_s) in enumerate(zip(files_rot, r_s_indexes)):
        if ff_count % 500 == 0:  # track progress
            print('Processing', ff_count, 'out of', len(r_s_indexes))

        # Load SoundSpaces 1.0 binaural RIR (by its real file name, so the
        # extension's case is preserved)
        audio_path = os.path.join(input_path_rirs, audio_file)
        loaded_audios = load_audio(audio_path, use_torch=False, resample=True, clip=True)  # same as NAF

        # Compute Magnitude STFT
        raw_real = spec_getter.transform(loaded_audios)

        # Save Magnitude STFT
        save_path = os.path.join(output_path_real, r_s + '.npy')
        np.save(save_path, raw_real)