In [2]:
import numpy as np
import soundfile as sf
import librosa  

# Load the .wav file
data, samplerate = sf.read("SaliencySoundTest/ambisonic_audio_4ch.wav")

def compute_audio_at_direction(W: np.ndarray, X: np.ndarray, 
                                   Y: np.ndarray, Z: np.ndarray,
                                   top_left: tuple, bottom_right: tuple) -> np.ndarray:
    """
    Compute audio waveform over a rectangular tile defined by lat/lon bounds.
    Uses a spherical harmonic filter to extract the tile region from the ambisonic channels.
    
    Args:
        W, X, Y, Z: Ambisonic channel waveforms
        top_left: Tuple of (latitude, longitude) for top-left corner in degrees
        bottom_right: Tuple of (latitude, longitude) for bottom-right corner in degrees
        
    Returns:
        wave: Audio waveform extracted from the tile region
    """
    top_left_lat, top_left_lon = top_left
    bottom_right_lat, bottom_right_lon = bottom_right
    
    # Convert bounds to radians
    lat_min_rad = np.radians(bottom_right_lat)
    lat_max_rad = np.radians(top_left_lat)
    lon_min_rad = np.radians(top_left_lon)
    lon_max_rad = np.radians(bottom_right_lon)
    
    # Integrate SH basis functions over the tile region
    # For each basis function, compute the integral over the rectangular area
    
    # Integral of Y_W = 1 over lat x lon rectangle
    integral_Y_W = (lat_max_rad - lat_min_rad) * (lon_max_rad - lon_min_rad)
    
    # Integral of cos(lat)*cos(lon) over lat x lon rectangle
    # = (sin(lat_max) - sin(lat_min)) * (sin(lon_max) - sin(lon_min))
    integral_Y_X = (np.sin(lat_max_rad) - np.sin(lat_min_rad)) * (np.sin(lon_max_rad) - np.sin(lon_min_rad))
    
    # Integral of cos(lat)*sin(lon) over lat x lon rectangle
    # = (sin(lat_max) - sin(lat_min)) * (-cos(lon_max) + cos(lon_min))
    integral_Y_Y = (np.sin(lat_max_rad) - np.sin(lat_min_rad)) * (-np.cos(lon_max_rad) + np.cos(lon_min_rad))
    
    # Integral of sin(lat) over lat x lon rectangle
    # = (-cos(lat_max) + cos(lat_min)) * (lon_max - lon_min)
    integral_Y_Z = (-np.cos(lat_max_rad) + np.cos(lat_min_rad)) * (lon_max_rad - lon_min_rad)
    
    # Apply integrated SH filter to reconstruct waveform for the tile
    wave = integral_Y_W * W + integral_Y_X * X + integral_Y_Y * Y + integral_Y_Z * Z
    
    return wave
    
print(f"Shape: {data.shape}")   # (num_samples, 4)
print(f"Sample rate: {samplerate} Hz")

# Split into channels
W = data[:, 0]
X = data[:, 1]
Y = data[:, 2]
Z = data[:, 3]

def processWave(type, wave, sampleRate):
    windowSize = 2048
    hopSize = 100
    
    # converts this to a Short-Time Fourier Transform. Tells you how much eergy has at each frequency over time.
    # does this by going through windows. Length of each window defined by n_fft. Then, shifts window to right by length
    # hop length. at each window, computes how much of each frequency is present.
    # final value is 2D array of rows being each frequency, columns being time (which is now the windows), so value being amplitude/energy for that time and frequency
    stftWave = np.abs(librosa.stft(wave, n_fft=windowSize, hop_length=hopSize))
    # when we get the mel, that just converts all the frequencies to 128 possible onces, which are moreso frequencies humans can hear. So compressing
    # the frequencies from a large number of frequencies to a smaller number, in this case n_mels amount
    mel = librosa.feature.melspectrogram(S = stftWave, sr= sampleRate, n_mels = 128)
    # converts from power scaling of audio to decibel scaling, cause humans perceive in moreso logarithm of audio (so higher sounds kinda taper off to us)
    logMel = librosa.power_to_db(mel, ref=np.max)

    # gets overall frame energy, including amplitude
    frameEnergy = np.sqrt(np.mean(logMel ** 2, axis=0))
    # gets the contrast in energy between frequencies within a specific frequency band, so where some frequencies bands may have parts of high energy frequencies, while other parts are low energy
    contrast = librosa.feature.spectral_contrast(S = stftWave, sr=sampleRate)
    # combines the difference frequency bands to get a average contrast for that time frame
    contrast = np.mean(contrast, axis=0)
    # basically gets how much the sound chagnes over time. Does this by getting differnece over time fimes with np.diff, squaring that value, and getting its sum
    temporal_novelty = np.sum(np.diff(logMel, axis=1) ** 2, axis=0)
    # do this to add an extra value cause rn, the length is T - 1, since you're getting difference between frames. So add 1 to get it to T length
    temporal_novelty = np.insert(temporal_novelty, 0, 0)

    # standard normalization
    frameEnergyNorm = frameEnergy / np.max(frameEnergy)
    contrastNorm = contrast / np.max(contrast)
    temporal_novelty_norm = temporal_novelty / np.max(temporal_novelty)

    # gets it for the overall time
    finalSaliency = 0.5 * frameEnergyNorm + 0.3 * contrastNorm + 0.2 * temporal_novelty_norm

    secondsIn = 6.5
    # divide by hoSize as first secondIn * sr gets the specific sample we want. Dividing by hopSize tells us how many windows to traverse
    # to get to that sample, as say if it's less than hopSize, its in the window at index 0, if it's slightly more, it's in the window at index 1, etc
    index = int(secondsIn * sampleRate / hopSize)


    print(f"For type of wave {type}, saliency was {finalSaliency[index]}")

for latitude in range(90, -90, -20):
    for longitude in range(-180, 180, 20):
        topLeft = (latitude, longitude)
        bottomRight = (latitude -20, longitude + 20)
        wave = compute_audio_at_direction(W, X, Y, Z, topLeft, bottomRight)
        processWave(f"Tile at bounds {topLeft} and {bottomRight}", wave, samplerate)

Shape: (2880000, 4)
Sample rate: 48000 Hz
For type of wave Tile at bounds (90, -180) and (70, -160), saliency was 0.4838381844634645
For type of wave Tile at bounds (90, -160) and (70, -140), saliency was 0.5043085572320013
For type of wave Tile at bounds (90, -140) and (70, -120), saliency was 0.4981665270952472
For type of wave Tile at bounds (90, -120) and (70, -100), saliency was 0.5085314309307212
For type of wave Tile at bounds (90, -100) and (70, -80), saliency was 0.5044543879276044
For type of wave Tile at bounds (90, -80) and (70, -60), saliency was 0.5066178393753298
For type of wave Tile at bounds (90, -60) and (70, -40), saliency was 0.5078011975706318
For type of wave Tile at bounds (90, -40) and (70, -20), saliency was 0.49887467704187094
For type of wave Tile at bounds (90, -20) and (70, 0), saliency was 0.5070804625988586
For type of wave Tile at bounds (90, 0) and (70, 20), saliency was 0.5013325237624451
For type of wave Tile at bounds (90, 20) and (70, 40), saliency