In [None]:
from utils import process_audio_file, process_audio_file_super_fast
import os
import numpy as np
import cv2
from scipy.signal import spectrogram
from scipy.signal.windows import blackman
from matplotlib import pyplot as plt
from typing import List, Tuple
from dataclasses import dataclass
from skimage.metrics import structural_similarity as ssim
from scipy.io import wavfile
import time
from predict_and_extract_online import prepare_image_batch, ProcessingConfig
from prepare_audio import process_audio_file_super_fast2, process_audio_file_old


def process_audio_file_corrected(file_path, saving_folder="./images", batch_size=50, start_time=0, end_time=3,
                       save=False, wlen=2048, nfft=2048, sliding_w=0.4, cut_low_frequency=3,
                       cut_high_frequency=20, target_width_px=1167, target_height_px=875):
    """
    Process an audio file and generate spectrogram images without matplotlib.

    The recording is cut into overlapping 0.8 s slices whose start advances by
    `sliding_w` seconds. For each slice the power spectrogram is computed,
    converted to dB, cropped to the requested frequency band, min/max
    normalised per slice to 0-255, resized with OpenCV (nearest-neighbour) and
    replicated on three channels.

    Relies on the module-level imports `os`, `np`, `cv2`, `spectrogram`,
    `blackman` and `wavfile`.

    Parameters:
        file_path (str): Path to the audio file.
        saving_folder (str): Folder to save images (if save=True).
        batch_size (int): Maximum number of spectrogram images to generate.
        start_time (float): Start time in seconds.
        end_time (float): End time in seconds (None = until end of file).
        save (bool): Whether to save the images.
        wlen (int): Window length for spectrogram calculation.
        nfft (int): Number of FFT points.
        sliding_w (float): Step between the starts of two consecutive slices,
            in seconds.
        cut_low_frequency (int): Lower frequency limit (in kHz) for the spectrogram.
        cut_high_frequency (int): Upper frequency limit (in kHz) for the spectrogram.
        target_width_px (int): Target image width in pixels.
        target_height_px (int): Target image height in pixels.

    Returns:
        images (list): List of spectrogram images as uint8 numpy arrays of
            shape (target_height_px, target_width_px, 3). Empty when the
            selected time range cannot hold a single 0.8 s slice.

    Raises:
        FileNotFoundError: If the audio file is not found.
        ValueError: If the requested frequency band contains no spectrogram bin.
        OSError: If save=True and an image cannot be written.
    """
    try:
        fs, x = wavfile.read(file_path)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File {file_path} not found.") from err

    # wavfile.read returns (samples, channels) for multi-channel files. Slicing
    # along time and handing that to `spectrogram` would run the FFT across the
    # channel axis, so mix down to mono first.
    if x.ndim > 1:
        x = x.mean(axis=1)

    if save:
        os.makedirs(saving_folder, exist_ok=True)

    # Loop-invariant spectrogram settings: periodic Blackman window (scipy)
    # with a hop of 80 % of the window length.
    win = blackman(wlen, sym=False)
    hop = round(0.8 * wlen)
    noverlap = wlen - hop

    file_name = os.path.splitext(os.path.basename(file_path))[0]
    N = len(x)
    if end_time is not None:
        N = min(N, int(end_time * fs))
    low = int(start_time * fs)
    new_samples_per_slice = int(sliding_w * fs)  # step between slice starts
    samples_per_slice = int(0.8 * fs)            # every slice lasts 0.8 s

    images = []
    # The frequency axis is identical for every slice, so the crop indices are
    # computed once, from the first spectrogram.
    low_idx = high_idx = None
    for _ in range(batch_size):
        if low + samples_per_slice > N:
            break

        x_w = x[low:low + samples_per_slice]
        f, t, Sxx = spectrogram(x_w, fs, nperseg=wlen, noverlap=noverlap, nfft=nfft, window=win)
        # Power in dB; the small offset avoids log10(0) on silent bins.
        Sxx = 10 * np.log10(np.abs(Sxx) + 1e-14)

        if low_idx is None:
            # f is in Hz while the limits are given in kHz.
            low_idx = np.searchsorted(f, cut_low_frequency * 1000)
            high_idx = np.searchsorted(f, cut_high_frequency * 1000)
            if low_idx >= high_idx:
                raise ValueError(
                    f"Frequency band {cut_low_frequency}-{cut_high_frequency} kHz contains no "
                    f"spectrogram bin at a sampling rate of {fs} Hz."
                )

        # NOTE(review): rows follow the order of `f`, so row 0 of the image (its
        # top) is the lowest retained frequency, whereas a matplotlib plot puts
        # low frequencies at the bottom - confirm the orientation expected
        # downstream.
        Sxx_cropped = Sxx[low_idx:high_idx, :]

        # Per-slice dynamic range, like pcolormesh's default normalisation; the
        # epsilon prevents a division by zero on flat spectra.
        vmin = Sxx_cropped.min()
        vmax = Sxx_cropped.max()
        norm = (Sxx_cropped - vmin) / (vmax - vmin + 1e-14)
        img_gray = np.uint8(255 * norm)

        # Nearest-neighbour resize keeps the original bin values (no smoothing).
        resized = cv2.resize(img_gray, (target_width_px, target_height_px), interpolation=cv2.INTER_NEAREST)
        # Replicate the grayscale plane on three channels.
        image = np.stack([resized, resized, resized], axis=2)

        if save:
            image_name = os.path.join(saving_folder, f"{file_name}-{low/fs:.2f}_fast.jpg")
            # cv2.imwrite reports failure through its return value only.
            if not cv2.imwrite(image_name, image):
                raise OSError(f"Could not write image {image_name}.")

        images.append(image)
        low += new_samples_per_slice

    return images


# Settings object for the online prediction pipeline: no frequency cropping,
# no normalisation, 224x224 images (the usual input size for most models),
# and positive examples are kept on disk.
config = ProcessingConfig(
    batch_duration=None,
    batch_size=50,
    cut_low_frequency=None,
    cut_high_frequency=None,
    image_normalize=False,
    image_size=(224, 224),
    save_positive_examples=True,
)

# Recording used for the comparisons in this notebook.
wavtest = "/home/emanuelli/Téléchargements/test.wav"

# Reference run with the existing fast implementation (3-20 kHz band,
# 0.4 s step, images saved to its default folder).
process_audio_file_super_fast(
    wavtest,
    batch_size=50,
    save=True,
    wlen=2048,
    nfft=2048,
    sliding_w=0.4,
    cut_low_frequency=3,
    cut_high_frequency=20,
)

2025-03-26 10:24:53.015929: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-26 10:24:53.025506: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-26 10:24:53.101994: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-26 10:24:53.162842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-26 10:24:53.222211: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

[array([[[174, 174, 174],
         [174, 174, 174],
         [174, 174, 174],
         ...,
         [181, 181, 181],
         [181, 181, 181],
         [181, 181, 181]],
 
        [[174, 174, 174],
         [174, 174, 174],
         [174, 174, 174],
         ...,
         [181, 181, 181],
         [181, 181, 181],
         [181, 181, 181]],
 
        [[174, 174, 174],
         [174, 174, 174],
         [174, 174, 174],
         ...,
         [181, 181, 181],
         [181, 181, 181],
         [181, 181, 181]],
 
        ...,
 
        [[177, 177, 177],
         [177, 177, 177],
         [177, 177, 177],
         ...,
         [144, 144, 144],
         [144, 144, 144],
         [144, 144, 144]],
 
        [[161, 161, 161],
         [161, 161, 161],
         [161, 161, 161],
         ...,
         [121, 121, 121],
         [121, 121, 121],
         [121, 121, 121]],
 
        [[161, 161, 161],
         [161, 161, 161],
         [161, 161, 161],
         ...,
         [121, 121, 121],
  

In [12]:
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy.signal import spectrogram
from scipy.signal.windows import blackman

import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy.signal import spectrogram
from scipy.signal.windows import blackman
import cv2

def process_audio_file_matlab_match(file_path, saving_folder='images/', 
                                    starting_record_index=1, 
                                    sliding_w=0.4, 
                                    cut_low_frequency=3, 
                                    cut_high_frequency=20,
                                    wlen=2048, 
                                    nfft=2048):
    """
    Process audio file to generate spectrograms matching MATLAB implementation.

    The recording is cut into 0.8 s slices whose start advances by `sliding_w`
    seconds. Each slice is rendered with matplotlib as a borderless grayscale
    magnitude spectrogram (dB), resized to 1167x875 pixels and written to
    `saving_folder` as '<file name>-<offset in seconds>.jpg'.

    Parameters:
    - file_path (str): Path to the audio file
    - saving_folder (str): Folder to save spectrogram images
    - starting_record_index (int): Currently unused; kept for interface
      compatibility
    - sliding_w (float): Step between two consecutive slices, in seconds
    - cut_low_frequency (int): Lower frequency cutoff in kHz
    - cut_high_frequency (int): Upper frequency cutoff in kHz
    - wlen (int): Window length
    - nfft (int): Number of FFT points

    Returns:
    - None. Images are written to disk; nothing is written when the recording
      is shorter than one 0.8 s slice.

    Raises:
    - ValueError: If `sliding_w` is shorter than one sample, or if the
      requested frequency band contains no spectrogram bin
    - RuntimeError: If the rendered figure cannot be decoded
    - OSError: If an image cannot be written
    """
    import io  # stdlib; only needed for the in-memory PNG buffer below

    # Ensure saving folder exists
    os.makedirs(saving_folder, exist_ok=True)

    # Read audio file and convert to single precision
    # NOTE(review): multi-channel recordings (2-D `x`) are not handled here -
    # confirm the inputs are always mono.
    fs, x = wavfile.read(file_path)
    x = x.astype(np.single)

    # Periodic Blackman window with a hop of 80 % of the window length
    hop = round(0.8 * wlen)
    win = blackman(wlen, sym=False)

    # The output name keeps the extension of the input file ('rec.wav-0.jpg')
    file_name = os.path.basename(file_path)

    slice_len = int(0.8 * fs)
    step = int(sliding_w * fs)
    if step <= 0:
        # A zero step would never advance `low` and loop forever.
        raise ValueError(f"sliding_w={sliding_w} is shorter than one sample at {fs} Hz.")

    # MATLAB-style 1-based, inclusive bounds of the current slice
    N = len(x)
    low = 1
    up = low + slice_len - 1
    file_name_ex = 0

    while up <= N:
        # Extract window (convert the 1-based inclusive bounds to a Python slice)
        x_w = x[low-1:up]

        # Magnitude spectrogram of the slice
        f, t, Ps = spectrogram(x_w, fs, window=win, nperseg=wlen, 
                               noverlap=wlen-hop, nfft=nfft, 
                               mode='magnitude', scaling='spectrum')

        # Convert frequency to kHz so it is comparable with the cutoffs
        f = f / 1000

        # Magnitude in dB; the offset avoids log10(0) on silent bins
        Pf = 20 * np.log10(np.abs(Ps) + 1e-10)

        # Keep only the requested frequency band
        low_idx = np.searchsorted(f, cut_low_frequency)
        high_idx = np.searchsorted(f, cut_high_frequency)
        if low_idx >= high_idx:
            raise ValueError(
                f"Frequency band {cut_low_frequency}-{cut_high_frequency} kHz contains no "
                f"spectrogram bin at a sampling rate of {fs} Hz."
            )
        Pf_filtered = Pf[low_idx:high_idx, :]
        f_filtered = f[low_idx:high_idx]

        # Render the slice with matplotlib: low frequencies at the bottom, no
        # axes, no margins. The PNG goes to an in-memory buffer, so no temporary
        # file is created in the output folder.
        png_buffer = io.BytesIO()
        fig, ax = plt.subplots(figsize=(9.03, 6.77), dpi=1000)
        try:
            ax.imshow(Pf_filtered, aspect='auto', cmap='gray', origin='lower', 
                      extent=[t[0], t[-1], f_filtered[0], f_filtered[-1]])
            ax.axis('off')
            fig.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0, hspace=0)
            # The dpi given here overrides the one used at figure creation.
            fig.savefig(png_buffer, format='png', bbox_inches='tight', pad_inches=0, dpi=100)
        finally:
            # Always release the figure, even when rendering fails.
            plt.close(fig)

        # Decode the rendered PNG as a grayscale image
        img = cv2.imdecode(np.frombuffer(png_buffer.getvalue(), dtype=np.uint8),
                           cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise RuntimeError("Could not decode the rendered spectrogram image.")

        # Resize to the exact target dimensions
        img_resized = cv2.resize(img, (1167, 875), interpolation=cv2.INTER_AREA)

        # Save the final image; cv2.imwrite reports failure through its return value only
        save_path = os.path.join(saving_folder, f'{file_name}-{file_name_ex}.jpg')
        if not cv2.imwrite(save_path, img_resized):
            raise OSError(f"Could not write image {save_path}.")

        # Prepare for next iteration. The offset is rounded so that accumulated
        # float error does not leak into file names (e.g. 1.2000000000000002).
        low = low + step
        file_name_ex = round(file_name_ex + sliding_w, 10)
        up = low + slice_len - 1

# Example usage: render the spectrogram images of the test recording
# (`wavtest`, defined in the first cell) into the default 'images/' folder.
process_audio_file_matlab_match(wavtest)