# In [4]:
import matplotlib.pyplot as plt
import numpy as np
from pydub import AudioSegment
import librosa
import librosa.display
import os
# Largest gain change, in dB, that volume normalization may apply in either
# direction (the computed gain is clamped to +/- this value).
NORMALIZE_VOLUME = 4
def standardize(audio):
    """
    Preprocess audio: set sample rate, bit depth and channels, then normalize volume.

    The audio is converted to 24000 Hz, 2-byte samples, single channel. A gain
    that moves its loudness toward -20 dBFS is applied, clamped to
    +/- NORMALIZE_VOLUME dB, and the samples are finally peak-normalized so
    that the largest absolute sample value is 1.0.

    Args:
        audio (str or AudioSegment): Audio file path or AudioSegment object,
            the audio to be preprocessed.

    Returns:
        dict: A dictionary containing the preprocessed waveform and a name:
              {
                  "waveform": np.ndarray, dtype np.float32, shape (num_samples,)
                  "name": str, the file base name when a path was given,
                          otherwise "audio"
              }
              Silent or zero-length audio yields an all-zero (or empty)
              waveform rather than NaNs or an error.

    Raises:
        ValueError: If the audio parameter is neither a str nor an AudioSegment.
    """
    if isinstance(audio, str):
        name = os.path.basename(audio)
        audio = AudioSegment.from_file(audio)
    elif isinstance(audio, AudioSegment):
        # An in-memory segment has no file name to report.
        name = "audio"
    else:
        raise ValueError("Invalid audio type")

    # Bring every input to one common format before measuring loudness.
    audio = audio.set_frame_rate(24000).set_sample_width(2).set_channels(1)

    # Gain needed to reach the target loudness.
    target_dBFS = -20
    gain = target_dBFS - audio.dBFS
    print(f"Calculating the gain needed for the audio: {gain} dB")

    # Clamp the gain to [-NORMALIZE_VOLUME, NORMALIZE_VOLUME] so very quiet or
    # very loud inputs are not over-corrected.
    clamped_gain = min(max(gain, -NORMALIZE_VOLUME), NORMALIZE_VOLUME)
    normalized_audio = audio.apply_gain(clamped_gain)

    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)

    # Peak-normalize. Skip the division when there is no signal: an empty
    # array has no maximum, and a zero peak would turn every sample into NaN.
    max_amplitude = np.max(np.abs(waveform)) if waveform.size else 0.0
    if max_amplitude > 0:
        waveform /= max_amplitude

    return {
        "waveform": waveform,
        "name": name
    }