In [1]:
# Install SpeechBrain together with torchaudio and PyTorch into the current environment
!pip install speechbrain torchaudio torch




In [2]:
# Sanity check: confirm that SpeechBrain can be imported and show which version is installed
import speechbrain
print(speechbrain.__version__)


1.0.1


In [None]:
!pip install speechbrain # Run this only if the version-check cell above fails to import speechbrain

#### The noisereduce/librosa-based approach below does not use any deep learning; it is included only as a baseline, so you can compare its result with what the pretrained model produces

In [8]:
# Install the libraries for the baseline denoiser
# (librosa is installed here, but the code in this cell only imports noisereduce, torch and torchaudio)
!pip install noisereduce torchaudio librosa

# Dependencies for the classical (non-neural) denoising baseline
import os  # Used to confirm the input file is present
import torch  # Used to wrap the denoised array as a tensor
import torchaudio
import noisereduce as nr

# Location of the recording to clean up
audio_file = "example.wav"  # Replace with your actual audio file path

# Stop early with a readable message when the input is missing
if not os.path.isfile(audio_file):
    raise FileNotFoundError(f"The file '{audio_file}' does not exist. Please check the path.")

# Read the waveform and its sample rate from disk
signal, sr = torchaudio.load(audio_file)

# Multi-channel input is averaged over its first axis into a single 1D waveform;
# single-channel input is passed on exactly as loaded
if signal.size(0) > 1:
    signal = torch.mean(signal, dim=0)

# Run noisereduce on the NumPy view of the waveform
denoised_signal = nr.reduce_noise(sr=sr, y=signal.numpy())

# Wrap the result as a tensor again so torchaudio can write it
denoised_tensor = torch.tensor(denoised_signal)

# The tensor handed to torchaudio.save must not exceed two dimensions;
# a 1D result gets a leading axis so it becomes (1, samples)
if denoised_tensor.ndim > 2:
    raise ValueError(f"Unexpected tensor dimensions: {denoised_tensor.ndim}. Expected 1D or 2D tensor.")
if denoised_tensor.ndim == 1:
    denoised_tensor = torch.unsqueeze(denoised_tensor, 0)

# Write the result next to the notebook, reusing the input's sample rate
torchaudio.save("denoised_example.wav", denoised_tensor, sr)

print("Denoising completed. The denoised audio is saved as 'denoised_example.wav'.")


Denoising completed. The denoised audio is saved as 'denoised_example.wav'.


#### Below is the pretrained model from SpeechBrain. You will hear a clear difference between the audio cleaned by the pretrained model and the audio cleaned by the librosa-based baseline above

In [3]:
import torchaudio
import torch
import soundfile as sf
import numpy as np
from speechbrain.inference import SepformerSeparation as separator
from IPython.display import Audio


In [4]:
# Build the pretrained deep-learning enhancement (denoising) model.
# `source` names the pretrained model to load; `savedir` is the local directory passed for its files.
model = separator.from_hparams(
    savedir="pretrained_models/sepformer",
    source="speechbrain/sepformer-wham16k-enhancement",
)


  state_dict = torch.load(path, map_location=device)


In [5]:
# Load your noisy audio file
import os  # Cell-local import: needed only for the existence check below

audio_file = "example.wav"  # Replace with the path to your audio file

# Fail early with a clear message when the input is missing, using the same check and
# message as the noisereduce baseline cell, instead of a backend-specific loader error.
if not os.path.isfile(audio_file):
    raise FileNotFoundError(f"The file '{audio_file}' does not exist. Please check the path.")

# `signal` is the waveform tensor and `sr` the sample rate of the file as stored on disk
signal, sr = torchaudio.load(audio_file)


##### Tip: overclock your GPU (just a suggestion; the notebook may run faster)

In [6]:
# Apply the denoising model (deep learning) directly to the audio file on disk
denoised_signal = model.separate_file(audio_file)

# Reduce the model output to a 2D tensor.
# NOTE(review): SpeechBrain documents the separation output as [batch, time, sources].
# If that holds here, dropping the leading axis leaves (samples, sources), and it is the
# transpose in the playback cell that yields (channels, samples). Confirm against the
# installed SpeechBrain version.
if denoised_signal.ndim == 4:
    denoised_signal = denoised_signal.squeeze(0)  # Drop the leading axis when it has size 1

if denoised_signal.ndim == 3:
    denoised_signal = denoised_signal[0]  # Keep the first entry along the leading axis

# Anything that is still not 2D cannot be handled by the cells below
if denoised_signal.ndim != 2:
    raise ValueError("The denoised_signal tensor should be 2D (channels x samples)")

# Convert to a NumPy array for the cells below. Tensor.numpy() raises for tensors that
# live on a GPU or that track gradients, so detach and move to host memory first; both
# calls are no-ops for a plain CPU tensor.
denoised_signal = denoised_signal.detach().cpu().numpy()

# Normalise the dtype to float32 (copy=False returns the same array when it already matches)
denoised_signal = denoised_signal.astype('float32', copy=False)

# Reject NaN and +/-infinity in one pass
if not np.isfinite(denoised_signal).all():
    raise ValueError("The denoised signal contains NaN or infinite values.")


In [7]:
# Play the denoised audio in the notebook.
# NOTE(review): per the SpeechBrain documentation, separate_file resamples the input to
# the model's own sample rate (model.hparams.sample_rate), so the enhanced waveform is
# not necessarily at the input file's rate `sr`. Playing it back at `sr` would change
# speed and pitch whenever the two differ. Use the model rate when it is available and
# fall back to `sr` otherwise.
audio_player = Audio(
    denoised_signal.T,  # Transpose so the array is laid out as channels x samples
    rate=getattr(model.hparams, "sample_rate", sr),
)

# Leave the player as the cell's last expression so the notebook renders it
audio_player
