# Downsample Songs for Dataset Augmentation

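This section processes the high-quality audio tracks from the **MUSDB18-HQ** dataset and applies one randomly chosen degradation per track (additive noise, reverb, echo, clipping, bit-crushing, filtering, dropout, or jitter) to simulate lower-fidelity audio. For brevity this is referred to below as **downsampling**, although no actual resampling is performed.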

The goal is to artificially degrade the original audio in a controlled way, generating paired examples for supervised learning in an audio restoration task. These downsampled versions serve as inputs to a model that learns to reconstruct the original high-quality signal.

This preprocessing step is essential for building a **paired dataset**:  
- **Input:** degraded (low-quality, downsampled) audio  
- **Target:** clean (high-quality, original) audio  

These degradations are meant to mimic the artifacts found in the output of generative models like MusicGen, such as reduced frequency resolution and loss of detail, so that the restoration model can learn to recover high-fidelity characteristics from low-quality input.

In [None]:
import numpy as np
import scipy.signal as signal
import librosa
from colorednoise import powerlaw_psd_gaussian
import soundfile as sf
import os
import random
import tqdm
from concurrent.futures import ProcessPoolExecutor

PATH_CLEAN = '../../data/train/clean'
PATH_DEGRA = '../../data/train/degraded'

# Random Downsample

In [None]:
def add_white_noise(audio, snr_db):
    """Return *audio* with zero-mean Gaussian noise added at a given SNR.

    Args:
        audio: NumPy array of samples.
        snr_db: Desired signal-to-noise ratio in decibels. The noise
            standard deviation is the signal RMS divided by
            ``10 ** (snr_db / 20)``.

    Returns:
        A new array with the same shape as *audio*.
    """
    signal_power = np.mean(audio**2)
    noise_std = np.sqrt(signal_power) / (10**(snr_db / 20))
    return audio + np.random.normal(loc=0, scale=noise_std, size=audio.shape)

def add_pink_noise(audio, snr_db):
    """Return *audio* with power-law noise (exponent 1) added at a given SNR.

    The noise comes from ``powerlaw_psd_gaussian(1, len(audio))``, is
    normalised to unit RMS, and is then scaled so that its RMS equals the
    signal RMS divided by ``10 ** (snr_db / 20)``.

    Args:
        audio: NumPy array of samples; its length sets the noise length.
        snr_db: Desired signal-to-noise ratio in decibels.

    Returns:
        The sum of *audio* and the scaled noise.
    """
    raw_noise = powerlaw_psd_gaussian(1, len(audio))
    unit_noise = raw_noise / np.sqrt(np.mean(raw_noise**2))
    target_rms = np.sqrt(np.mean(audio**2)) / (10**(snr_db / 20))
    return audio + unit_noise * target_rms

def add_brown_noise(audio, snr_db):
    """Return *audio* with power-law noise (exponent 2) added at a given SNR.

    The noise comes from ``powerlaw_psd_gaussian(2, len(audio))``, is
    normalised to unit RMS, and is then scaled so that its RMS equals the
    signal RMS divided by ``10 ** (snr_db / 20)``.

    Args:
        audio: NumPy array of samples; its length sets the noise length.
        snr_db: Desired signal-to-noise ratio in decibels.

    Returns:
        The sum of *audio* and the scaled noise.
    """
    raw_noise = powerlaw_psd_gaussian(2, len(audio))
    unit_noise = raw_noise / np.sqrt(np.mean(raw_noise**2))
    target_rms = np.sqrt(np.mean(audio**2)) / (10**(snr_db / 20))
    return audio + unit_noise * target_rms

def add_salt_pepper_noise(audio, prob=0.005):
    """Replace a random subset of samples with the extreme values -1.0 / 1.0.

    One uniform draw in [0, 1) is made per sample. Draws below ``prob / 2``
    set the sample to -1.0 and draws above ``1 - prob / 2`` set it to 1.0.

    Args:
        audio: NumPy array of samples (indexed along its first axis).
        prob: Approximate total fraction of samples to overwrite.

    Returns:
        A modified copy of *audio*; the input array is left untouched.
    """
    draws = np.random.rand(len(audio))
    half_prob = prob / 2
    result = audio.copy()
    result[draws < half_prob] = -1.0
    result[draws > 1 - half_prob] = 1.0
    return result

def apply_reverb(audio, rir):
    """Convolve *audio* with an impulse response and keep the original length.

    Args:
        audio: NumPy array of samples.
        rir: Impulse response to convolve with.

    Returns:
        The first ``len(audio)`` samples of the full FFT convolution, so the
        tail that extends past the end of the input is discarded.
    """
    wet = signal.fftconvolve(audio, rir, mode='full')
    return wet[:len(audio)]

def apply_echo(audio, sr, delay_ms=150, decay=0.5):
    """Mix a single delayed, attenuated copy of *audio* back into itself.

    Args:
        audio: NumPy array of samples.
        sr: Sample rate in Hz, used to convert *delay_ms* to samples.
        delay_ms: Echo delay in milliseconds (must not be negative).
        decay: Gain applied to the delayed copy before mixing.

    Returns:
        ``audio + decay * delayed``, same length as *audio*. If the delay is
        at least as long as the signal, no echo falls inside the output and
        the result equals *audio*.

    Raises:
        ValueError: If the delay converts to a negative number of samples.
    """
    delay_samples = int(sr * delay_ms / 1000)
    if delay_samples < 0:
        raise ValueError("delay_ms must be non-negative")

    echo_signal = np.zeros_like(audio)
    if delay_samples < len(audio):
        # Use an explicit end index: ``audio[:-0]`` is an empty slice, which
        # made a zero-sample delay fail with a broadcast error.
        echo_signal[delay_samples:] = audio[:len(audio) - delay_samples]
    return audio + decay * echo_signal

def apply_clipping(audio, threshold=0.5):
    """Hard-clip *audio* to the symmetric range ``[-threshold, threshold]``.

    Args:
        audio: NumPy array of samples.
        threshold: Magnitude at which samples are clipped.

    Returns:
        A new array with every sample limited to the clipping range.
    """
    floor, ceiling = -threshold, threshold
    return np.clip(audio, floor, ceiling)

def apply_bitcrush(audio, bit_depth=8):
    """Quantise *audio* onto a grid with step ``1 / 2 ** bit_depth``.

    Args:
        audio: NumPy array of samples.
        bit_depth: Exponent that sets the quantisation step size.

    Returns:
        A new array in which every sample is rounded to the nearest multiple
        of ``1 / 2 ** bit_depth``.
    """
    scale = 2 ** bit_depth
    quantised = np.round(audio * scale)
    return quantised / scale

def apply_lowpass(audio, sr, cutoff_hz=4000):
    """Filter *audio* with a 6th-order Butterworth low-pass filter.

    Args:
        audio: NumPy array of samples.
        sr: Sample rate in Hz.
        cutoff_hz: Cutoff frequency in Hz; it is normalised by the Nyquist
            frequency (``sr / 2``) before the filter is designed.

    Returns:
        The filtered signal from a single forward ``lfilter`` pass.
    """
    nyquist = sr / 2
    numerator, denominator = signal.butter(6, cutoff_hz / nyquist, btype='low')
    return signal.lfilter(numerator, denominator, audio)

def apply_highpass(audio, sr, cutoff_hz=1000):
    """Filter *audio* with a 6th-order Butterworth high-pass filter.

    Args:
        audio: NumPy array of samples.
        sr: Sample rate in Hz.
        cutoff_hz: Cutoff frequency in Hz; it is normalised by the Nyquist
            frequency (``sr / 2``) before the filter is designed.

    Returns:
        The filtered signal from a single forward ``lfilter`` pass.
    """
    nyquist = sr / 2
    numerator, denominator = signal.butter(6, cutoff_hz / nyquist, btype='high')
    return signal.lfilter(numerator, denominator, audio)

def apply_bandpass(audio, sr, low_hz, high_hz):
    """Filter *audio* with a Butterworth band-pass filter (``butter(4, ...)``).

    Args:
        audio: NumPy array of samples.
        sr: Sample rate in Hz.
        low_hz: Lower band edge in Hz.
        high_hz: Upper band edge in Hz.

    Returns:
        The filtered signal from a single forward ``lfilter`` pass. Both band
        edges are normalised by the Nyquist frequency (``sr / 2``).
    """
    nyquist = sr / 2
    band_edges = [low_hz / nyquist, high_hz / nyquist]
    numerator, denominator = signal.butter(4, band_edges, btype='band')
    return signal.lfilter(numerator, denominator, audio)

def apply_dropout(audio, num_dropouts=5, dropout_len=1000):
    """Silence randomly placed segments of *audio*.

    Args:
        audio: NumPy array of samples.
        num_dropouts: Number of segments to zero out (segments may overlap).
        dropout_len: Length of each zeroed segment, in samples.

    Returns:
        A modified copy of *audio*; the input array is left untouched. When
        the clip is not longer than *dropout_len*, every dropout starts at
        index 0 and therefore silences the whole clip.
    """
    corrupted = audio.copy()
    # Exclusive upper bound for the random start index. It is non-positive for
    # clips that are not longer than one dropout, and ``randint`` would raise.
    max_start = len(audio) - dropout_len
    for _ in range(num_dropouts):
        start = np.random.randint(0, max_start) if max_start > 0 else 0
        corrupted[start:start + dropout_len] = 0
    return corrupted

def apply_jitter(audio, max_shift=2):
    """Replace every sample with a randomly chosen nearby sample.

    For each position ``i`` an integer offset is drawn uniformly from
    ``[-max_shift, max_shift]`` and the output takes the input sample at
    ``i + offset``, with the index clamped to the valid range.

    The offsets are drawn and applied in one vectorised step; a per-sample
    Python loop is prohibitively slow on full-length tracks.

    Args:
        audio: NumPy array of samples (indexed along its first axis).
        max_shift: Maximum absolute offset, in samples.

    Returns:
        A new array with the same shape and dtype as *audio*.
    """
    samples = np.asarray(audio)
    count = len(samples)
    if count == 0:
        return np.zeros_like(samples)

    offsets = np.random.randint(-max_shift, max_shift + 1, size=count)
    source_index = np.clip(np.arange(count) + offsets, 0, count - 1)
    return samples[source_index]


In [None]:
# Counter of input files deleted by the processing loop further down.
removed_files = 0
# Pool of degradation functions. One entry is drawn at random
# (``random.choice``) for every input file, so the order of this list
# affects which function a seeded run selects.
fun_list = [
    add_white_noise,
    add_pink_noise,
    add_brown_noise,
    add_salt_pepper_noise,
    apply_reverb,
    apply_echo,
    apply_clipping,
    apply_bitcrush,
    apply_lowpass,
    apply_highpass,
    apply_bandpass,
    apply_dropout,
    apply_jitter,
]

def _clamp_unit(value):
    """Clamp *value* to the closed interval [0.0, 1.0]."""
    return max(0.0, min(1.0, value))


def get_degradation_score(func_name, params):
    """Return a heuristic severity score in [0, 1] for a degradation.

    Args:
        func_name: Name of the degradation function (e.g. ``"apply_echo"``).
        params: Dict of the parameters the degradation was run with. Missing
            keys fall back to the per-degradation defaults used below.

    Returns:
        A float clamped to [0.0, 1.0]; larger means a stronger degradation.
        Unrecognised function names score 0.5.
    """
    if func_name in ("add_white_noise", "add_pink_noise", "add_brown_noise"):
        # Lower SNR means more noise, hence a higher score.
        return _clamp_unit(0.9 - params.get("snr_db", 10) / 20)

    if func_name == "apply_bitcrush":
        return _clamp_unit(1.0 - params.get("bit_depth", 8) / 16)

    if func_name == "apply_pitch_shift":
        return _clamp_unit(abs(params.get("n_steps", 0)) / 12)

    if func_name in ("apply_time_stretch", "apply_speed_change"):
        return _clamp_unit(abs(1 - params.get("rate", 1.0)))

    if func_name == "apply_echo":
        return _clamp_unit(0.5 + params.get("decay", 0.5) / 2)

    if func_name == "apply_reverb":
        return _clamp_unit(params.get("rir_std", 0.01) * 100)

    if func_name == "apply_clipping":
        return _clamp_unit(1.0 - params.get("threshold", 0.5) * 2)

    if func_name == "add_salt_pepper_noise":
        return _clamp_unit(params.get("prob", 0.005) * 100)

    if func_name == "apply_dropout":
        num_dropouts = params.get("num_dropouts", 5)
        dropout_len = params.get("dropout_len", 1000)
        return _clamp_unit((num_dropouts * dropout_len) / 100000)

    if func_name == "apply_jitter":
        return _clamp_unit(params.get("max_shift", 2) / 10)

    if func_name == "apply_lowpass":
        # A lower cutoff removes more of the spectrum, hence a higher score.
        return _clamp_unit((4000 - params.get("cutoff_hz", 3000)) / 4000)

    if func_name == "apply_highpass":
        # A high-pass removes everything *below* the cutoff, so severity grows
        # with the cutoff. Reusing the low-pass formula here would invert it.
        return _clamp_unit(params.get("cutoff_hz", 3000) / 4000)

    if func_name == "apply_bandpass":
        band_width = params.get("high_hz", 3000) - params.get("low_hz", 500)
        # A narrower pass band keeps less of the signal, hence a higher score.
        return _clamp_unit(1 - band_width / 4000)

    return 0.5
    

# Fixed keyword arguments for each degradation, keyed by function name.
# Reverb is handled separately because it needs a freshly generated impulse
# response rather than plain keyword arguments.
_DEGRADATION_KWARGS = {
    "apply_echo": {"delay_ms": 120, "decay": 0.6},
    "apply_lowpass": {"cutoff_hz": 3000},
    "apply_highpass": {"cutoff_hz": 2000},
    "apply_bandpass": {"low_hz": 500, "high_hz": 3000},
    "add_white_noise": {"snr_db": 10},
    "add_pink_noise": {"snr_db": 10},
    "add_brown_noise": {"snr_db": 10},
    "add_salt_pepper_noise": {"prob": 0.005},
    "apply_clipping": {"threshold": 0.4},
    "apply_bitcrush": {"bit_depth": 6},
    "apply_dropout": {"num_dropouts": 5, "dropout_len": 1000},
    "apply_jitter": {"max_shift": 2},
    "apply_pitch_shift": {"n_steps": 2},
}
# Degradations that take the sample rate as their second positional argument.
_NEEDS_SAMPLE_RATE = frozenset({
    "apply_echo",
    "apply_lowpass",
    "apply_highpass",
    "apply_bandpass",
    "apply_pitch_shift",
})
_REVERB_RIR_STD = 0.01
_REVERB_RIR_LEN = 2048


def _run_degradation(func, audio, sr):
    """Apply *func* to *audio* with its fixed settings.

    Returns a ``(processed_audio, params)`` pair, where *params* is the dict
    later passed to ``get_degradation_score``. A function with no entry in
    the settings table leaves the audio unchanged and reports no params.
    """
    name = func.__name__
    if name == "apply_reverb":
        # Synthetic impulse response: Gaussian noise scaled by the chosen std.
        rir = np.random.randn(_REVERB_RIR_LEN) * _REVERB_RIR_STD
        return func(audio, rir), {"rir_std": _REVERB_RIR_STD}

    kwargs = _DEGRADATION_KWARGS.get(name)
    if kwargs is None:
        return audio, {}
    if name in _NEEDS_SAMPLE_RATE:
        return func(audio, sr, **kwargs), dict(kwargs)
    return func(audio, **kwargs), dict(kwargs)


# Make sure the output directory exists so that writes cannot fail merely
# because it is missing.
os.makedirs(PATH_DEGRA, exist_ok=True)

# Files whose degradation or export failed. Their clean source is kept.
failed_files = 0

with open("../../data/degradation_points.txt", "w") as f:

    for file_audio in tqdm.tqdm(os.listdir(PATH_CLEAN), desc="Degradazione audio"):
        if not file_audio.endswith(('.wav', '.mp3', '.flac')):
            continue
        full_path = os.path.join(PATH_CLEAN, file_audio)

        # Only a failure to *load* marks the clean file as corrupt and removes
        # it from the dataset. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit stop the run without deleting data.
        try:
            audio, sr = librosa.load(full_path, sr=None)
        except Exception:
            os.remove(full_path)
            removed_files += 1
            continue

        func = random.choice(fun_list)
        func_name = func.__name__

        # A failure while degrading or writing the output is not evidence of
        # a bad source file, so the clean file is kept and the error reported.
        try:
            processed, params = _run_degradation(func, audio, sr)
            score = get_degradation_score(func_name, params)

            output_path = os.path.join(PATH_DEGRA, f"{os.path.splitext(file_audio)[0]}_{func_name}.mp3")
            sf.write(output_path, processed, sr)
        except Exception as exc:
            failed_files += 1
            tqdm.tqdm.write(f"Skipped {file_audio}: {func_name} failed ({exc!r})")
            continue

        f.write(f"{output_path}\t{score:.3f}\n")

print(f"Elaboration completed. File(s) deleted: {removed_files}")
if failed_files:
    print(f"File(s) skipped after a processing error: {failed_files}")

Degradazione audio:  10%|▉         | 774/8000 [04:14<23:25,  5.14it/s]  [src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
  audio, sr = librosa.load(full_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, sr = librosa.load(full_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, sr = librosa.load(full_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Degradazione audio:  30%|███       | 2404/8000 [14:30<28:23,  3.28it/s]  Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resyn

Elaboration completed. File(s) deleted: 3



