In [184]:
import torch
import torchaudio
from IPython.display import Audio
import torchaudio.functional as F

import numpy as np

# from torchaudio.utils import download_asset

In [185]:
def random_codec_augmentation(waveform, sample_rate):
    
    num_augmentations = np.random.randint(4, 20)
    
    augmentations = [
        lambda w, sr: torchaudio.functional.apply_codec(w, sr, format="mp3"),
        lambda w, sr: torchaudio.functional.apply_codec(w, sr, format="ogg"),
        lambda w, sr: torchaudio.functional.apply_codec(w, sr, format="vorbis", compression=-1),
        lambda w, sr: torchaudio.functional.apply_codec(w, sr, format="wav", encoding="ULAW", bits_per_sample=8),
    ]
    
    
    for i in range(num_augmentations):
        augmentation = augmentations[torch.randint(len(augmentations), (1,)).item()]
        waveform = augmentation(waveform, sample_rate)
        
    return waveform, sample_rate

In [186]:
path = "/data/a.varlamov/LJSpeech-1.1/wavs/LJ001-0011.wav"

In [187]:
# Example usage
waveform, sample_rate = torchaudio.load(path)
augmented_waveform, sample_rate = random_codec_augmentation(waveform, sample_rate)

In [188]:
augmented_waveform = augmented_waveform.numpy()

In [189]:
Audio(waveform, rate=sample_rate)

In [190]:
Audio(augmented_waveform, rate=sample_rate)

In [191]:
waveform.shape

torch.Size([1, 99485])

In [192]:
augmented_waveform.shape

(1, 100800)

---

In [193]:
import numpy as np
from pydub import AudioSegment
import tempfile
import os
import random

def apply_random_codec(waveform, sample_rate):
    """
    Randomly applies one of the available codecs to the waveform.

    Parameters:
        waveform (numpy.ndarray): The audio waveform.
        sample_rate (int): The sampling rate of the waveform.

    Returns:
        numpy.ndarray: The waveform after codec augmentation.
    """
    if np.random.uniform(0, 1) <= 0.5:
        return waveform, sample_rate
    
    # List of codecs and bitrates
    codecs = ['mp3', 'ogg', 'flac', 'wav']
    codec = random.choice(codecs)
    bitrate = random.choice(['64k', '128k', '192k'])
    
    if codec == "ogg":
        bitrate = "64k"
        
    codec = "flac"
    
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_in:
        temp_in_path = temp_in.name
        temp_out_path = temp_in_path.replace('.wav', f'.{codec}')
        
        # Save waveform to a temporary WAV file
        AudioSegment(
            waveform.tobytes(),
            frame_rate=sample_rate,
            sample_width=waveform.dtype.itemsize,
            channels=1
        ).export(temp_in_path, format="wav")
    
    try:
        # Apply random codec
        audio = AudioSegment.from_file(temp_in_path)
        audio.export(temp_out_path, format=codec, bitrate=bitrate)
        
        # Read back the processed file
        processed_audio = AudioSegment.from_file(temp_out_path)
        processed_waveform = np.array(processed_audio.get_array_of_samples(), dtype=np.float32) / (2**15)
    finally:
        os.remove(temp_in_path)
        if os.path.exists(temp_out_path):
            os.remove(temp_out_path)
    
    return processed_waveform, sample_rate


In [194]:
# augmented_waveform, sample_rate = apply_random_codec(augmented_waveform, sample_rate)

In [195]:
# augmented_waveform

In [196]:
# Audio(augmented_waveform, rate=sample_rate)

---
## Encodec: