In [2]:
import typing as tp
from datetime import datetime
from pathlib import Path

import numpy as np
import scipy
import torch.cuda
from transformers import pipeline
from IPython.display import Audio

# Output directory for generated audio, relative to the current working directory.
# (`Path('.').parent` is just `Path('.')`, so this is the plain relative path 'data'.)
data_dir: Path = Path('data')
# Create the directory on first run; do nothing if it already exists.
data_dir.mkdir(exist_ok=True)


def get_device(has_low_vram: bool = False):
    """Choose the torch device to run on.

    Args:
        has_low_vram: When True, force the CPU even if CUDA is available.

    Returns:
        ``torch.device("cuda")`` when CUDA is available and ``has_low_vram`` is
        False; ``torch.device("cpu")`` otherwise.
    """
    # Guard clause: no CUDA, or the caller asked to stay off the GPU.
    if not torch.cuda.is_available() or has_low_vram:
        return torch.device("cpu")
    return torch.device("cuda")


def normalize(sample: np.ndarray, norm_rate: float = 0.9) -> np.ndarray:
    """Scale ``sample`` relative to its peak absolute amplitude.

    The array is divided by ``peak * norm_rate``, where ``peak`` is the largest
    absolute value in ``sample``; the result therefore has a peak magnitude of
    ``1 / norm_rate``.
    NOTE(review): if ``norm_rate`` was meant as the target peak (e.g. 0.9 of
    full scale), the scaling should be ``sample * norm_rate / peak`` - confirm.

    Args:
        sample: Audio samples (any shape).
        norm_rate: Factor applied to the peak before dividing.

    Returns:
        A new array; ``sample`` itself is not modified. Empty or all-zero
        input is returned as an unscaled copy.
    """
    # The peak must consider both polarities: ``sample.max()`` alone misses a
    # dominant negative excursion (and is negative for all-negative input).
    peak = np.abs(sample).max() if sample.size else 0.0
    if peak == 0:
        # Empty or silent input: nothing to scale, and 0/0 would produce NaNs.
        return sample / 1.0
    norm_factor = peak * norm_rate
    return sample / norm_factor


def low_pass(sample: np.ndarray, cutoff: float, sampling_rate: float) -> np.ndarray:
    """Apply a 10th-order Butterworth low-pass filter to ``sample``.

    Args:
        sample: Signal to filter.
        cutoff: Cutoff frequency, in the same units as ``sampling_rate``.
        sampling_rate: Sampling frequency of ``sample``, passed to ``butter`` as ``fs``.

    Returns:
        The filtered signal produced by ``scipy.signal.sosfilt``.
    """
    # Design the filter as second-order sections, then run it over the signal.
    sections: np.ndarray = scipy.signal.butter(
        N=10, Wn=cutoff, btype='lowpass', output='sos', fs=sampling_rate,
    )
    filtered = scipy.signal.sosfilt(sections, sample)
    return filtered


def high_pass(sample: np.ndarray, cutoff: float, sampling_rate: float) -> np.ndarray:
    """Apply a 10th-order Butterworth high-pass filter to ``sample``.

    Args:
        sample: Signal to filter.
        cutoff: Cutoff frequency, in the same units as ``sampling_rate``.
        sampling_rate: Sampling frequency of ``sample``, passed to ``butter`` as ``fs``.

    Returns:
        The filtered signal produced by ``scipy.signal.sosfilt``.
    """
    # Design the filter as second-order sections, then run it over the signal.
    sections: np.ndarray = scipy.signal.butter(
        N=10, Wn=cutoff, btype='highpass', output='sos', fs=sampling_rate,
    )
    filtered = scipy.signal.sosfilt(sections, sample)
    return filtered


def postprocess_sample(sample: np.ndarray, sampling_rate: tp.Union[int, float], do_norm: bool = False,
                       do_low: bool = False, do_high: bool = False) -> np.ndarray:
    """Optionally normalize, low-pass and high-pass a generated audio sample.

    The enabled stages run in the order normalize -> low-pass -> high-pass on a
    copy of ``sample``; the input array is never modified. Debug statistics
    (cutoffs, shape, and min/max/mean after each enabled stage) are printed.

    Args:
        sample: Audio samples to process.
        sampling_rate: Sampling frequency of ``sample``.
        do_norm: Run ``normalize`` on the sample.
        do_low: Run ``low_pass`` with a cutoff of at most 10 kHz.
        do_high: Run ``high_pass`` with a 100 Hz cutoff.

    Returns:
        The processed copy of ``sample``.
    """
    # hyperparameters.
    nyquist: float = float(sampling_rate) / 2.0
    # A digital Butterworth cutoff must lie strictly below Nyquist, so clamp just
    # under it; for sampling rates above ~20.2 kHz this is still exactly 10 kHz.
    low_pass_cutoff_freq: float = min(0.99 * nyquist, 10_000)
    high_pass_cutoff_freq: float = 100.0

    print(low_pass_cutoff_freq, high_pass_cutoff_freq)

    out: np.ndarray = np.copy(sample)
    print(out.shape)
    if do_norm:
        out = normalize(out)
        print(out.min(), out.max(), out.mean())
    if do_low:
        out = low_pass(out, low_pass_cutoff_freq, sampling_rate)
        print(out.min(), out.max(), out.mean())
    if do_high:
        out = high_pass(out, high_pass_cutoff_freq, sampling_rate)
        print(out.min(), out.max(), out.mean())
    return out


def get_music_runner(has_low_memory: bool = False) -> tp.Callable[[str, tp.Optional[str]], tuple[Path, Audio]]:
    """Build a text-to-music generator backed by a MusicGen ``text-to-audio`` pipeline.

    Args:
        has_low_memory: When True, load ``facebook/musicgen-small`` instead of
            ``facebook/musicgen-large`` and pass ``has_low_vram=True`` to
            ``get_device``.

    Returns:
        A callable ``inner(prompt, annot=None)`` that generates audio for
        ``prompt``, writes it as a WAV file into ``./data`` (resolved against
        the current working directory and created on demand) and returns
        ``(wav_path, Audio)``. Files are named
        ``<YYYYmmdd-HHMMSS>[_<annot>]_<first five prompt words>.wav``.
    """
    model: str = 'facebook/musicgen-small' if has_low_memory else 'facebook/musicgen-large'
    synthesiser = pipeline("text-to-audio", model=model, device=get_device(has_low_vram=has_low_memory))

    # NOTE(review): this only runs if the pipeline object exposes ``half``; confirm
    # whether the cast was meant for ``synthesiser.model`` and whether half
    # precision is wanted on the device chosen for the low-memory case.
    if has_low_memory and hasattr(synthesiser, 'half'):
        synthesiser = synthesiser.half()

    def to_slug(text: str, max_words: tp.Optional[int] = None) -> str:
        """Lower-case ``text`` and join its filename-safe words with underscores."""
        # Anything other than letters, digits and '-' (e.g. '/', ':', quotes) becomes
        # a word break, so the result is always a single valid path component.
        safe: str = ''.join(ch if ch.isalnum() or ch == '-' else ' ' for ch in text.lower())
        return '_'.join(safe.split()[:max_words])

    def prompt_to_filename(prompt: str, annot: tp.Optional[str]) -> Path:
        """Return the timestamped output path for ``prompt`` inside ``./data``."""
        curr_time: str = datetime.now().strftime("%Y%m%d-%H%M%S")
        parts: list = [curr_time]
        if annot is not None:
            parts.append(to_slug(annot))
        parts.append(to_slug(prompt, max_words=5))
        # Join non-empty parts so the timestamp is always separated from what follows.
        filename: str = '_'.join(part for part in parts if part) + '.wav'
        out_dir: Path = Path('.').resolve() / 'data'
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir / filename

    def inner(prompt: str, annot: tp.Optional[str] = None) -> tuple[Path, Audio]:
        """Generate audio for ``prompt``, save it as WAV and return ``(path, Audio)``."""
        filename: Path = prompt_to_filename(prompt, annot)
        sample = synthesiser(prompt, forward_params={'do_sample': True})
        sampling_rate = sample['sampling_rate']
        # Post-processing is currently disabled; the raw pipeline output is saved as-is.
        # actual_audio: np.ndarray = postprocess_sample(sample['audio'], sampling_rate)
        scipy.io.wavfile.write(filename, rate=sampling_rate, data=sample['audio'])
        return filename, Audio(filename=str(filename), rate=sampling_rate)

    print('Set up music runner')
    return inner


In [3]:
# Build the generator with has_low_memory=True (the 'facebook/musicgen-small' configuration).
music_generator = get_music_runner(has_low_memory=True)
# Generate one clip for the prompt; the call returns the saved WAV path and an Audio object.
save_path, audio_embed = music_generator('Create a pop song in the style of Mexican cumbia')
print(f'Your song should also be located at {save_path}')
# Leaving the Audio object as the cell's last expression displays it in the notebook.
audio_embed

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

Set up music runner




Your song should also be located at /home/duke_trystan/PycharmProjects/Bass-Bot/notebooks/data/20250505-230058create_a_pop_song_in.wav


The DPO (Direct Preference Optimization) algorithm has not been implemented yet. However, the Tango 2 training script is a useful reference for implementing it: https://github.com/declare-lab/tango/blob/master/tango2/tango2-train.py