In [None]:
import subprocess

try:
    result = subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    print(result.stdout)
except FileNotFoundError:
    print("FFmpeg is not accessible from Python.")


In [5]:
from audioldm import text_to_audio, build_model

# Load AudioLDM
def load_audioldm_model():
    model = build_model(model_name="audioldm-s-full")
    return model

# Process Audio + Text Input
def process_audio_text(input_audio, text_prompt, duration, guidance_scale, random_seed):
    # Load AudioLDM model
    audioldm = load_audioldm_model()

    # Generate audio based on the text prompt
    print("Generating audio from text prompt...")
    generated_audio = text_to_audio(
        latent_diffusion=audioldm,
        text=text_prompt,
        seed=random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
    )[0]  # Take the first candidate

    return generated_audio

prompt = "A calm and peaceful music track with a lofi beats."

# Generate audio
generated_audio = process_audio_text(input_audio=None, text_prompt=prompt, duration=15, guidance_scale=2.5, random_seed=42)


Load AudioLDM: %s audioldm-s-full
DiffusionWrapper has 185.04 M params.


  WeightNorm.apply(module, name, dim)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

Generating audio from text prompt...
Generate audio using text A calm and peaceful music track with a lofi beats.


DDIM Sampler: 100%|██████████| 200/200 [00:50<00:00,  3.98it/s]


In [3]:
generated_audio

array([[0.00018955, 0.00016649, 0.00017258, ..., 0.00017345, 0.00022982,
        0.00013171]], dtype=float32)

In [6]:
# Play `generated_audio`

import IPython.display as ipd

ipd.Audio(generated_audio, rate=22050)  # Load a NumPy array



In [8]:
# Inspect model:

audioldm = load_audioldm_model()

audioldm

Load AudioLDM: %s audioldm-s-full
DiffusionWrapper has 185.04 M params.


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LatentDiffusion(
  (model): DiffusionWrapper(
    (diffusion_model): UNetModel(
      (time_embed): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): SiLU()
        (2): Linear(in_features=512, out_features=512, bias=True)
      )
      (film_emb): Linear(in_features=512, out_features=512, bias=True)
      (input_blocks): ModuleList(
        (0): TimestepEmbedSequential(
          (0): Conv2d(8, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (1-2): 2 x TimestepEmbedSequential(
          (0): ResBlock(
            (in_layers): Sequential(
              (0): GroupNorm32(32, 128, eps=1e-05, affine=True)
              (1): SiLU()
              (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (h_upd): Identity()
            (x_upd): Identity()
            (emb_layers): Sequential(
              (0): SiLU()
              (1): Linear(in_features=1024, out_features=128, bias=Tr

In [10]:
from pprint import pprint

pprint(audioldm.__dict__)    

{'_backward_hooks': OrderedDict(),
 '_backward_pre_hooks': OrderedDict(),
 '_buffers': {'alphas_cumprod': tensor([9.9850e-01, 9.9699e-01, 9.9548e-01, 9.9397e-01, 9.9244e-01, 9.9092e-01,
        9.8938e-01, 9.8784e-01, 9.8630e-01, 9.8475e-01, 9.8320e-01, 9.8163e-01,
        9.8007e-01, 9.7850e-01, 9.7692e-01, 9.7534e-01, 9.7375e-01, 9.7216e-01,
        9.7056e-01, 9.6896e-01, 9.6735e-01, 9.6573e-01, 9.6411e-01, 9.6249e-01,
        9.6086e-01, 9.5922e-01, 9.5758e-01, 9.5594e-01, 9.5428e-01, 9.5263e-01,
        9.5097e-01, 9.4930e-01, 9.4763e-01, 9.4595e-01, 9.4427e-01, 9.4258e-01,
        9.4089e-01, 9.3920e-01, 9.3749e-01, 9.3579e-01, 9.3407e-01, 9.3236e-01,
        9.3064e-01, 9.2891e-01, 9.2718e-01, 9.2544e-01, 9.2370e-01, 9.2195e-01,
        9.2020e-01, 9.1845e-01, 9.1669e-01, 9.1492e-01, 9.1315e-01, 9.1138e-01,
        9.0960e-01, 9.0781e-01, 9.0603e-01, 9.0423e-01, 9.0244e-01, 9.0063e-01,
        8.9883e-01, 8.9701e-01, 8.9520e-01, 8.9338e-01, 8.9155e-01, 8.8972e-01,
        8.8789

In [12]:
from audioldm import text_to_audio, build_model
import librosa
import soundfile as sf
import numpy as np
from pathlib import Path

# Load AudioLDM Model
def load_audioldm_model():
    return build_model(model_name="audioldm-s-full")

# Process Style Transformation
def style_transform(input_audio_path, text_prompt, duration, guidance_scale, random_seed):
    # Step 1: Generate styled audio with AudioLDM
    print("Loading AudioLDM model...")
    audioldm = load_audioldm_model()
    
    print("Generating audio in requested style...")
    generated_audio = text_to_audio(
        latent_diffusion=audioldm,
        text=text_prompt,
        seed=random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
    )[0]  # First candidate

    # Step 2: Load original `other` audio
    print("Loading original audio...")
    original_audio, sr = librosa.load(input_audio_path, sr=16000)

    # Step 3: Align audio durations
    print("Aligning durations...")
    generated_audio = librosa.util.fix_length(generated_audio, len(original_audio))

    # Step 4: Blend original and generated audio
    print("Blending audio...")
    blended_audio = (original_audio + generated_audio) / 2  # Simple average blend

    # Save the blended audio
    output_path = "styled_audio.wav"
    sf.write(output_path, blended_audio, sr)
    return output_path

# Hardcoded values
if __name__ == "__main__":
    # Input parameters
    input_audio_path = Path("./audio_processing/output_stems/htdemucs_ft/input_5/other.mp3")
    text_prompt = "convert to jazz style" 
    duration = 5  # Duration in seconds
    guidance_scale = 2.5  # Guidance scale for AudioLDM
    random_seed = 42  # Random seed for reproducibility

    # Run style transformation
    print("Starting style transformation...")
    output_audio_path = style_transform(input_audio_path, text_prompt, duration, guidance_scale, random_seed)
    print(f"Styled audio saved to: {output_audio_path}")


Starting style transformation...
Loading AudioLDM model...
Load AudioLDM: %s audioldm-s-full
DiffusionWrapper has 185.04 M params.


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating audio in requested style...
Generate audio using text convert to jazz style


DDIM Sampler: 100%|██████████| 200/200 [04:27<00:00,  1.34s/it]


Loading original audio...


FileNotFoundError: [Errno 2] No such file or directory: 'audio_processing\\output_stems\\htdemucs_ft\\input_5\\other.mp3'

In [None]:
    output_audio_path = style_transform(input_audio_path, text_prompt, duration, guidance_scale, random_seed)
    print(f"Styled audio saved to: {output_audio_path}")