# AI Music Generation

Hugging Face has released their music generation pipeline — Dance Diffusion. It can be seen [here](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines) along with the other Hugging Face diffusers pipelines.

This notebook experiments with Dance Diffusion and attempts to generate music from Stable Diffusion generated images — either directly through the same prompt, or indirectly by passing the image through CLIP (or some other component) to get captions/text descriptions back from the image and then passing those on to Dance Diffusion.

In [None]:
# Requirements for those who need this - uncomment below line(s) to run
# %pip install torch scipy diffusers tqdm transformers

In [2]:
import gc
import IPython.display as ipd
import os
import random
import torch
import scipy.io.wavfile
from tqdm.auto import tqdm
from transformers import logging
from typing import Tuple
from datetime import datetime
from diffusers import DanceDiffusionPipeline, UNet1DModel, IPNDMScheduler

# Suppress some unnecessary warnings when loading the CLIPTextModel
logging.set_verbosity_error()

# Pick the compute device: CUDA first, then Apple's MPS backend, then CPU.
# `torch.backends.mps.is_available()` is used instead of the deprecated
# `torch.has_mps`, which only says whether PyTorch was built with MPS support,
# not whether an MPS device can actually be used on this machine.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
# Config info: folder (relative to the working directory) that receives the WAV files
outdir = 'audio'

NOTE: Redirects are currently not supported in Windows or MacOs.


In [3]:
# Build the Dance Diffusion pipeline from a pre-trained checkpoint.
# Other pre-trained checkpoints that can be swapped in:
#   'harmonai/maestro-150k', 'harmonai/jmann-small-190k', 'harmonai/honk-140k',
#   'harmonai/unlocked-250k', 'harmonai/jmann-large-580k', 'harmonai/glitch-440k'
pipe = DanceDiffusionPipeline.from_pretrained('harmonai/maestro-150k')
# Move the loaded pipeline onto the selected compute device
pipe = pipe.to(device)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Generate music
def generate(seed: int = None, steps: int = 100, length: float = 3):
    """Generate one audio clip with the Dance Diffusion pipeline.

    Args:
        seed: Seed for the random generator. When ``None`` a random seed is
            drawn; it is printed and returned so the run can be reproduced.
        steps: Number of denoising steps, forwarded to the pipeline as
            ``num_inference_steps``.
        length: Requested clip length in seconds, forwarded to the pipeline
            as ``audio_length_in_s``.

    Returns:
        An ``(audio, seed)`` tuple: the first entry of the pipeline's
        ``audios`` output and the seed that was used.
    """
    if seed is None:
        seed = random.randrange(2 ** 32 - 1)
    print(f'Seed: {seed}')
    generator = torch.Generator().manual_seed(seed)
    result = pipe(
        generator=generator,
        num_inference_steps=steps,
        audio_length_in_s=length,
    )
    return result.audios[0], seed

# Save file and metadata
def save(audio, seed: int):
    """Write ``audio`` to a timestamped WAV file inside ``outdir``.

    Args:
        audio: Audio data exposing ``transpose()``; it is transposed before
            being written (presumably (channels, samples) as returned by the
            pipeline - TODO confirm).
        seed: Seed used for generation; embedded in the file name.

    Returns:
        The path of the file that was written.
    """
    dtstr = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    name = f'{dtstr}_{seed}.wav'
    # Create the output folder on first use; writing fails if it is missing
    os.makedirs(outdir, exist_ok=True)
    path = os.path.join(outdir, name)
    # Save file, using the sample rate recorded in the model configuration
    scipy.io.wavfile.write(path, pipe.unet.config.sample_rate, audio.transpose())
    # Setting metadata didn't work with mutagen - reference: https://mutagen.readthedocs.io/en/latest/user/id3.html
    return path

In [5]:
# Create audio: generate a clip with a random seed and write it to disk
audio, seed = generate()
file = save(audio, seed)
# Kept as the last expression of the cell so the notebook displays the audio player
ipd.Audio(file)

Seed: 1092667905


  0%|          | 0/100 [00:00<?, ?it/s]

  timestep_index = (self.timesteps == timestep).nonzero().item()


Let's try handling the audio generation in a more hands-on fashion using a unet and a scheduler instead of relying on the diffusers pipeline.

In [6]:
# Checkpoint used for the hands-on (pipeline-free) generation below
model = 'harmonai/maestro-150k'
# Load only the unet weights from the checkpoint's "unet" subfolder ...
unet = UNet1DModel.from_pretrained(model, subfolder="unet")
# ... and move them onto the selected compute device
unet = unet.to(device)
# Scheduler that drives the denoising loop
scheduler = IPNDMScheduler(num_train_timesteps=100)

# Save file and metadata
def save(audio, seed: int):
    """Write ``audio`` to a timestamped WAV file inside ``outdir``.

    Args:
        audio: Audio data exposing ``transpose()``; it is transposed before
            being written.
        seed: Seed used for generation; embedded in the file name.

    Returns:
        The path of the file that was written.
    """
    dtstr = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    name = f'{dtstr}_{seed}.wav'
    # Create the output folder on first use; writing fails if it is missing
    os.makedirs(outdir, exist_ok=True)
    path = os.path.join(outdir, name)
    # Save file, using the sample rate recorded in the model configuration
    scipy.io.wavfile.write(path, unet.config.sample_rate, audio.transpose())
    # Setting metadata didn't work with mutagen - reference: https://mutagen.readthedocs.io/en/latest/user/id3.html
    return path

In [7]:
# Audio generator
def generate(steps: int = 100, seed: int = None, length: float = None) -> Tuple:
    """Generate audio by running the denoising loop by hand with ``unet`` and ``scheduler``.

    Args:
        steps: Number of scheduler timesteps to run.
        seed: Seed for the noise generator; a random one is drawn when ``None``.
        length: Requested clip length in seconds. When ``None`` the length is
            derived from the unet config (``sample_size / sample_rate``).

    Returns:
        An ``(audio, seed)`` tuple. ``audio`` is a NumPy array clamped to
        [-1, 1] and cut back to the requested number of samples along its
        last axis; it keeps the leading batch axis of size 1 from the initial
        noise. ``seed`` is the seed that was used.

    Raises:
        ValueError: If ``length`` yields fewer than ``3 * down_scale_factor``
            samples, which is too short for the model.
    """
    # Generator
    if seed is None:
        seed = random.randrange(2 ** 32 - 1)
    generator = torch.Generator().manual_seed(seed)

    # Set length
    sample_rate = unet.config.sample_rate
    if length is None:
        length = unet.config.sample_size / sample_rate

    sample_size = int(length * sample_rate)
    down_scale_factor = 2 ** len(unet.up_blocks)
    if sample_size < 3 * down_scale_factor:
        # Raise instead of calling exit(): exiting would shut down the whole
        # interpreter/kernel rather than report a bad argument to the caller.
        raise ValueError(
            f"{length} is too small. Make sure it's bigger or equal to {3 * down_scale_factor / sample_rate}."
        )

    # The sample count must be a multiple of the down-scale factor: round up
    # for the denoising run and trim back to the requested size afterwards.
    original_sample_size = sample_size
    if sample_size % down_scale_factor != 0:
        sample_size = int(((length * sample_rate) // down_scale_factor + 1) * down_scale_factor)
        print(
            f"{length} is increased to {sample_size / sample_rate} so that it can be handled"
            f" by the model. It will be cut to {original_sample_size / sample_rate} after the denoising"
            " process.")

    # Set step values
    scheduler.set_timesteps(steps, device=device)
    # Initial noise, drawn on the CPU generator and then moved to the device
    audio = torch.randn((1, unet.config.in_channels, sample_size), generator=generator).to(device)
    # Loop through time steps; iterating the timesteps directly (no enumerate)
    # lets tqdm see the length and show real progress instead of "0it".
    with torch.no_grad():
        for t in tqdm(scheduler.timesteps):
            # Predict noise model_output
            pred = unet(audio, t).sample
            # Get previous sample
            audio = scheduler.step(pred, t, audio).prev_sample

    audio = audio.clamp(-1, 1).float().cpu().detach().numpy()
    audio = audio[:, :, :original_sample_size]
    return audio, seed

In [8]:
# Generate a clip with a requested length of 3 using the manual denoising loop, then save it
audio, seed = generate(length=3)
file = save(audio, seed)
# Run a garbage-collection pass before displaying the result
gc.collect()
# Kept as the last expression of the cell so the notebook displays the audio player
ipd.Audio(file)

3 is increased to 3.072 so that it can be handled by the model. It will be cut to 3.0 after the denoising process.


0it [00:00, ?it/s]

Can we now add a text prompt to the mix so that we can generate music based on text input?

Trouble is, the `UNet1DModel` does not appear to have a way at present to take a text prompt, or even a text embedding and then use that to generate music. So how do we go about supporting text embeddings?