In [1]:
###### IMPORT ######
# PyTorch: used below to pick the compute device and as the tensor format returned by the processor.
import torch 
# Hugging Face classes that load the MusicGen text processor and the generation model.
from transformers import AutoProcessor, MusicgenForConditionalGeneration
# SciPy's WAV writer, used to store each generated clip on disk.
import scipy.io.wavfile
# Standard library: output-path handling and warning control.
import os
import warnings
# NOTE(review): with no category/module filter this silences EVERY warning for the rest of the
# session (including deprecation notices from the imported libraries). Consider narrowing it,
# e.g. by passing `category=` or `module=`, once the noisy warning has been identified.
warnings.filterwarnings("ignore")

In [2]:
###### VERIFYING IF GPU IS AVAILABLE ######
# Prefer the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
# `device` is reused by later cells to place the model and its inputs.
if torch.cuda.is_available():
    device = 'cuda:0'
    print(f'your device is {device}')
else:
    device = 'cpu'
    # Generation on CPU works but is noticeably slower, hence the heads-up message.
    print(f'your device is {device}: it may takes more time, be patient😁')

your device is cpu: it may takes more time, be patient😁


In [None]:
###### LOADING PROCESSOR AND MODEL FROM HUGGING FACE🤗 ######
# A single checkpoint id feeds both loaders, so the processor and the model can never
# end up pointing at two different checkpoints after an edit.
model_id = "facebook/musicgen-small"

# The processor turns text prompts into the tensors the model expects.
processor = AutoProcessor.from_pretrained(model_id)

# The model itself; weights are downloaded on first use and cached by the library afterwards.
model = MusicgenForConditionalGeneration.from_pretrained(model_id)

# Move the weights to the device chosen earlier. Written as an assignment (`.to()` returns the
# module) so that Jupyter does not echo the full model repr as the cell's output.
model = model.to(device)

In [4]:
###### DEFINING PROMPTS ######
# One text description per audio clip to generate; the generation loop walks this list in
# order and uses each prompt (truncated) as part of the output file name.
prompts = [
    'A 90s rock song with a distorted guitar and a fast drum beat',  # sample 1
    'A calm relaxing lofi hip hop beat for studying',                # sample 2
    'Epic cinematic music for a fantasy movie trailer',              # sample 3
    'A simple melancholic piano solo',                               # sample 4
]

In [5]:
###### SETTING AUDIO DURATION ######
# MusicGen relies on an encoder-decoder codec called EnCodec, which compresses audio into a
# discrete sequence of tokens and back. According to the official MusicGen documentation,
# EnCodec operates at 50 Hz, i.e. the model produces 50 tokens for every second of audio.
# The requested length in seconds is therefore converted into a token budget below.
duration_in_seconds = 10
duration_in_tokens = int(50 * duration_in_seconds)  # 50 tokens per second of audio

In [6]:
###### DEFINING THE DIRECTORY WHERE WE'RE GOING TO SAVE THE GENERATED AUDIOS ######
# Destination folder for the generated .wav files (relative to the current working directory).
audios_dir = '../data/generated/'

# Create the folder, including missing parents. With exist_ok=True an already existing
# directory is not an error, so this cell can be re-run safely.
os.makedirs(name=audios_dir, exist_ok=True)

In [8]:
###### CREATING AND SAVING AUDIOS ######
def _prompt_to_filename(prompt, max_length=20):
    """Turn a free-text prompt into a short, filesystem-safe file-name fragment.

    The prompt is lower-cased, every character that is not an ASCII letter or digit
    (spaces, punctuation, path separators, ...) is replaced by "_", and the result is
    truncated to ``max_length`` characters.

    For prompts made only of ASCII letters, digits, spaces and underscores the result is
    identical to ``prompt.lower().replace(" ", "_")[:max_length]``.
    """
    safe_chars = (ch if (ch.isascii() and ch.isalnum()) else "_" for ch in prompt.lower())
    return "".join(safe_chars)[:max_length]


# Base seed for the random number generator. Per the Hugging Face MusicGen docs, generate()
# samples tokens stochastically by default, so without a seed every run yields different audio.
generation_seed = 42

# The sampling rate is a property of the model's audio codec, not of an individual sample, so
# it is read once here instead of on every iteration. It is the rate at which the generated
# waveform must be interpreted (32 kHz for musicgen-small).
sampling_rate = model.config.audio_encoder.sampling_rate

for i, prompt in enumerate(prompts):
    print(f'generating sample {i+1}/{len(prompts)}...')

    # Re-seed per sample so that each clip is reproducible on its own, regardless of which
    # prompts were generated before it.
    torch.manual_seed(generation_seed + i)

    # processing prompts
    inputs = processor(
        text=[prompt],        # the text to encode, wrapped in a list (batch of one)
        padding=True,         # no effect for a single prompt; pads a multi-prompt batch to a
                              # common length so a rectangular tensor can be returned
        return_tensors='pt',  # return PyTorch tensors
    ).to(device)

    # generating audios
    audio_values = model.generate(**inputs, max_new_tokens=duration_in_tokens)

    # preparing audios for saving
    # Take the first (only) batch item and its first channel explicitly instead of relying on
    # squeeze(), then convert to the NumPy array SciPy expects. Each value is the amplitude of
    # the waveform at one point in time.
    audio_array = audio_values[0, 0].cpu().numpy()

    # saving audios
    prompt_name_as_file = _prompt_to_filename(prompt)
    output_path = os.path.join(audios_dir, f"sample_{i+1}_{prompt_name_as_file}.wav")

    # A .wav file stores the samples together with the information needed to interpret them,
    # including the sampling rate: a player reads that many values per second (e.g. 32,000)
    # and sends them to the speaker.
    scipy.io.wavfile.write(output_path, rate=sampling_rate, data=audio_array)
    print(f'sample {i+1} generated and saved succesfully in {output_path}!\n')

generating sample 1/4...
sample 1 generated and saved succesfully in ../data/generated/sample_1_a_90s_rock_song_with.wav!

generating sample 2/4...
sample 2 generated and saved succesfully in ../data/generated/sample_2_a_calm_relaxing_lofi.wav!

generating sample 3/4...
sample 3 generated and saved succesfully in ../data/generated/sample_3_epic_cinematic_music.wav!

generating sample 4/4...
sample 4 generated and saved succesfully in ../data/generated/sample_4_a_simple_melancholic.wav!

