In [1]:
!pip install git+https://github.com/suno-ai/bark.git

Collecting git+https://github.com/suno-ai/bark.git
  Cloning https://github.com/suno-ai/bark.git to /tmp/pip-req-build-9unawq_z
  Running command git clone --filter=blob:none --quiet https://github.com/suno-ai/bark.git /tmp/pip-req-build-9unawq_z
  Resolved https://github.com/suno-ai/bark.git to commit f4f32d4cd480dfec1c245d258174bc9bde3c2148
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting encodec (from suno-bark==0.0.1a0)
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: suno-bark, encodec
  Building wheel for suno-bark (pyproject.toml) ... [?25l[?25hdone
  Created wheel for suno-bark: filename=suno_bark-0.0.1a0-py3-none-any.

In [None]:
import gradio as gr
from transformers import AutoProcessor, MusicgenForConditionalGeneration, BarkModel
from transformers import AutoProcessor as BarkProcessor
import torch
import scipy.io.wavfile as wavfile
import numpy as np
import librosa
import tempfile
import gc

# ==========================================
#   HELPER: VRAM CLEANER
# ==========================================
def flush():
    """Free memory that is no longer referenced, between pipeline stages.

    Python's garbage collector runs first so objects that were just dropped
    are really destroyed; PyTorch is then asked to empty its CUDA cache.
    """
    # Order matters: collect garbage before asking PyTorch to empty its cache.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

# ==========================================
#   STEP 1: GENERATE VOCALS (BARK)
# ==========================================
def step_1_vocals(lyrics_text, voice_preset):
    """Synthesize a vocal track for the given lyrics with Bark (``suno/bark-small``).

    The processor and model are loaded on every call and dropped again before
    returning, so the weights are not kept around while the next stage runs.

    Args:
        lyrics_text: Lyrics to perform. They are wrapped in ♪ markers before
            being handed to the processor (intended to trigger singing mode).
        voice_preset: Bark speaker preset id, e.g. ``"v2/en_speaker_6"``.

    Returns:
        Tuple ``(audio_array, sr)``: the generated audio as a squeezed NumPy
        array, and the sample rate read from the model's generation config.
    """
    print(f"--- [1/3] Starting Vocals Generation (Voice: {voice_preset}) ---")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Pre-bind the names so the `finally` block can always delete them,
    # even when loading fails part-way through.
    processor = model = inputs = None
    try:
        # Load Bark (small version is safer for VRAM)
        processor = BarkProcessor.from_pretrained("suno/bark-small")
        model = BarkModel.from_pretrained("suno/bark-small").to(device)

        # Add music notes to the prompt to trigger singing mode
        formatted_prompt = f"♪ {lyrics_text} ♪"
        inputs = processor(formatted_prompt, voice_preset=voice_preset).to(device)

        # Generate and move the result to host memory
        audio_array = model.generate(**inputs)
        audio_array = audio_array.cpu().numpy().squeeze()

        # Read the sample rate while the model is still alive
        sr = model.generation_config.sample_rate

        return audio_array, sr
    finally:
        # Runs on success *and* on failure (e.g. an out-of-memory error inside
        # generate), so a crashed request does not keep the model referenced
        # through this frame's locals.
        del model, processor, inputs
        flush()

# ==========================================
#   STEP 2: GENERATE MUSIC (MUSICGEN SMALL)
# ==========================================
def step_2_music(style_text):
    """Generate an instrumental track from a text description with MusicGen.

    Uses ``facebook/musicgen-small`` with text conditioning only. The processor
    and model are loaded on every call and dropped again before returning.

    Args:
        style_text: Free-text description of the desired music style.

    Returns:
        Tuple ``(music_array, sampling_rate)``: the first generated sample as a
        squeezed NumPy array, and the sampling rate reported by the model's
        audio encoder config (32000 for ``musicgen-small``).
    """
    print("--- [2/3] Starting Music Generation (No Audio Conditioning) ---")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Pre-bind the names so the `finally` block can always delete them,
    # even when loading fails part-way through.
    processor = model = inputs = None
    try:
        # Using 'musicgen-small'
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to(device)

        inputs = processor(
            text=[style_text],
            padding=True,
            return_tensors="pt",
        ).to(device)

        # Generate
        audio_values = model.generate(
            **inputs,
            do_sample=True,
            guidance_scale=3,
            max_new_tokens=768,
        )

        # Only one prompt was submitted, so take the first item of the batch.
        music_array = audio_values[0].cpu().numpy().squeeze()

        # Ask the model for its output rate instead of hard-coding it, the same
        # way the vocals step reads its rate from the Bark model.
        sampling_rate = model.config.audio_encoder.sampling_rate

        return music_array, sampling_rate
    finally:
        # Runs on success *and* on failure, so a crashed request does not keep
        # the model referenced through this frame's locals.
        del model, processor, inputs
        flush()

# ==========================================
#   STEP 3: MIXER & MAIN PIPELINE
# ==========================================
def _peak_normalize(signal):
    """Scale `signal` so its largest absolute sample is 1.0; silent or empty input is returned as is."""
    if signal.size == 0:
        return signal
    peak = np.abs(signal).max()
    return signal / peak if peak > 0 else signal


def full_song_pipeline(lyrics, style, voice_choice, progress=gr.Progress()):
    """Generate vocals and backing music, mix them, and write the result to a WAV file.

    Args:
        lyrics: Lyrics text passed to the vocals step.
        style: Music style description passed to the music step.
        voice_choice: ``"Female"`` selects the female preset; any other value
            falls back to the default male preset.
        progress: Gradio progress tracker used to report stage updates.

    Returns:
        Path of a temporary ``.wav`` file (16-bit PCM) containing the mix. The
        file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: Wraps any exception raised by a stage so it is reported
            through Gradio; the original exception is chained as the cause.
    """
    try:
        # Map the radio choice to a Bark preset id (male is the default).
        selected_voice = "v2/en_speaker_9" if voice_choice == "Female" else "v2/en_speaker_6"

        progress(0.1, desc="Warming up the singer (Bark)...")

        # 1. Generate vocals with the selected voice
        vocals_raw, vocals_sr = step_1_vocals(lyrics, selected_voice)

        progress(0.4, desc="The band is playing (MusicGen Small)...")

        # 2. Generate music
        music_raw, music_sr = step_2_music(style)

        progress(0.8, desc="Mixing final track...")

        # 3. Mixing. The music track's rate is the target rate, so the vocals
        # are resampled to whatever the music step actually reported instead
        # of an assumed constant.
        if vocals_sr != music_sr:
            vocals = librosa.resample(vocals_raw, orig_sr=vocals_sr, target_sr=music_sr)
        else:
            vocals = vocals_raw

        # Zero-pad the shorter track at the end so both have the same length.
        max_len = max(len(vocals), len(music_raw))
        vocals = _peak_normalize(np.pad(vocals, (0, max_len - len(vocals))))
        music = _peak_normalize(np.pad(music_raw, (0, max_len - len(music_raw))))

        # Both tracks peak at 1.0 and the weights sum to 1.0, so the mix stays
        # within [-1, 1] before conversion to 16-bit PCM.
        mixed_audio = (vocals * 0.6) + (music * 0.4)
        mixed_audio = (mixed_audio * 32767).astype(np.int16)

        # Reserve a unique path, then close the handle *before* writing:
        # re-opening a still-open temporary file by name fails on some platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name
        wavfile.write(output_path, music_sr, mixed_audio)
        return output_path

    except Exception as e:
        raise gr.Error(f"Pipeline crashed: {str(e)}") from e

# ==========================================
#   GRADIO INTERFACE
# ==========================================
# Custom CSS handed to gr.Blocks below.
# NOTE(review): no component in this cell is given the "container" class, so this
# rule may have no visible effect — confirm or attach it via elem_classes.
css = """
.container {max-width: 800px; margin: auto; padding-top: 20px}
"""

# Build the UI: a group of inputs (voice, lyrics, style), a button, and an audio output.
with gr.Blocks(css=css, title="One-Click Music Generator") as demo:
    gr.Markdown("# AI Song Maker")
    gr.Markdown("Enter lyrics and style.")
    
    with gr.Group():
        # Voice selection: full_song_pipeline maps "Female" to one Bark preset
        # and every other value to the default male preset.
        voice_btn = gr.Radio(
            choices=["Male", "Female"], 
            value="Male", 
            label="Singer Voice",
            interactive=True
        )

        # Lyrics box, pre-filled with a sample so the demo can be run immediately.
        lyrics_input = gr.Textbox(
            label="Lyrics", 
            placeholder="Type your song lyrics here...", 
            lines=3,
            value="I’m focused.\nLocked in.\nNo breaks, NO SLEEP.\nno fear.\nNo Fear.\nI stay on time."
        )
        
        # Free-text style description; passed to the music generation step.
        style_input = gr.Textbox(
            label="Music Style", 
            placeholder="E.g., Jazz, Rock, Synthwave...", 
            value="Modern motivational rap track, medium-fast tempo around 100 BPM, confident and determined mood."
        )
        
        generate_btn = gr.Button("Generate Full Song", variant="primary", size="lg")
    
    # The pipeline returns a path to a WAV file, hence type="filepath".
    output_audio = gr.Audio(label="Your Generated Song", type="filepath")
    
    # Wire the button to the pipeline. The order of `inputs` must match the
    # positional parameters of full_song_pipeline(lyrics, style, voice_choice).
    generate_btn.click(
        fn=full_song_pipeline,
        inputs=[lyrics_input, style_input, voice_btn], # voice choice is the third argument
        outputs=[output_audio]
    )

print("Launching App...")
# share=True: the captured output below shows a public gradio.live URL next to the local one.
demo.launch(share=True, debug=True)

2025-12-21 17:40:13.304858: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766338813.469940      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766338813.510554      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766338813.856615      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766338813.856649      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766338813.856652      55 computation_placer.cc:177] computation placer alr

Launching App...
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://75c1b211d00377fb18.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


--- [1/3] Starting Vocals Generation ---


tokenizer_config.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

speaker_embeddings_path.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

speaker_embeddings/v2/en_speaker_6_seman(…):   0%|          | 0.00/2.60k [00:00<?, ?B/s]

speaker_embeddings/v2/en_speaker_6_coars(…):   0%|          | 0.00/7.55k [00:00<?, ?B/s]

speaker_embeddings/v2/en_speaker_6_fine_(…):   0%|          | 0.00/15.0k [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


--- [2/3] Starting Music Generation (No Audio Conditioning) ---


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

--- [1/3] Starting Vocals Generation ---


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


--- [2/3] Starting Music Generation (No Audio Conditioning) ---
--- [1/3] Starting Vocals Generation ---


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


--- [2/3] Starting Music Generation (No Audio Conditioning) ---
--- [1/3] Starting Vocals Generation ---


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


--- [2/3] Starting Music Generation (No Audio Conditioning) ---
--- [1/3] Starting Vocals Generation ---


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


--- [2/3] Starting Music Generation (No Audio Conditioning) ---
--- [1/3] Starting Vocals Generation ---


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


--- [2/3] Starting Music Generation (No Audio Conditioning) ---
--- [1/3] Starting Vocals Generation ---


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


--- [1/3] Starting Vocals Generation ---


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


--- [2/3] Starting Music Generation (No Audio Conditioning) ---
--- [1/3] Starting Vocals Generation ---


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


--- [2/3] Starting Music Generation (No Audio Conditioning) ---
