In [1]:
# install bark as well as pytorch nightly to get blazing fast flash-attention
!pip install git+https://github.com/suno-ai/bark.git && \
  pip uninstall -y torch torchvision torchaudio && \
  pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
!pip install pydub
!git clone https://github.com/DrewThomasson/bark.git

Collecting git+https://github.com/suno-ai/bark.git
  Cloning https://github.com/suno-ai/bark.git to /tmp/pip-req-build-adlmi1kf
  Running command git clone --filter=blob:none --quiet https://github.com/suno-ai/bark.git /tmp/pip-req-build-adlmi1kf
  Resolved https://github.com/suno-ai/bark.git to commit 773624d26db84278a55aacae9a16d7b25fbccab8
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting boto3 (from suno-bark==0.0.1a0)
  Downloading boto3-1.28.61-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting encodec (from suno-bark==0.0.1a0)
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[

In [8]:
#bark genreate audiobook gui
import csv
import nltk
import numpy as np
import os
import subprocess
from bark.generation import generate_text_semantic, preload_models
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE
from IPython.display import display, Audio, clear_output
from scipy.io.wavfile import write as write_wav
import ipywidgets as widgets

# Download and setup
nltk.download('punkt')
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
preload_models()

# Function to get available voice actors
def get_voice_actors():
    result = subprocess.run(
        ["python", "/content/bark/bark_perform.py", "--list_speakers"],
        stdout=subprocess.PIPE,
        text=True
    )

    if result.returncode != 0:
        print("Failed to fetch voice actors!")
        return []

    lines = result.stdout.splitlines()
    voice_actors = []
    for line in lines:
        if line.strip() and not line.startswith(" "):
            continue
        split_line = line.strip().split()
        if split_line:
            voice_actor = split_line[0]  # Extracting voice actor's name
            voice_actors.append(f"v2/{voice_actor}")

    return voice_actors


# Populate voice_actors using the function
voice_actors = get_voice_actors()

# Constants
GEN_TEMP = 0.6
silence = np.zeros(int(0.25 * SAMPLE_RATE))

def generate_long_form_audio(text, speaker):
    sentences = nltk.sent_tokenize(text)
    pieces = []
    if len(text.split()) > 512:
        for sentence in sentences:
            semantic_tokens = generate_text_semantic(
                sentence,
                history_prompt=speaker,
                temp=GEN_TEMP,
                min_eos_p=0.05,
            )
            audio_array = semantic_to_waveform(semantic_tokens, history_prompt=speaker)
            pieces += [audio_array, silence.copy()]
    else:
        for sentence in sentences:
            audio_array = generate_audio(sentence, history_prompt=speaker)
            pieces += [audio_array, silence.copy()]
    return np.concatenate(pieces)

def get_unique_speakers():
    speakers = set()
    with open("book.csv", "r") as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for row in csv_reader:
            speaker_name = row[3]
            speakers.add(speaker_name)
    return list(speakers)

def generate_audio_files():
    with open("book.csv", "r") as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for idx, row in enumerate(csv_reader):
            text = row[0]
            speaker_name = row[3]
            if speaker_name not in speakers_dict:
                continue
            speaker = speakers_dict[speaker_name]
            print(f"Generating audio for the text: {text}")
            audio_data = generate_long_form_audio(text, speaker)

            file_name = f"audio_{idx}"
            write_wav(f"{file_name}.wav", SAMPLE_RATE, audio_data)
    messagebox.showinfo("Info", "Finished generating audio files!")

# Callbacks
def on_voice_actor_dropdown_change(change, speaker_name):
    if change['name'] == 'value' and (change['new'] is not None):
        speakers_dict[speaker_name] = change['new']

def on_generate_audio_button_click(b):
    generate_audio_files()
    with output:
        clear_output(wait=True)
        print("Finished generating audio files!")

speakers = get_unique_speakers()
speakers_dict = {}

# Create dynamic dropdowns for each speaker
dropdowns = []
for speaker in speakers:
    dropdown = widgets.Dropdown(options=voice_actors, description=speaker)
    dropdown.observe(lambda change, speaker=speaker: on_voice_actor_dropdown_change(change, speaker))
    dropdowns.append(dropdown)

generate_audio_button = widgets.Button(description="Generate Audio Files")
generate_audio_button.on_click(on_generate_audio_button_click)

output = widgets.Output()

# Display dropdowns and button
for dropdown in dropdowns:
    display(dropdown)
display(generate_audio_button, output)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dropdown(description="Soren's father", options=('v2/Polish:', 'v2/pl_speaker_0', 'v2/pl_speaker_1', 'v2/pl_spe…

Dropdown(description='Noctus', options=('v2/Polish:', 'v2/pl_speaker_0', 'v2/pl_speaker_1', 'v2/pl_speaker_2',…

Dropdown(description='Kludd', options=('v2/Polish:', 'v2/pl_speaker_0', 'v2/pl_speaker_1', 'v2/pl_speaker_2', …

Dropdown(description="Soren's mother", options=('v2/Polish:', 'v2/pl_speaker_0', 'v2/pl_speaker_1', 'v2/pl_spe…

Dropdown(description='Narrator', options=('v2/Polish:', 'v2/pl_speaker_0', 'v2/pl_speaker_1', 'v2/pl_speaker_2…

Dropdown(description='Soren', options=('v2/Polish:', 'v2/pl_speaker_0', 'v2/pl_speaker_1', 'v2/pl_speaker_2', …

Dropdown(description='Mrs. Plithiver', options=('v2/Polish:', 'v2/pl_speaker_0', 'v2/pl_speaker_1', 'v2/pl_spe…

Button(description='Generate Audio Files', style=ButtonStyle())

Output()

Generating audio for the text: The Capture,
Guardians of Ga’hoole,
By Kathryn Lasky,
CHAPTER ONE,
A Nest Remembered,



Noctus, can you spare a bit more down, darling? I think our third little one is about to arrive. That egg is beginning to crack.”


 26%|██▋       | 202/768 [01:00<02:50,  3.32it/s]


In [None]:
#combine audio gui
import os
import torch
import torchaudio
from ipywidgets import widgets, HBox, VBox
from IPython.display import display, Audio

combined_file_path = os.path.join(os.getcwd(), "combined_audio.wav")

def combine_audio_files(silence_duration_ms):
    folder_path = os.getcwd()
    files = sorted([f for f in os.listdir(folder_path) if f.startswith("audio_") and f.endswith(".wav")],
                   key=lambda f: int(f.split('_')[1].split('.')[0]))

    combined_tensor = torch.Tensor()

    for index, file in enumerate(files):
        waveform, sample_rate = torchaudio.load(os.path.join(folder_path, file))

        channels = waveform.shape[0]
        silence_tensor = torch.zeros(channels, int(silence_duration_ms * sample_rate / 1000))
        combined_tensor = torch.cat([combined_tensor, waveform, silence_tensor], dim=1)

        progress.value = (index + 1) / len(files)

    torchaudio.save(combined_file_path, combined_tensor, sample_rate)
    progress.value = 1
    progress_label.value = "Complete!"
    play_button.disabled = False  # Enable the play button after combining

def start_combining(change):
    silence_duration = silence_duration_slider.value
    combine_audio_files(silence_duration)

def play_audio(change):
    display(Audio(combined_file_path))

silence_duration_slider = widgets.FloatSlider(value=0, min=0, max=2000, step=1, description='Silence Duration (ms)')
silence_duration_label = widgets.Label(value=f'Silence Duration: {silence_duration_slider.value} ms')

def update_duration_label(change):
    silence_duration_label.value = f'Silence Duration: {silence_duration_slider.value} ms'
silence_duration_slider.observe(update_duration_label, 'value')

combine_button = widgets.Button(description="Combine Audio Files")
combine_button.on_click(start_combining)

play_button = widgets.Button(description="Play Combined Audio", disabled=True)
play_button.on_click(play_audio)

progress = widgets.FloatProgress(value=0, min=0, max=1, description='Progress:')
progress_label = widgets.Label(value='0%')

display(VBox([silence_duration_slider, silence_duration_label, HBox([combine_button, play_button]), progress, progress_label]))



In [5]:
!python /content/bark/bark_perform.py --list_speakers

Available history prompts:

  Polish:
    pl_speaker_0  
    pl_speaker_1  
    pl_speaker_2  
    pl_speaker_3  
    pl_speaker_4  
    pl_speaker_5  
    pl_speaker_6  
    pl_speaker_7  
    pl_speaker_8  
    pl_speaker_9  

  Spanish:
    es_speaker_0  
    es_speaker_1  
    es_speaker_2  
    es_speaker_3  
    es_speaker_4  
    es_speaker_5  
    es_speaker_6  
    es_speaker_7  
    es_speaker_8  
    es_speaker_9  

  Russian:
    ru_speaker_0  
    ru_speaker_1  
    ru_speaker_2  
    ru_speaker_3  
    ru_speaker_4  
    ru_speaker_5  
    ru_speaker_6  
    ru_speaker_7  
    ru_speaker_8  
    ru_speaker_9  

  French:
    fr_speaker_0  
    fr_speaker_1  
    fr_speaker_2  
    fr_speaker_3  
    fr_speaker_4  
    fr_speaker_5  
    fr_speaker_6  
    fr_speaker_7  
    fr_speaker_8  
    fr_speaker_9  

  Japanese:
    ja_speaker_0  
    ja_speaker_1  
    ja_speaker_2  
    ja_speaker_3  
    ja_speaker_4  
    ja_speaker_5  
    ja_speaker_6  
    ja_speaker_7  
  