In [78]:
import pandas as pd 
import numpy as np 
import plotly.express as px 
pd.options.plotting.backend = 'plotly' 
pd.set_option('display.max_columns', None)

# Text

In [3]:
input_text = """Iron Man is a superhero appearing in American comic books published by Marvel Comics.
 Co-created by writer and editor Stan Lee, developed by scripter Larry Lieber,
  and designed by artists Don Heck and Jack Kirby, the character first appeared in Tales of Suspense #39 in 1963
  , and received his own title with Iron Man #1 in 1968. Shortly after his creation, Iron Man was a founding member of a superhero team,
   the Avengers, with Thor, Ant-Man, Wasp and the Hulk. Iron Man stories, individually and with the Avengers, have been published consistently
    since the character's creation."""

#input_text = ["Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe."]

# Bark

In [4]:
from IPython.display import Audio
from transformers import AutoProcessor, AutoModel

import torch

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and processor
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModel.from_pretrained("suno/bark-small")

# Move the model to the GPU if one is available
model = model.to(device)
#model = model.to_bettertransformer()(device)

# Prepare the inputs
inputs = processor(
    text=input_text,
    return_tensors="pt",
)

# Move the inputs to the GPU if one is available
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Access the attention mask
attention_mask = inputs["attention_mask"]

# Modify the attention mask as needed
# For example, to ignore the second token, you can do:
attention_mask[0][1] = 0

# Update the inputs dictionary with the new attention mask
inputs["attention_mask"] = attention_mask

# Generate the audio data
audio_tensor = model.generate(**inputs, do_sample=True, output_attentions=False, temperature=0.5)

# Convert the tensor to a numpy array
# Note: You need to move the tensor back to the CPU before converting it to a numpy array
audio_array = audio_tensor.cpu().detach().numpy()

# Determine the sample rate
# Note: You may need to adjust this value depending on the model's output
sample_rate = 22050#22050

# Play the audio
Audio(audio_array, rate=sample_rate)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


## Bark HugginFace Guide

In [None]:
https://huggingface.co/blog/optimizing-bark#benchmark-results

In [1]:
from transformers import BarkModel

model = BarkModel.from_pretrained("suno/bark-small")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [5]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("suno/bark-small")

In [8]:
#text_prompt = "Let's try generating speech, with Bark, a text-to-speech model"
text_prompt = "Iron Man is a superhero appearing in American comic books published by Marvel Comics."
inputs = processor(text_prompt).to(device)

In [20]:
import torch
from transformers import set_seed


def measure_latency_and_memory_use(model, inputs, nb_loops = 5):

  # define Events that measure start and end of the generate pass
  start_event = torch.cuda.Event(enable_timing=True)
  end_event = torch.cuda.Event(enable_timing=True)

  # reset cuda memory stats and empty cache
  torch.cuda.reset_peak_memory_stats(device)
  torch.cuda.empty_cache()
  torch.cuda.synchronize()

  # get the start time
  start_event.record()

  # actually generate
  for _ in range(nb_loops):
        # set seed for reproducibility
        set_seed(0)
        output = model.generate(**inputs, do_sample = True, fine_temperature = 0.4, coarse_temperature = .9)

  # get the end time
  end_event.record()
  torch.cuda.synchronize()

  # measure memory footprint and elapsed time
  max_memory = torch.cuda.max_memory_allocated(device)
  elapsed_time = start_event.elapsed_time(end_event) * 1.0e-3

  print('Execution time:', elapsed_time/nb_loops, 'seconds')
  print('Max memory footprint', max_memory*1e-9, ' GB')

  return output


### Standard Method

In [9]:
with torch.inference_mode():
  speech_output = measure_latency_and_memory_use(model, inputs, nb_loops = 5)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generati

Execution time: 5.2773671874999994 seconds
Max memory footprint 1.9008245760000002  GB


In [10]:
from IPython.display import Audio

# now, listen to the output
sampling_rate = model.generation_config.sample_rate
Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)

### Better Trasformer Text

In [17]:
model = BarkModel.from_pretrained("suno/bark-small")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
processor = AutoProcessor.from_pretrained("suno/bark-small")
inputs = processor(text_prompt).to(device)



In [18]:
model =  model.to_bettertransformer()

with torch.inference_mode():
  speech_output = measure_latency_and_memory_use(model, inputs, nb_loops = 1)


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


Execution time: 23.1171171875 seconds
Max memory footprint 7.063555584  GB


In [21]:
from IPython.display import Audio

# now, listen to the output
sampling_rate = model.generation_config.sample_rate
Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)

In [19]:
#1.5 corse
from IPython.display import Audio

# now, listen to the output
sampling_rate = model.generation_config.sample_rate
Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)

In [15]:
#.2 corse
from IPython.display import Audio

# now, listen to the output
sampling_rate = model.generation_config.sample_rate
Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)

In [11]:
#.8 Coasrse
from IPython.display import Audio

# now, listen to the output
sampling_rate = model.generation_config.sample_rate
Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)

### Half Size Version of small bark

In [15]:
model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
model =  model.to_bettertransformer()

with torch.inference_mode():
  speech_output = measure_latency_and_memory_use(model, inputs, nb_loops = 5)


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Execution time: 3.871321484375 seconds
Max memory footprint 2.74146304  GB


### CPU offload

In [16]:
model = BarkModel.from_pretrained("suno/bark-small")

# Enable CPU offload
model.enable_cpu_offload()

with torch.inference_mode():
  speech_output = measure_latency_and_memory_use(model, inputs, nb_loops = 5)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generati

Execution time: 5.96422890625 seconds
Max memory footprint 3.53308416  GB


### Combined Method

In [32]:
# load in fp16
#model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
model = BarkModel.from_pretrained("suno/bark-small").to(device)
# convert to bettertransformer
model =  model.to_bettertransformer()

# enable CPU offload
#model.enable_cpu_offload()

with torch.inference_mode():
  speech_output = measure_latency_and_memory_use(model, inputs, nb_loops = 5)


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


KeyboardInterrupt: 

In [33]:
#Real Test
model = BarkModel.from_pretrained("suno/bark-small").to(device)
# convert to bettertransformer
model =  model.to_bettertransformer()

# enable CPU offload
#model.enable_cpu_offload()

with torch.inference_mode():
  speech_output = measure_latency_and_memory_use(model, inputs, nb_loops = 1)

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


Execution time: 14.45628125 seconds
Max memory footprint 3.6805657600000004  GB


In [34]:
from IPython.display import Audio

sampling_rate = model.generation_config.sample_rate
Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)

## Brandon Bark Method

In [49]:
text_prompt = 'Iron Man is a superhero appearing in American comic books published by Marvel Comics'
#text_prompt = "Let's try generating speech, with Bark, a text-to-speech model"

In [50]:
inputs = processor(text_prompt).to(device)

In [51]:
from transformers import BarkModel

model = BarkModel.from_pretrained("suno/bark-small")

In [52]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model =  model.to_bettertransformer()
#model.enable_cpu_offload()

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [53]:
# set seed for reproducibility
set_seed(0)
output = model.generate(**inputs, do_sample = True, fine_temperature = 0.4, coarse_temperature = 0.8)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [54]:
from IPython.display import Audio

sampling_rate = model.generation_config.sample_rate
Audio(output[0].cpu().numpy(), rate=sampling_rate)

## Brandon try 2

In [None]:
text_prompt = ['Iron Man is a superhero appearing in American comic books published by Marvel Comics',
 'Co-created by writer and editor Stan Lee, developed by scripter Larry Lieber,\nand designed by artists Don Heck and Jack Kirby, the character first appeared in Tales of Suspense #39 in 1963\n, and received his own title with Iron Man #1 in 1968',
 'Shortly after his creation, Iron Man was a founding member of a superhero team,\nthe Avengers, with Thor, Ant-Man, Wasp and the Hulk',
 "Iron Man stories, individually and with the Avengers, have been published consistently\nsince the character's creation."]


In [47]:
from IPython.display import Audio
from transformers import AutoProcessor, AutoModel

import torch

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and processor
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModel.from_pretrained("suno/bark-small")

# Move the model to the GPU if one is available
model = model.to(device)
#model = model.to(device)
model =  model.to_bettertransformer()
#model.enable_cpu_offload()

# Prepare the inputs
inputs = processor(
    text=text_prompt,
    return_tensors="pt",
)

# Move the inputs to the GPU if one is available
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Access the attention mask
attention_mask = inputs["attention_mask"]

# Modify the attention mask as needed
# For example, to ignore the second token, you can do:
attention_mask[0][1] = 0

# Update the inputs dictionary with the new attention mask
inputs["attention_mask"] = attention_mask

set_seed(0)
# Generate the audio data
audio_tensor = model.generate(**inputs, do_sample = True, fine_temperature = 0.4, coarse_temperature = 0.8)

# Convert the tensor to a numpy array
# Note: You need to move the tensor back to the CPU before converting it to a numpy array
audio_array = audio_tensor.cpu().detach().numpy()

# Determine the sample rate
# Note: You may need to adjust this value depending on the model's output
sample_rate = 22050#22050

# Play the audio
Audio(audio_array, rate=sample_rate)

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


KeyboardInterrupt: 

In [None]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

## Copilot Version

In [48]:
from transformers import AutoProcessor, AutoModel
from IPython.display import Audio
import torch

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and processor
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModel.from_pretrained("suno/bark-small")

# Move the model to the GPU if one is available
model = model.to(device)
model =  model.to_bettertransformer()
#model.enable_cpu_offload()
# Initialize an empty list to store the audio arrays
audio_arrays = []

text_prompt = ['Iron Man is a superhero appearing in American comic books published by Marvel Comics',
 'Co-created by writer and editor Stan Lee, developed by scripter Larry Lieber,\nand designed by artists Don Heck and Jack Kirby, the character first appeared in Tales of Suspense #39 in 1963\n, and received his own title with Iron Man #1 in 1968',
 'Shortly after his creation, Iron Man was a founding member of a superhero team,\nthe Avengers, with Thor, Ant-Man, Wasp and the Hulk',
 "Iron Man stories, individually and with the Avengers, have been published consistently\nsince the character's creation."]

for text in text_prompt:
    # Prepare the inputs
    inputs = processor(
        text=[text],
        return_tensors="pt",
    )

    # Move the inputs to the GPU if one is available
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Generate the audio data
    set_seed(0)
    audio_tensor = model.generate(**inputs, do_sample = True, fine_temperature = 0.4, coarse_temperature = 0.8)

    # Convert the tensor to a numpy array
    # Note: You need to move the tensor back to the CPU before converting it to a numpy array
    audio_array = audio_tensor.cpu().detach().numpy()

    # Append the audio array to the list
    audio_arrays.append(audio_array)

# Combine the audio arrays
combined_audio_array = np.concatenate(audio_arrays)

# Determine the sample rate
# Note: You may need to adjust this value depending on the model's output
sample_rate = 22050

# Play the combined audio
Audio(combined_audio_array, rate=sample_rate)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


KeyboardInterrupt: 

## GPT3.5 Version

In [1]:
text_prompts = ["Iron Man is a superhero appearing in American comic books published by Marvel Comics."]


In [4]:
import torch
from transformers import AutoProcessor, BarkModel
from torch.cuda.amp import autocast

# Assuming text_prompts is a list of sentences


model = BarkModel.from_pretrained("suno/bark-small")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
processor = AutoProcessor.from_pretrained("suno/bark-small")

# Batch processing
inputs = processor(text_prompts, return_tensors="pt", truncation=True).to(device)#  padding=True,

# Model to bettertransformer
model = model.to_bettertransformer()

with torch.no_grad(), autocast("cuda" if device.startswith("cuda") else "cpu"):
    # Measure latency and memory use
    speech_output = measure_latency_and_memory_use(model, inputs, nb_loops=1)

# Combine the outputs if needed
combined_output = torch.cat(speech_output, dim=0)

from IPython.display import Audio

# Listen to the combined output
sampling_rate = model.generation_config.sample_rate
Audio(combined_output.cpu().numpy(), rate=sampling_rate)


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


NameError: name 'set_seed' is not defined

In [5]:
import torch
from transformers import AutoProcessor, BarkModel
from torch.cuda.amp import autocast

# Set the seed
torch.manual_seed(0)

# Assuming text_prompts is a list of sentences
model = BarkModel.from_pretrained("suno/bark-small")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
processor = AutoProcessor.from_pretrained("suno/bark-small")

# Batch processing
inputs = processor(text_prompts, return_tensors="pt", truncation=True).to(device)#  padding=True,

# Model to bettertransformer
model = model.to_bettertransformer()

with torch.no_grad(), autocast("cuda" if device.startswith("cuda") else "cpu"):
    # Measure latency and memory use
    speech_output = measure_latency_and_memory_use(model, inputs, nb_loops=1)

# Combine the outputs if needed
combined_output = torch.cat(speech_output, dim=0)

from IPython.display import Audio

# Listen to the combined output
sampling_rate = model.generation_config.sample_rate
Audio(combined_output.cpu().numpy(), rate=sampling_rate)

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


TypeError: enabled must be a bool (got str)