# HuggingFace y Datasets

## Instrucciones de Instalación

Antes de empezar, verifiquen que tienen instaladas las librerías necesarias.
En su configuración personal, pueden instalarlas desde el directorio base del repositorio usando `pip install -r installer/requirements.txt`,
ya sea en Windows o en Linux.

En caso de usar Colab, pueden copiar el contenido en un archivo dentro de Colab, o en su propia nube.
Luego, ingresen la línea de código `!pip install -r <folder>/requirements.txt` dentro de una celda y presionen `Ejecutar celda`.

### HuggingFace_Hub

Hugging Face Hub, el sitio web principal, es una plataforma central que permite a cualquiera descubrir, utilizar y contribuir con nuevos modelos y conjuntos de datos de última generación. Alberga una amplia variedad de modelos, con más de 10.000 disponibles públicamente.

In [None]:
# Text-to-music demo: generate a clip with MusicGen and save it as a WAV file.
from pathlib import Path

import soundfile as sf
import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# CACHE_DIR and device are reused by later cells of this notebook.
CACHE_DIR = "../models"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.float16

processor = AutoProcessor.from_pretrained(
    "facebook/musicgen-small",
    cache_dir=CACHE_DIR,
)
model = MusicgenForConditionalGeneration.from_pretrained(
    "facebook/musicgen-small",
    cache_dir=CACHE_DIR,
).to(device)
sampling_rate = model.config.audio_encoder.sampling_rate

# Text prompt describing the music to generate; fill it in before running.
caption = ""
# Optional extra keyword arguments forwarded to the processor call.
extras = {}

inputs = processor(
    text=[caption],
    padding=True,
    return_tensors="pt",
    **extras,
).to(device)

audio_out = model.generate(
    **inputs,
    do_sample=True,
    guidance_scale=3,
    max_new_tokens=1024,
).squeeze()
audio_out = audio_out.cpu().numpy()

name_out = "generated/music_out.wav"
# soundfile does not create missing folders, so make sure the target exists
# before writing; otherwise the cell fails on a fresh checkout.
Path(name_out).parent.mkdir(parents=True, exist_ok=True)
sf.write(name_out, audio_out, sampling_rate)

Ahora con otro modelo:

In [None]:
from PIL import Image
import numpy as np
import torchaudio


def spectrogram_from_image(
    image: Image.Image,
    power: float = 0.25,
    stereo: bool = False,
    max_value: float = 30e6,
) -> np.ndarray:
    """
    Recover spectrogram magnitudes from an image that encodes a spectrogram.

    Inverse of image_from_spectrogram, up to the discretization error
    introduced by quantizing to uint8.

    Args:
        image: (frequency, time, channels)
        power: The power curve that was applied to the spectrogram
        stereo: Whether the image encodes stereo data
        max_value: The max value of the original spectrogram. In practice doesn't matter.

    Returns:
        spectrogram: (channels, frequency, time), as float32
    """
    # Palette ("P") and grayscale ("L") images are expanded to RGB first
    rgb_image = image.convert("RGB") if image.mode in ("P", "L") else image

    # Flip vertically
    flipped = rgb_image.transpose(Image.Transpose.FLIP_TOP_BOTTOM)

    # (rows, columns, channels) -> (channels, frequency, time)
    channels_first = np.array(flipped).transpose(2, 0, 1)

    # Stereo data lives in the G and B channels (as done in
    # image_from_spectrogram); mono data is read from the first channel only.
    if stereo:
        selected = channels_first[[1, 2], :, :]
    else:
        selected = channels_first[0:1, :, :]

    # Cast to float, invert, and rescale to the 0-1 range
    unit_range = (255 - selected.astype(np.float32)) / 255

    # Reverse the power curve, then rescale to the requested max value
    return np.power(unit_range, 1 / power) * max_value


def get_inverter(n_fft, num_griffin_lim_iters, win_length, hop_length, device):
    """
    Build the Griffin-Lim transform used to turn spectrograms back into audio.

    Args:
        n_fft: Forwarded as ``n_fft``
        num_griffin_lim_iters: Forwarded as ``n_iter``
        win_length: Forwarded as ``win_length``
        hop_length: Forwarded as ``hop_length``
        device: Device the transform is moved to

    Returns:
        A ``torchaudio.transforms.GriffinLim`` instance placed on ``device``
    """
    # Caller-supplied settings plus the fixed ones (Hann window, power 1.0,
    # momentum 0.99, random initialization).
    griffin_lim_settings = dict(
        n_fft=n_fft,
        n_iter=num_griffin_lim_iters,
        win_length=win_length,
        hop_length=hop_length,
        window_fn=torch.hann_window,
        power=1.0,
        wkwargs=None,
        momentum=0.99,
        length=None,
        rand_init=True,
    )
    return torchaudio.transforms.GriffinLim(**griffin_lim_settings).to(device)


def audio_from_spectrogram(
    self,
    spectrogram: "np.ndarray | None" = None,
    apply_filters: bool = True,
    normalize: bool = True,
):
    """
    Reconstruct a waveform from a spectrogram.

    Two calling conventions are accepted:

    * ``audio_from_spectrogram(spectrogram, ...)`` - plain function call; the
      device is CUDA when available, otherwise CPU.
    * ``audio_from_spectrogram(owner, spectrogram, ...)`` - the original
      signature; ``owner.device`` is used when it exists.

    Args:
        self: Either an object exposing a ``device`` attribute or, when
            ``spectrogram`` is omitted, the spectrogram itself.
        spectrogram: (batch, frequency, time)
        apply_filters: Currently unused; reserved for post-processing
            (compression) that is not implemented yet.
        normalize: Scale the waveform so its peak equals the int16 maximum.

    Returns:
        The waveform produced by ``waveform_from_mel_amplitudes``, rescaled
        when ``normalize`` is true.
    """
    # Called as a plain function: the first positional argument is the data.
    if spectrogram is None:
        owner, spectrogram = None, self
    else:
        owner = self

    target_device = getattr(owner, "device", None)
    if target_device is None:
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move to device
    amplitudes_mel = torch.from_numpy(spectrogram).to(target_device)

    # Reconstruct the waveform
    # NOTE(review): waveform_from_mel_amplitudes is not defined in the visible
    # part of this notebook; it must be provided before calling this function.
    waveform = waveform_from_mel_amplitudes(amplitudes_mel)

    if normalize:
        # abs()/max() work for both NumPy arrays and torch tensors (including
        # tensors on the GPU), whichever the helper above returns.
        peak = float(abs(waveform).max())
        # A silent waveform has no peak to scale to; leave it untouched
        # instead of dividing by zero.
        if peak > 0:
            waveform *= np.iinfo(np.int16).max / peak

    # TODO: apply filters (dynamic range compression, e.g. librosa.mu_compress)

    return waveform

In [None]:
from diffusers import StableDiffusionPipeline

# Half precision is only used on the GPU; on CPU fall back to float32.
use_cuda = device.type == "cuda"

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    cache_dir=CACHE_DIR,
)

if use_cuda:
    # Diffusers pipelines expose enable_model_cpu_offload(); there is no
    # enable_cpu_offload(). Offloading manages device placement itself, so the
    # pipeline is not moved to the GPU beforehand.
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(device)

# Text prompt for the image to generate; fill it in before running.
mensaje = ""

image = pipe(mensaje).images[0]
# The pipeline already returns a PIL image, so no NumPy round-trip is needed.
PIL_image = image.convert("RGB")


Convertimos de imagen a sonido:

In [None]:
# Decoding settings handed to spectrogram_from_image.
stereo: bool = False
power_for_image: float = 0.25
max_value: float = 30e6

# Step 1: image -> spectrogram magnitudes.
spectrogram = spectrogram_from_image(
    image,
    power=power_for_image,
    stereo=stereo,
    max_value=max_value,
)

# Step 2: spectrogram -> waveform.
segment = audio_from_spectrogram(spectrogram, apply_filters=True)

### Ahora cargamos Bark




In [None]:
# Text-to-speech demo: synthesize speech with Bark and save it as a WAV file.
from datetime import datetime
from pathlib import Path

import soundfile as sf
import torch
from transformers import AutoProcessor, BarkModel


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.float16

# Load the processor and the model from the same checkpoint and the same
# cache folder. "suno/bark-small" is a lighter alternative if memory is tight.
bark_checkpoint = "suno/bark"

processor = AutoProcessor.from_pretrained(
    bark_checkpoint,
    cache_dir=CACHE_DIR,
)
model = BarkModel.from_pretrained(
    bark_checkpoint,
    cache_dir=CACHE_DIR,
).to(device)

# CPU offloading shuttles sub-models to the GPU on demand, so it only makes
# sense when a GPU is present.
if device.type == "cuda":
    model.enable_cpu_offload()
sampling_rate = model.generation_config.sample_rate

# Text to be spoken; fill it in before running.
captions = ""
voice_preset = "v2/es_speaker_1"

# Use the text defined in this cell (not `caption` from the MusicGen cell).
inputs = processor(captions, voice_preset=voice_preset).to(device)
audio_out = model.generate(**inputs).squeeze().cpu().numpy()

time_now = datetime.now().strftime("%Y%m%d_%H%M%S")
name_out = f"generated/speech_{time_now}.wav"
# soundfile does not create missing folders, so make sure the target exists.
Path(name_out).parent.mkdir(parents=True, exist_ok=True)
sf.write(name_out, audio_out, sampling_rate)

Finalmente, descargamos T5-small y revisamos su salida:

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The original T5-small checkpoint was trained with task prefixes such as
# "translate English to German:". The v1.1 checkpoints were pre-trained on C4
# only and need fine-tuning before such prompts give meaningful output.
t5_checkpoint = "google-t5/t5-small"

tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint, cache_dir=CACHE_DIR)
model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint, cache_dir=CACHE_DIR)

input_ids = tokenizer(
    "translate English to German: The house is wonderful.",
    return_tensors="pt",
).input_ids
outputs = model.generate(input_ids)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

## Datasets

Datasets es una biblioteca para acceder y compartir fácilmente conjuntos de datos para tareas de audio, visión por computadora y procesamiento del lenguaje natural (NLP).

Cargue un conjunto (base) de datos en una sola línea de código y utilice los potentes métodos de procesamiento de datos para preparar rápidamente su conjunto de datos para entrenarlo en un modelo de aprendizaje profundo. Con el respaldo del formato Apache Arrow, procese grandes conjuntos de datos con lecturas sin copia y sin restricciones de memoria para lograr una velocidad y eficiencia óptimas.

In [None]:
from datasets import load_dataset

# Load the dataset from the Hub, keeping downloaded files under CACHE_DIR.
dataset = load_dataset(
    "ylacombe/google-chilean-spanish",
    cache_dir=CACHE_DIR,
)

Utilizando un conjunto de datos de texto, lo preprocesamos utilizando diferentes funciones:

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", cache_dir=CACHE_DIR)

# Tokenize the "text" column in batches; the tokenizer output is added to
# each example as new columns.
dataset = dataset.map(lambda examples: tokenizer(examples["text"]), batched=True)

# load_dataset() without `split` returns a DatasetDict keyed by split name, so
# integer indexing has to go through one of its splits. A plain Dataset can be
# indexed directly.
if isinstance(dataset, dict):
    preview_split = dataset[next(iter(dataset))]
else:
    preview_split = dataset

# Show the first processed example.
preview_split[0]