In [None]:
import os
import requests
from dotenv import load_dotenv
from huggingface_hub import login

In [None]:
# Read variables from a local .env file; override=True lets its values replace
# any that are already present in the process environment.
load_dotenv(override=True)

# Authenticate against the Hugging Face Hub with the token stored under HF_TOKEN.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

# Exported here so that it is already set when later cells import their libraries.
os.environ.update(XFORMERS_DISABLE_TRITON="1")

## FLUX.1-schnell image generation (will only work on powerful GPUs)

In [None]:
# NOTE: this cell is intentionally left commented out. When enabled it loads
# FLUX.1-schnell onto a CUDA device in bfloat16, generates one image from a
# fixed seed (0) and saves it as "surreal.png". Uncomment every line below to run it.
#
# import torch
# from diffusers import FluxPipeline
#
# FLUX_MODEL ="black-forest-labs/FLUX.1-schnell"
#
# pipe = FluxPipeline.from_pretrained(FLUX_MODEL, torch_dtype=torch.bfloat16).to("cuda")
#
# generator = torch.Generator(device="cuda").manual_seed(0)
# prompt = "A futuristic class full of students learning AI coding in the surreal style of Salvador Dali"
# image = pipe(
#     prompt,
#     guidance_scale=0.0,
#     num_inference_steps=3,
#     max_sequence_length=256,
#     generator=generator,
# ).images[0]
#
# image.save("surreal.png")

In [None]:
# Import required libraries
from transformers import pipeline  # Hugging Face's transformers library for pre-trained models
from datasets import load_dataset  # For loading voice embedding datasets
import soundfile as sf  # For saving audio files
import torch  # PyTorch for tensor operations

In [None]:
# Hub identifiers: the SpeechT5 text-to-speech checkpoint and the dataset of
# x-vector speaker embeddings used to pick a voice.
TTS_MODEL = "microsoft/speecht5_tts"
EMBEDDING_DATASET = "Matthijs/cmu-arctic-xvectors"

# Place the pipeline on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    tts_device = 'cuda'
else:
    tts_device = 'cpu'
synthesiser = pipeline(task="text-to-speech", model=TTS_MODEL, device=tts_device)

# Fetch the speaker embeddings and list which splits the dataset provides.
embeddings_dataset = load_dataset(EMBEDDING_DATASET)
print(embeddings_dataset.keys())

# Row 7306 of the 'validation' split supplies the voice; unsqueeze(0) prepends
# a batch dimension to the x-vector.
speaker_row = embeddings_dataset['validation'][7306]
speaker_embedding = torch.tensor(speaker_row["xvector"]).unsqueeze(0)

# Synthesise the sentence, forwarding the chosen speaker embedding to the model.
tts_text = "Hi to an artificial intelligence engineer on the way to mastery!"
speech = synthesiser(
    text_inputs=tts_text,
    forward_params={"speaker_embeddings": speaker_embedding},
)

# Write the waveform to disk at the sampling rate reported by the pipeline.
sf.write(file="speech.wav", data=speech["audio"], samplerate=speech["sampling_rate"])

In [None]:
from IPython.display import Audio

# Build an inline player for the saved file; as the final expression of the
# cell it is rendered in the cell output.
speech_player = Audio("speech.wav")
speech_player

## Low-resource image generation model

In [None]:
import torch
import os
from diffusers import StableDiffusionPipeline

# Same flag that is set right after the Hub login; repeated so this cell can be
# run on its own. NOTE(review): presumably consumed by xformers -- confirm.
os.environ["XFORMERS_DISABLE_TRITON"] = "1"

model_id = "OFA-Sys/small-stable-diffusion-v0"
prompt = "A delicious burger"
IMAGE_SEED = 0  # fixed seed so Restart & Run All reproduces the same image

# Half precision is only used on a GPU; on a CPU-only machine fall back to
# float32 instead of failing on the hard-coded "cuda" device.
use_gpu = torch.cuda.is_available()
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if use_gpu else torch.float32,
)

if use_gpu:
    # Do NOT call .to("cuda") before this: the Diffusers docs state that moving
    # the pipeline to CUDA first leaves only a minimal memory saving. With
    # offloading enabled the pipeline manages device placement itself.
    pipe.enable_model_cpu_offload()
# else: from_pretrained leaves the pipeline on the CPU, which is what we want.

# A CPU generator is accepted regardless of where the pipeline runs and keeps
# the sampled noise identical across devices.
generator = torch.Generator(device="cpu").manual_seed(IMAGE_SEED)

image = pipe(prompt, generator=generator).images[0]

In [None]:
# Show the image produced by the Stable Diffusion cell in this cell's output.
display(image)