**PREPARE ENVIRONMENT**

In [None]:
# @title
# Install the espeak-ng system package via apt.
# NOTE(review): presumably required by Zonos for phonemization — confirm against the Zonos README.
!apt update && apt install -y espeak-ng
# Clone the Zonos repository and switch the notebook's working directory into
# the checkout; later cells use paths relative to it (e.g. "assets/...").
# NOTE(review): %cd persists across cells, so re-running this cell would clone
# a second copy inside the existing checkout. Restart the runtime before re-running.
!git clone https://github.com/Isi-dev/Zonos.git
%cd Zonos
# Editable install of the package from the checkout.
!pip install -e .
# Editable install again, this time with the "compile" extra and without build isolation.
!pip install --no-build-isolation -e .[compile] # optional but needed to run the hybrid

**DOWNLOAD MODELS**

In [None]:
# @title
import torch
import torchaudio
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Instantiate the Zonos transformer checkpoint on the selected device.
print("Loading model...")
model = Zonos.from_pretrained(
    "Isi99999/Zonos-v0.1-transformer",
    device=device,
)
print("Model loaded!")

**UPLOAD A 10 TO 30 SECOND REFERENCE VOICE AUDIO CLIP** (optional)

In [None]:
# @title
import os
from google.colab import files

# Set the locale environment variables to C.UTF-8 for this process.
# NOTE(review): presumably so non-ASCII text/filenames are handled consistently — confirm.
os.environ["LC_ALL"] = "C.UTF-8"
os.environ["LANG"] = "C.UTF-8"
os.makedirs("assets", exist_ok=True)

# The uploaded clip is always stored under this fixed name.
# NOTE(review): the ".mp3" suffix is kept even when another format is uploaded;
# confirm the torchaudio backend in use detects the format from file content.
new_path = "assets/reference.mp3"

uploaded = files.upload()
uploaded_names = list(uploaded.keys())

if uploaded_names:
    # Only one clip can serve as the reference. Use the last uploaded file
    # (the same file the reference ended up being before) and leave any
    # additional uploads untouched instead of overwriting them one by one.
    chosen_name = uploaded_names[-1]
    if len(uploaded_names) > 1:
        print(f"{len(uploaded_names)} files uploaded; using only '{chosen_name}' as the reference.")
    # os.replace overwrites an existing destination, so no separate
    # exists()/remove() step is needed.
    os.replace(chosen_name, new_path)

if os.path.exists(new_path):
    print("Loading reference audio...")
    wav, sampling_rate = torchaudio.load(new_path)
    speaker = model.make_speaker_embedding(wav, sampling_rate)
    print("Reference audio loaded!")
else:
    # The upload is optional: report the situation instead of crashing in
    # torchaudio.load on a file that does not exist.
    print("No reference audio uploaded; keep use_default_speaker enabled in the next cell.")

**ENTER TEXT, ADJUST SETTINGS & RUN**

In [None]:
# Colab form fields. The "# @param" annotations describe the widget for each value.

# Text to synthesize.
text = " I am motivated by the simple yet profound joys of being alive—the taste of a good meal, the laughter of a friend, the beauty of a sunrise, and the endless pursuit of knowledge. Even if everything about me ceases when I die, my actions, words, and ideas can leave ripples in the world, affecting others in ways I may never fully grasp. " # @param {type:"string"}
# Seed for torch's RNG; a negative value makes the generation code pick a non-deterministic seed.
seed = 421 # @param {"type":"number"}
# When True, the speaker embedding is built from assets/exampleaudio.mp3 further down in this cell.
use_default_speaker = True  # @param {type:"boolean"}
# Language code forwarded to make_cond_dict as `language`.
# NOTE(review): the choices look like espeak-ng voice identifiers — confirm.
language = 'en-us' # @param ['af', 'am', 'an', 'ar', 'as', 'az', 'ba', 'bg', 'bn', 'bpy', 'bs', 'ca', 'cmn', 'cs', 'cy', 'da', 'de', 'el', 'en-029', 'en-gb', 'en-gb-scotland', 'en-gb-x-gbclan', 'en-gb-x-gbcwmd', 'en-gb-x-rp', 'en-us', 'eo', 'es', 'es-419', 'et', 'eu', 'fa', 'fa-latn', 'fi', 'fr-be', 'fr-ch', 'fr-fr', 'ga', 'gd', 'gn', 'grc', 'gu', 'hak', 'hi', 'hr', 'ht', 'hu', 'hy', 'hyw', 'ia', 'id', 'is', 'it', 'ja', 'jbo', 'ka', 'kk', 'kl', 'kn', 'ko', 'kok', 'ku', 'ky', 'la', 'lfn', 'lt', 'lv', 'mi', 'mk', 'ml', 'mr', 'ms', 'mt', 'my', 'nb', 'nci', 'ne', 'nl', 'om', 'or', 'pa', 'pap', 'pl', 'pt', 'pt-br', 'py', 'quc', 'ro', 'ru', 'ru-lv', 'sd', 'shn', 'si', 'sk', 'sl', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 'tn', 'tr', 'tt', 'ur', 'uz', 'vi', 'vi-vn-x-central', 'vi-vn-x-south', 'yue']
# Emotion weights. They are rescaled below so the eight values sum to 1,
# which means only their relative proportions matter.
happy = 0.3077 # @param {type:"slider", min:0.0, max:1.0, step:0.05}
sad = 0.0256 # @param {type:"slider", min:0.0, max:1.0, step:0.05}
disgust = 0.0256 # @param {type:"slider", min:0.0, max:1.0, step:0.05}
fear = 0.0256 # @param {type:"slider", min:0.0, max:1.0, step:0.05}
surprise = 0.0256 # @param {type:"slider", min:0.0, max:1.0, step:0.05}
anger = 0.0256 # @param {type:"slider", min:0.0, max:1.0, step:0.05}
other = 0.2564 # @param {type:"slider", min:0.0, max:1.0, step:0.05}
neutral = 0.3077 # @param {type:"slider", min:0.0, max:1.0, step:0.05}
# Forwarded to make_cond_dict as `pitch_std`.
pitch = 20 # @param {type:"slider", min:0, max:400, step:1}
# Forwarded to make_cond_dict as `speaking_rate`.
speed = 15 # @param {type:"slider", min:0.0, max:40.0, step:1.0}


# Rescale the eight emotion weights so they sum to 1; only their relative
# proportions matter.
total = happy + sad + disgust + fear + surprise + anger + other + neutral
if total > 0:
    happy, sad, disgust, fear, surprise, anger, other, neutral = (
        weight / total
        for weight in (happy, sad, disgust, fear, surprise, anger, other, neutral)
    )
else:
    # Every slider is zero, so there is nothing to rescale. Fall back to a
    # neutral-only vector (the same default generate_speech2 uses) rather than
    # conditioning the model on an all-zero emotion vector.
    print("All emotion weights are zero; falling back to a fully neutral voice.")
    happy = sad = disgust = fear = surprise = anger = other = 0.0
    neutral = 1.0

# Order matters: happy, sad, disgust, fear, surprise, anger, other, neutral.
emotions = torch.tensor(
    [float(weight) for weight in (happy, sad, disgust, fear, surprise, anger, other, neutral)],
    device=device,
)

if use_default_speaker:
    # Build the speaker embedding from the example clip in the assets directory.
    print("Loading default audio...")
    wav, sampling_rate = torchaudio.load("assets/exampleaudio.mp3")
    speaker = model.make_speaker_embedding(wav, sampling_rate)
    print("Default audio loaded!")
elif "speaker" not in globals():
    # Without the default speaker, the embedding must already exist from the
    # upload cell. Fail here with an actionable message instead of an opaque
    # NameError deep inside the generation function.
    raise NameError(
        "No speaker embedding is available: upload a reference audio clip in the "
        "previous cell or enable use_default_speaker."
    )


def generate_speech2(text, seed=421, language="en-us", emotion_tensor=None, pitch=20, speed=15, filename="output.wav"):
    """Synthesize `text` with the loaded Zonos model and save it as an audio file.

    Relies on the notebook-level names `model`, `speaker`, `device` and
    `make_cond_dict` being defined by earlier cells.

    Args:
        text: Text to speak.
        seed: Seed passed to `torch.manual_seed` when non-negative; a negative
            value makes torch pick a non-deterministic seed instead.
        language: Language code forwarded to `make_cond_dict`.
        emotion_tensor: Emotion weights in the order happy, sad, disgust, fear,
            surprise, anger, other, neutral. `None` (the default) means
            neutral only.
        pitch: Forwarded to `make_cond_dict` as `pitch_std`.
        speed: Forwarded to `make_cond_dict` as `speaking_rate`.
        filename: Path the generated audio is written to.

    Returns:
        The path of the written audio file (`filename`).
    """
    print(f"Generating: {text}")

    # Build the neutral default at call time. A tensor default argument would be
    # created once, at definition time, on whatever `device` was then.
    if emotion_tensor is None:
        emotion_tensor = torch.tensor(
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
            device=device,
        )

    if seed >= 0:
        torch.manual_seed(seed)
    else:
        torch.random.seed()

    # Create conditioning
    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker,
        emotion=emotion_tensor,
        pitch_std=pitch,
        speaking_rate=speed,
    )
    conditioning = model.prepare_conditioning(cond_dict)

    # Generate audio codes and decode them to waveforms on the CPU
    codes = model.generate(conditioning)
    wavs = model.autoencoder.decode(codes).cpu()

    # Save the first waveform of the batch; playback is left to the caller
    torchaudio.save(filename, wavs[0], model.autoencoder.sampling_rate)
    return filename

from IPython.display import Audio

# Synthesize with the form settings above; the trailing expression renders an
# inline audio player for the generated file.
output_file = generate_speech2(
    text,
    seed=seed,
    language=language,
    emotion_tensor=emotions,
    pitch=pitch,
    speed=speed,
)
Audio(output_file)