In [38]:
# Install Streamlit with the %pip magic so the package lands in the environment
# of the running kernel (a bare `!pip` may resolve to a different interpreter).
%pip install -q streamlit

In [48]:

# Standard library
import io
import os
import re
import sys
import tempfile
import time

# Third-party (relative order of unsloth / transformers / trl kept as in the original)
import numpy as np
import soundfile as sf
import streamlit as st
import torch
from unsloth import FastModel
from transformers import TrainingArguments
from trl import SFTTrainer
import torchaudio.transforms as T

# Local Spark-TTS checkout: the path must be registered before importing from it.
sys.path.append('--app--')
from sparktts.models.audio_tokenizer import BiCodecTokenizer
from sparktts.utils.audio import audio_volume_normalize

# --- Page chrome -----------------------------------------------------------
# Raw HTML/CSS fragments injected into the page, in the order they are rendered.
PAGE_STYLE_HTML = """
    <style>
    body {
        background-color: #0b0f19;
        color: white;
    }
    .stApp {
        background: linear-gradient(180deg, #0b0f19 0%, #111927 100%);
    }
    .main-box {
        background-color: #1c2230;
        padding: 2rem;
        border-radius: 12px;
        text-align: center;
    }
    </style>
"""
PAGE_HEADING_HTML = "<h2 style='text-align: center; color: #c084fc;'>🎤 Knowlithic TTS Demo</h2>"
PAGE_TAGLINE_HTML = "<p style='text-align: center; color: gray;'>Convert your text into speech using Knowlithic</p>"
# NOTE(review): this <div> is closed by a separate st.markdown call further
# down the script; confirm that Streamlit actually nests the widgets rendered
# in between inside it, since each st.markdown call emits its own element.
MAIN_BOX_OPEN_HTML = '<div class="main-box">'

# Browser-tab title, icon and layout for the app.
st.set_page_config(page_title="Voice Assistant", page_icon="🎤", layout="centered")

for html_fragment in (
    PAGE_STYLE_HTML,
    PAGE_HEADING_HTML,
    PAGE_TAGLINE_HTML,
    MAIN_BOX_OPEN_HTML,
):
    st.markdown(html_fragment, unsafe_allow_html=True)

@st.cache_resource
def load_tts_model():
    """Load the TTS language model, its text tokenizer and the BiCodec audio tokenizer.

    Wrapped in ``st.cache_resource`` so the (expensive) load is shared across
    Streamlit reruns instead of being repeated on every interaction.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable rather than being embedded in the source; when the variable is
    unset, ``None`` is passed and the hub client's own credential lookup applies.

    Returns:
        tuple: ``(model, tokenizer, audio_tokenizer)`` where ``model`` has been
        switched to inference mode via ``FastModel.for_inference``.
    """
    max_seq_length = 2048
    # Same CUDA-if-available rule that generate_speech_from_text uses for its
    # default device, instead of unconditionally requesting "cuda".
    device = "cuda" if torch.cuda.is_available() else "cpu"
    audio_tokenizer = BiCodecTokenizer("mobeen0/tokenizer", device)

    model, tokenizer = FastModel.from_pretrained(
        model_name="mobeen0/knowlithic-0.4",
        max_seq_length=max_seq_length,
        dtype=torch.float32,
        full_finetuning=True,
        load_in_4bit=False,
        # Never commit a real token; export HF_TOKEN in the runtime instead.
        token=os.environ.get("HF_TOKEN") or None,
    )
    FastModel.for_inference(model)
    return model, tokenizer, audio_tokenizer

# Module-level handles used by the UI code below; load_tts_model is decorated
# with st.cache_resource, so this call goes through Streamlit's resource cache.
model, tokenizer, audio_tokenizer = load_tts_model()

@torch.inference_mode()
def generate_speech_from_text(
    text: str,
    model,
    tokenizer,
    audio_tokenizer,
    temperature: float = 0.8,
    top_k: int = 50,
    top_p: float = 1.0,
    max_new_audio_tokens: int = 2048,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
) -> np.ndarray:
    """Synthesize speech for ``text`` and return the decoded waveform.

    The text is wrapped in the TTS prompt markers, the model samples a
    continuation, and the ``<|bicodec_semantic_N|>`` / ``<|bicodec_global_N|>``
    tags found in that continuation are parsed back into integer ids that the
    audio tokenizer turns into a waveform.

    Args:
        text: Text to speak.
        model: Object exposing ``generate(**kwargs)``.
        tokenizer: Text tokenizer; called on ``[prompt]`` and used for
            ``batch_decode`` plus its ``eos_token_id`` / ``pad_token_id``.
        audio_tokenizer: Object exposing ``.device``, ``.model.to(...)`` and
            ``.detokenize(global_ids, semantic_ids)``.
        temperature: Sampling temperature forwarded to ``generate``.
        top_k: Top-k cutoff forwarded to ``generate``.
        top_p: Nucleus cutoff forwarded to ``generate``.
        max_new_audio_tokens: Forwarded to ``generate`` as ``max_new_tokens``.
        device: Device the inputs and the audio tokenizer are moved to.

    Returns:
        Whatever ``audio_tokenizer.detokenize`` returns, or an empty
        ``float32`` array when the continuation contains no semantic tags.
    """

    def _tagged_ids(pattern, decoded):
        # Integer payload of every tag in `decoded` matching `pattern`, in order.
        return [int(hit) for hit in re.findall(pattern, decoded)]

    prompt = (
        "<|task_tts|>"
        + "<|start_content|>"
        + text
        + "<|end_content|>"
        + "<|start_global_token|>"
    )

    encoded = tokenizer([prompt], return_tensors="pt").to(device)

    sampling_kwargs = dict(
        max_new_tokens=max_new_audio_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    output_ids = model.generate(**encoded, **sampling_kwargs)

    # Keep only the newly generated part (drop the echoed prompt tokens).
    prompt_length = encoded.input_ids.shape[1]
    continuation = tokenizer.batch_decode(
        output_ids[:, prompt_length:], skip_special_tokens=False
    )[0]

    semantic_ids = _tagged_ids(r"<\|bicodec_semantic_(\d+)\|>", continuation)
    if len(semantic_ids) == 0:
        return np.array([], dtype=np.float32)
    semantic_tensor = torch.tensor(semantic_ids).long().unsqueeze(0)  # (1, n_semantic)

    global_ids = _tagged_ids(r"<\|bicodec_global_(\d+)\|>", continuation)
    if global_ids:
        global_tensor = torch.tensor(global_ids).long().unsqueeze(0)  # (1, n_global)
    else:
        # No global tags generated: fall back to a single zero id, shape (1, 1).
        global_tensor = torch.zeros((1, 1), dtype=torch.long)

    # Make sure the audio tokenizer runs on the same device as its inputs.
    audio_tokenizer.device = device
    audio_tokenizer.model.to(device)

    return audio_tokenizer.detokenize(
        global_tensor.to(device),
        semantic_tensor.to(device),
    )


# --- Text-to-speech UI -----------------------------------------------------
st.markdown("## 🔊 Text-to-Speech")

user_text = st.text_input("Type something...", "")

if st.button("Generate Audio"):
    if not user_text.strip():
        # Whitespace-only input: nothing to synthesize.
        st.warning("Please enter text to synthesize.")
    else:
        with st.spinner("Synthesizing..."):
            try:
                generated_waveform = generate_speech_from_text(
                    user_text, model, tokenizer, audio_tokenizer
                )

                if generated_waveform.size == 0:
                    # generate_speech_from_text returns an empty array when the
                    # model produced no semantic audio tokens.
                    st.error("Failed to generate audio.")
                else:
                    sample_rate = audio_tokenizer.config.get("sample_rate", 16000)

                    # Encode the WAV in memory. The previous implementation used
                    # NamedTemporaryFile(delete=False), which left one orphaned
                    # .wav on disk per click and wrote to the file by name while
                    # its handle was still open.
                    wav_buffer = io.BytesIO()
                    # `format` must be given explicitly: a BytesIO has no file
                    # extension for soundfile to infer the container from.
                    sf.write(wav_buffer, generated_waveform, sample_rate, format="WAV")
                    st.success("Audio generated!")
                    st.audio(wav_buffer.getvalue(), format="audio/wav")
            except Exception as e:
                # Surface any synthesis/encoding failure in the UI instead of
                # crashing the script run.
                st.error(f"An error occurred: {e}")

# Closes the <div class="main-box"> opened near the top of the page.
st.markdown('</div>', unsafe_allow_html=True)




Missing tensor: mel_transformer.spectrogram.window
Missing tensor: mel_transformer.mel_scale.fb
==((====))==  Unsloth 2025.5.6: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.




DeltaGenerator()

In [49]:
# Install the localtunnel npm package (run via `npx localtunnel` in a later cell).
!npm install localtunnel


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K
up to date, audited 23 packages in 696ms
[1G[0K⠴[1G[0K
[1G[0K⠴[1G[0K3 packages are looking for funding
[1G[0K⠴[1G[0K  run `npm fund` for details
[1G[0K⠴[1G[0K
2 [31m[1mhigh[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[1G[0K⠴[1G[0K

In [59]:
# Launch the Streamlit app in the background with its output sent to
# /content/logs.txt (the `&>` form redirects both stdout and stderr under bash),
# then print this machine's public IPv4 address as reported by icanhazip.com.
# NOTE(review): no cell visible here writes /content/app.py — confirm the app
# source is saved to that path (e.g. with %%writefile) before this cell runs.
!streamlit run /content/app.py &>/content/logs.txt & curl ipv4.icanhazip.com

35.240.201.102


In [60]:
# Expose local port 8501 through a public localtunnel URL.
# NOTE(review): 8501 is presumably the port the Streamlit app started above is
# listening on (no --server.port is passed there) — confirm.
!npx localtunnel --port 8501

[1G[0K⠙[1G[0Kyour url is: https://kind-cameras-create.loca.lt
/content/node_modules/localtunnel/bin/lt.js:81
    throw err;
    ^

Error: connection refused: localtunnel.me:24915 (check your firewall settings)
    at Socket.<anonymous> [90m(/content/[39mnode_modules/[4mlocaltunnel[24m/lib/TunnelCluster.js:52:11[90m)[39m
[90m    at Socket.emit (node:events:524:28)[39m
[90m    at emitErrorNT (node:internal/streams/destroy:169:8)[39m
[90m    at emitErrorCloseNT (node:internal/streams/destroy:128:3)[39m
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:82:21)[39m

Node.js v20.19.0
[1G[0K⠙[1G[0K