In [None]:


# Coqui TTS installed straight from the GitHub repository.
!pip install git+https://github.com/coqui-ai/TTS.git

# Pin pandas to an exact version.
!pip install --upgrade pandas==2.2.2

# Hugging Face libraries, audio helpers and experiment tracking.
!pip install transformers datasets accelerate librosa torchaudio wandb
!pip install onnxruntime

# NumPy is reinstalled several times in this cell. The commands run in order,
# so the version left in the environment is the one from the LAST install
# (numpy==1.26.4 below); the earlier installs are intermediate steps.
!pip uninstall -y numpy
!pip install numpy --upgrade --force-reinstall
!pip install --upgrade --force-reinstall soundfile datasets scipy

!pip install numpy==1.24.4

# Full Fix for NumPy Compatibility Issues
!pip uninstall -y numpy
!pip install numpy==1.26.4 --force-reinstall

# torchaudio was already requested above; this repeats the install after the
# NumPy changes.
!pip install torchaudio

"""**Inference & Testing of Each Model**

1. **Kokoro-82M:** A lightweight 82-million-parameter text-to-speech model designed for efficient, fast inference on small devices or edge applications.
2. **CSM-1B:** A 1-billion-parameter conversational speech model that generates natural-sounding speech from text, optionally conditioned on prior conversation context.
3. **XTTS-v2:** A multilingual, multi-speaker text-to-speech model capable of generating realistic speech in various languages and voices from text.
"""

!git clone https://github.com/hexgrad/kokoro.git
!cd kokoro

# 1️⃣ Install required packages
!pip install -q kokoro>=0.9.4 soundfile ipython
!apt-get -qq -y install espeak-ng > /dev/null 2>&1

# 2️⃣ Initialize the pipeline with enhanced configuration
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import torch
from typing import Optional

class KokoroTTS:
    """Notebook-friendly wrapper around kokoro's ``KPipeline``.

    Each call to :meth:`synthesize` prints, plays and (optionally) saves every
    segment the pipeline yields.
    """

    # Sample rate handed to the notebook audio player and to the WAV writer.
    SAMPLE_RATE = 24000

    def __init__(self, lang_code: str = 'a', device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
        """
        Initialize the Kokoro TTS pipeline on the requested device.

        Args:
            lang_code: Language code ('a'=English, 'j'=Japanese, etc.)
            device: 'cuda' or 'cpu'. The default is evaluated once, when the
                class is defined: 'cuda' if ``torch.cuda.is_available()``,
                otherwise 'cpu'.
        """
        self.device = device
        # Forward the device so the argument actually takes effect instead of
        # only being stored on the instance.
        self.pipeline = KPipeline(lang_code=lang_code, device=device)

    def synthesize(
        self,
        text: str,
        voice: str = 'af_heart',
        speed: float = 1.0,
        voice_tensor: Optional[torch.Tensor] = None,
        split_pattern: str = r'\n+',
        save_path: Optional[str] = None
    ) -> Optional[list]:
        """
        Generate speech, play each segment in the notebook and optionally save it.

        Args:
            text: Input text to synthesize
            voice: Predefined voice name; ignored when ``voice_tensor`` is given
            speed: Playback speed (0.5-2.0)
            voice_tensor: Custom voice tensor for cloning
            split_pattern: Regex for text splitting
            save_path: Optional filename prefix; segment ``i`` is written to
                ``f"{save_path}_{i}.wav"``

        Returns:
            A list with one ``(graphemes, phonemes, audio)`` tuple per
            generated segment, or ``None`` if synthesis raised an exception
            (the error is printed rather than propagated).
        """
        # Compare against None explicitly: the truth value of a tensor with
        # more than one element is ambiguous and raises a RuntimeError.
        selected_voice = voice if voice_tensor is None else voice_tensor

        try:
            generator = self.pipeline(
                text,
                voice=selected_voice,
                speed=speed,
                split_pattern=split_pattern
            )

            results = []
            for i, (gs, ps, audio) in enumerate(generator):
                print(f"Segment {i+1}:")
                print(f"Text: {gs}")
                print(f"Phonemes: {ps}")

                # Display in notebook; only the first segment autoplays.
                display(Audio(data=audio, rate=self.SAMPLE_RATE, autoplay=(i == 0)))

                # Save if path provided
                if save_path:
                    segment_path = f"{save_path}_{i}.wav"
                    sf.write(segment_path, audio, self.SAMPLE_RATE)
                    print(f"Saved to {segment_path}")

                results.append((gs, ps, audio))

            return results

        except Exception as e:
            # Best-effort by design: report the failure and signal it with
            # None so a notebook cell does not abort on a synthesis error.
            print(f"Error during synthesis: {str(e)}")
            return None

# 3️⃣ Example Usage
# Demonstrates KokoroTTS end to end: build the wrapper, synthesize a short
# multi-line text with a predefined voice, and save each segment to disk.
if __name__ == "__main__":
    # Initialize with American English
    tts = KokoroTTS(lang_code='a')

    # Sample text with multiple paragraphs
    # (synthesize() splits on newlines by default, so each non-empty line
    # below is processed as its own segment.)
    sample_text = """
    Kokoro is an advanced text-to-speech system that combines efficiency with high-quality output.
    The model supports voice cloning and emotional inflection, making it suitable for various applications.

    With just 82 million parameters, it achieves performance comparable to larger models while being significantly faster.
    The Apache license allows for both research and commercial use.
    """

    # Generate with default voice
    # save_path is a filename prefix: segments are written to
    # default_voice_0.wav, default_voice_1.wav, ...
    # The result is a list of per-segment tuples, or None if synthesis failed.
    print("\nGenerating with default voice...")
    default_results = tts.synthesize(
        text=sample_text,
        voice='af_heart',
        speed=1.1,
        save_path='default_voice'
    )

    # Voice cloning example (requires pre-loaded voice tensor)
    # voice_tensor = torch.load('custom_voice.pt', weights_only=True)
    # print("\nGenerating with cloned voice...")
    # cloned_results = tts.synthesize(
    #     text=sample_text,
    #     voice_tensor=voice_tensor,
    #     speed=0.9,
    #     save_path='cloned_voice'
    # )

# Install the Hugging Face client libraries needed for authentication.
!pip install transformers huggingface-hub

from huggingface_hub import login
# For regular Python scripts:
# Called without arguments. NOTE(review): the captured cell output shows a
# login widget, so this presumably prompts for a token interactively — confirm
# before running unattended.
login()

# Commented out IPython magic to ensure Python compatibility.
!git clone git@github.com:SesameAILabs/csm.git
# %cd csm

!pip install -r requirements.txt

import sys
sys.path.append("/content/csm")  # Add repo to Python path

from generator import load_csm_1b
import torchaudio
import torch

# Set device (MPS for Apple Silicon, CUDA for NVIDIA, else CPU)
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

# Load model
generator = load_csm_1b(device=device)

# Generate speech
audio = generator.generate(
    text="Hello from Sesame!",
    speaker=0,  # Check repo for speaker IDs
    context=[],  # Optional context (if supported)
    max_audio_length_ms=10_000,  # 10 seconds
)

# Save and play audio
torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)

# Play in Colab
import IPython.display as ipd
ipd.Audio("audio.wav")

# Prompts to synthesize one after another with the already-loaded CSM generator.
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming the world.",
    "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
    "Sesame Street is a beloved children's television show.",
]

# Shared generation settings: default speaker, no context, 10 s cap.
for i, text in enumerate(texts):
    print(f"Generating audio for: '{text}'")
    audio = generator.generate(text=text, speaker=0, context=[], max_audio_length_ms=10_000)

    # One WAV per prompt, named by its position in the list.
    wav_path = f"audio_{i}.wav"
    torchaudio.save(wav_path, audio.unsqueeze(0).cpu(), generator.sample_rate)
    display(ipd.Audio(wav_path))

# Install TTS with pip
!pip install TTS

from TTS.api import TTS

# Initialize the TTS with explicit parameters
tts = TTS(model_name="xtts_v2")

# Run TTS with required parameters
tts.tts_to_file(
    text="Hello world! This is a test of the I+XTTS-v2 model.",
    file_path="output.wav",
    speaker_wav="/content/audio.wav",  # Path to a voice sample
    language="en"  # Required language code
)

Collecting git+https://github.com/coqui-ai/TTS.git
  Cloning https://github.com/coqui-ai/TTS.git to /tmp/pip-req-build-p2th7pg1
  Running command git clone --filter=blob:none --quiet https://github.com/coqui-ai/TTS.git /tmp/pip-req-build-p2th7pg1
  Resolved https://github.com/coqui-ai/TTS.git to commit dbf1a08a0d4e47fdad6172e433eeb34bc6b13b4e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting scikit-learn>=1.3.0 (from TTS==0.22.0)
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting anyascii>=0.3.0 (from TTS==0.22.0)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting mutagen==1.47.0 (from TTS==0.22.0)
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pysbd>=0.3.4 (from TTS==0.22.0)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
C

2025-07-27 11:53:07.332430: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753617187.676334      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753617187.780620      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered




config.json: 0.00B [00:00, ?B/s]

  WeightNorm.apply(module, name, dim)


kokoro-v1_0.pth:   0%|          | 0.00/327M [00:00<?, ?B/s]


Generating with default voice...


af_heart.pt:   0%|          | 0.00/523k [00:00<?, ?B/s]

Segment 1:
Text: Kokoro is an advanced text-to-speech system that combines efficiency with high-quality output.
Phonemes: kəkˈɔɹO ɪz ɐn ədvˈænst tˈɛksttəspˈiʧ sˈɪstəm ðæt kəmbˈInz əfˈɪʃənsi wɪð hˌIkwˈɑləTi ˈWtpˌʊt.


Saved to default_voice_0.wav
Segment 2:
Text: The model supports voice cloning and emotional inflection, making it suitable for various applications.
Phonemes: ðə mˈɑdᵊl səpˈɔɹts vˈYs klˈOnɪŋ ænd əmˈOʃᵊnəl ɪnflˈɛkʃən, mˈAkɪŋ ɪt sˈuTəbᵊl fɔɹ vˈɛɹiəs ˌæpləkˈAʃənz.


Saved to default_voice_1.wav
Segment 3:
Text: With just 82 million parameters, it achieves performance comparable to larger models while being significantly faster.
Phonemes: wˌɪð ʤˈʌst ˈATi tˈu mˈɪljᵊn pəɹˈæməTəɹz, ɪt əʧˈivz pəɹfˈɔɹməns kˈɑmpəɹəbᵊl tə lˈɑɹʤəɹ mˈɑdᵊlz wˌIl bˈiɪŋ səɡnˈɪfəkəntli fˈæstəɹ.


Saved to default_voice_2.wav
Segment 4:
Text: The Apache license allows for both research and commercial use.
Phonemes: ði əpˈæʧi lˈIsᵊns əlˈWz fɔɹ bˈOθ ɹˈisˌɜɹʧ ænd kəmˈɜɹʃəl jˈus.


Saved to default_voice_3.wav


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Cloning into 'csm'...
The authenticity of host 'github.com (140.82.114.4)' can't be established.
ED25519 key fingerprint is SHA256:+DiY3wvvV6TuJJhbpZisF/zLDA0zPMSvHdkr4UvCOqU.
This key is not known by any other names
Are you sure you want to continue connecting (yes/no/[fingerprint])? 

In [3]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted.
start = time.perf_counter()
# run inference
# (placeholder: put the call being timed between the two clock readings —
# with nothing here the reported time is ~0.00 seconds)
end = time.perf_counter()
print(f"Inference time: {end - start:.2f} seconds")


Inference time: 0.00 seconds


In [4]:
import torch

# Report torch.cuda.memory_allocated() divided by 1024**2, i.e. scaled from
# the raw value to the "MB" figure that is printed.
bytes_per_mb = 1024 ** 2
vram_mb = torch.cuda.memory_allocated() / bytes_per_mb
print(f"VRAM usage: {vram_mb:.2f} MB")


VRAM usage: 0.00 MB


In [9]:
import os

base_path = "/kaggle/input/librispeech-small-dataset"

# Walk the dataset directory recursively and print the full path of every
# file found (directories themselves are not printed).
for dirpath, _subdirs, filenames in os.walk(base_path):
    for filename in filenames:
        full_path = os.path.join(dirpath, filename)
        print(full_path)


/kaggle/input/librispeech-small-dataset/metadata.csv
/kaggle/input/librispeech-small-dataset/wavs/3092-39501-0100.wav
/kaggle/input/librispeech-small-dataset/wavs/5304-109507-0010.wav
/kaggle/input/librispeech-small-dataset/wavs/1638-84447-0023.wav
/kaggle/input/librispeech-small-dataset/wavs/5808-54425-0039.wav
/kaggle/input/librispeech-small-dataset/wavs/7398-98876-0020.wav
/kaggle/input/librispeech-small-dataset/wavs/1498-140041-0037.wav
/kaggle/input/librispeech-small-dataset/wavs/1777-142745-0007.wav
/kaggle/input/librispeech-small-dataset/wavs/23-124439-0060.wav
/kaggle/input/librispeech-small-dataset/wavs/4788-94904-0024.wav
/kaggle/input/librispeech-small-dataset/wavs/103-1241-0001.wav
/kaggle/input/librispeech-small-dataset/wavs/770-134592-0021.wav
/kaggle/input/librispeech-small-dataset/wavs/708-129393-0105.wav
/kaggle/input/librispeech-small-dataset/wavs/8312-279791-0052.wav
/kaggle/input/librispeech-small-dataset/wavs/3361-130131-0032.wav
/kaggle/input/librispeech-small-dat

In [10]:
import pandas as pd

# Load the dataset's metadata table and show its first rows for inspection.
metadata_csv = "/kaggle/input/librispeech-small-dataset/metadata.csv"
df = pd.read_csv(metadata_csv)
preview = df.head()
print(preview)


                                          audio_path             filename  \
0  ../input/librispeech-asr-wav-dataset/train-cle...  100-121669-0019.wav   
1  ../input/librispeech-asr-wav-dataset/train-cle...  100-121669-0014.wav   
2  ../input/librispeech-asr-wav-dataset/train-cle...  100-121669-0013.wav   
3  ../input/librispeech-asr-wav-dataset/train-cle...  100-121674-0035.wav   
4  ../input/librispeech-asr-wav-dataset/train-cle...  100-121674-0017.wav   

            subset  speaker_id  chapter_id  file_id               id sex  \
0  train-clean-360         100      121669       19  100-121669-0019   F   
1  train-clean-360         100      121669       14  100-121669-0014   F   
2  train-clean-360         100      121669       13  100-121669-0013   F   
3  train-clean-360         100      121674       35  100-121674-0035   F   
4  train-clean-360         100      121674       17  100-121674-0017   F   

   minute speaker_name                                           sentence  
0   

In [12]:
import shutil

# Remove any earlier preprocessing output so the next run starts from an empty
# directory. ignore_errors=True keeps this silent when the directory is absent.
processed_dir = "/kaggle/working/processed_data"
shutil.rmtree(processed_dir, ignore_errors=True)

print("✅ Cleanup complete: '/kaggle/working/processed_data' has been removed.")


✅ Cleanup complete: '/kaggle/working/processed_data' has been removed.


In [13]:
import os
import pandas as pd
import librosa
import soundfile as sf
from pathlib import Path
from tqdm import tqdm

# 📁 Paths — inputs come from the read-only dataset mount, outputs go to the
# working directory, which is created up front if it does not exist yet.
DATASET_ROOT = "/kaggle/input/librispeech-small-dataset"
INPUT_DIR = f"{DATASET_ROOT}/wavs"
METADATA_PATH = f"{DATASET_ROOT}/metadata.csv"
OUTPUT_DIR = "/kaggle/working/processed_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ⚙️ Audio settings
TARGET_SR = 16000    # sample rate every file is resampled to
CLIP_MIN = 5         # minimum segment duration (in seconds)
CLIP_MAX = 10        # maximum segment duration (in seconds)
MAX_SAMPLES = 200    # number of metadata rows to process

# 📖 Load metadata, report the full size, then keep only the first rows
df = pd.read_csv(METADATA_PATH)
print(f"✅ Total samples in dataset: {len(df)}")
df = df.head(MAX_SAMPLES)

# ✨ Running counters and the rows of the training CSV, filled in while
# the clips are written
accepted = 0
skipped = 0
train_entries = []

# 🛠 Audio preprocessing
def preprocess_audio(file_path, target_sr=16000):
    """Load an audio file and return it resampled, trimmed and normalized.

    The file is read at its native sample rate, resampled to ``target_sr``,
    passed through ``librosa.effects.trim`` and then through
    ``librosa.util.normalize`` (both with librosa's default settings).

    Args:
        file_path: Path of the audio file to load.
        target_sr: Sample rate to resample to.

    Returns:
        A ``(samples, target_sr)`` tuple.
    """
    waveform, native_sr = librosa.load(file_path, sr=None)
    resampled = librosa.resample(waveform, orig_sr=native_sr, target_sr=target_sr)
    # trim() returns (trimmed_signal, interval); only the signal is needed.
    trimmed = librosa.effects.trim(resampled)[0]
    return librosa.util.normalize(trimmed), target_sr

# 🔪 Split long files into 5–10s clips
def split_and_save(y, sr, base_name, transcript):
    """Cut a waveform into consecutive clips and write each with its text file.

    The signal is sliced into back-to-back windows of ``CLIP_MAX`` seconds.
    Every window of at least ``CLIP_MIN`` seconds is written to ``OUTPUT_DIR``
    as ``{base_name}_{i}.wav`` plus a matching ``.txt`` file, and a row is
    appended to the module-level ``train_entries`` list. The first window
    shorter than ``CLIP_MIN`` seconds (in practice the tail of the file) is
    counted in ``skipped`` and ends the loop.

    NOTE(review): every clip is paired with the FULL transcript of the source
    file, even when the file is split into several clips, so the text of a
    multi-clip file does not match the audio of each individual clip.

    Args:
        y: Audio samples of the whole file.
        sr: Sample rate of ``y``.
        base_name: Filename stem used for the output files.
        transcript: Text of the source file; must support ``.strip()``.

    Side effects:
        Writes files under ``OUTPUT_DIR`` and updates the module-level
        ``accepted``, ``skipped`` and ``train_entries``.
    """
    global accepted, skipped
    clip_samples = sr * CLIP_MAX
    start = 0
    i = 0

    while start < len(y):
        end = start + clip_samples
        segment = y[start:end]
        duration = librosa.get_duration(y=segment, sr=sr)

        # Reject short clips
        if duration < CLIP_MIN:
            skipped += 1
            break

        audio_out = os.path.join(OUTPUT_DIR, f"{base_name}_{i}.wav")
        text_out = os.path.join(OUTPUT_DIR, f"{base_name}_{i}.txt")
        # Strip once and reuse for both the text file and the CSV row.
        text = transcript.strip()

        sf.write(audio_out, segment, sr)
        # Explicit UTF-8 so the transcript files do not depend on the
        # platform's default encoding.
        with open(text_out, "w", encoding="utf-8") as f:
            f.write(text)

        # For training CSV
        train_entries.append({
            "audio_path": audio_out,
            "text": text
        })

        accepted += 1
        i += 1
        start = end

# 🌀 Process
# Run every selected metadata row through preprocessing and clip splitting.
for _, record in tqdm(df.iterrows(), total=len(df)):
    filename = record['filename']
    wav_file = os.path.join(INPUT_DIR, filename)

    # Only rows whose audio file exists on disk are processed.
    if os.path.isfile(wav_file):
        try:
            y, sr = preprocess_audio(wav_file, target_sr=TARGET_SR)
            split_and_save(y, sr, filename.replace(".wav", ""), record['sentence'])
        except Exception as e:
            # A failure on one file is reported and the loop moves on.
            print(f"❌ Error processing {wav_file}: {e}")

# 💾 Write the collected clip/transcript pairs as the training metadata CSV
metadata_out = os.path.join(OUTPUT_DIR, "metadata.csv")
pd.DataFrame(train_entries).to_csv(metadata_out, index=False)

print(f"✅ Done! Accepted clips: {accepted}, Skipped: {skipped}")
print("📂 Output directory: /kaggle/working/processed_data")


✅ Total samples in dataset: 45537


100%|██████████| 200/200 [00:04<00:00, 40.59it/s]

✅ Done! Accepted clips: 257, Skipped: 117
📂 Output directory: /kaggle/working/processed_data



