In [None]:
%%markdown
## 1. Google Colab Setup

These cells help you set up Google Colab for efficient training:
- Mount Google Drive to save checkpoints
- Keep Colab from disconnecting
- Upload project files as a ZIP


In [None]:
# Mount Google Drive and make sure the checkpoint folder on it exists
import os

from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Folder on Drive for checkpoints; an already existing folder is reused as-is
checkpoint_dir = os.path.join(drive_mount_point, 'MyDrive', 'VITS_Azerbaijani_Checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)
print(f"Checkpoints will be saved to: {checkpoint_dir}")


In [None]:
# Keep Colab from disconnecting (click repeatedly)
from IPython.display import display, Javascript
import time

def keep_alive():
    """Inject a browser-side timer that clicks Colab's connect button every 60 s.

    The JavaScript runs in the notebook front end, not in the kernel. Each call
    registers one more `setInterval` timer, so run this cell once per session.

    The click is skipped (and logged to the browser console) when the
    `colab-connect-button` element is not found, instead of raising a
    TypeError on every tick.
    NOTE(review): whether the selector matches depends on the current Colab
    page structure - confirm in the browser console.
    """
    display(Javascript('''
        function ClickConnect(){
            var button = document.querySelector("colab-connect-button");
            if (button) {
                console.log("Clicking the connect button");
                button.click();
            } else {
                console.log("Connect button not found; skipping click");
            }
        }
        setInterval(ClickConnect, 60000)
    '''))

print("Keep-alive activated. Colab will click connect every 60 seconds.")
keep_alive()


In [None]:
# Upload project files as a ZIP
from google.colab import files
import shutil

def upload_and_extract_project():
    print("Please upload the project ZIP file...")
    uploaded = files.upload()
    
    for filename in uploaded.keys():
        if filename.endswith('.zip'):
            print(f"Extracting {filename}...")
            # Extract to current directory
            !unzip -o "{filename}"
            print(f"Extracted {filename} successfully!")
        else:
            print(f"Skipping {filename} - not a ZIP file")
    
    # Check for main project files
    if os.path.exists('train.py'):
        print("✅ Project files extracted successfully!")
        !ls -la
    else:
        print("❌ Project files not found. Ensure your ZIP has the project files in its root.")

# Execute the function
upload_and_extract_project()


# VITS Azerbaijani Text-to-Speech Tutorial

This unified notebook walks you through an **end-to-end** workflow for training and using a VITS model for Azerbaijani TTS, including *zero-shot voice cloning* and an interactive **Gradio** demo.

**Sections**
1. Google Colab Setup (Drive mount, upload project)
2. Installation & Environment setup
3. Configuration overview
4. Dataset preparation & filelists
5. Audio preprocessing & normalization
6. Model training with checkpoint saving
7. Inference examples (TTS & voice cloning)
8. Web demo with Gradio

> ⚠️ Designed for Google Colab (GPU) but works locally with minor tweaks.

## 2. Installation & Environment setup

In [None]:
# System packages installed via apt:
#   espeak - phonemizer backend
#   ffmpeg - NOTE(review): presumably needed by pydub for audio decoding; confirm
# The install only runs if `apt-get update` succeeds (&&).
!apt-get update -y && apt-get install -y espeak ffmpeg

In [None]:
# Python packages
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q numpy scipy librosa unidecode tensorboard phonemizer webdataset gradio tqdm pydub

## 3. Configuration overview

The project ships with two JSON configs in `config/`:
- **`base_vits.json`** – main training hyper-parameters  
- **`hifigan.json`** – HiFiGAN vocoder settings (used internally by VITS)

Feel free to tweak, e.g. `batch_size`, `learning_rate`, or enable multi-speaker training.

In [None]:
import json, pprint, pathlib
# Read the main training config and pretty-print its "train" section
cfg_path = pathlib.Path('config/base_vits.json')
hps = json.loads(cfg_path.read_text())
pprint.pp(hps['train'])

## 4. Dataset preparation & filelists

In [None]:
# Optional: Upload dataset ZIP if needed
from google.colab import files
import os

def upload_and_extract_dataset():
    print("Please upload your dataset ZIP file...")
    uploaded = files.upload()
    
    for filename in uploaded.keys():
        if filename.endswith('.zip'):
            print(f"Extracting {filename} to datasets/...")
            os.makedirs('datasets', exist_ok=True)
            !unzip -o "{filename}" -d datasets/
            print(f"Dataset extracted successfully!")
            !ls -la datasets/
        else:
            print(f"Skipping {filename} - not a ZIP file")

# Uncomment and run if you need to upload a dataset
# upload_and_extract_dataset()


Each line in the filelists must follow:
```text
path/to/audio.wav|Azerbaijani transcript
```
Use the helper script below to auto-generate `train.txt` and `val.txt`.

In [None]:
%%markdown
## 5. Audio preprocessing & normalization

It's important to normalize audio before training to ensure consistent volume levels and remove any DC offset. Let's add a preprocessing step for that.


In [None]:
import os
import glob
from pydub import AudioSegment
import librosa
import soundfile as sf
import numpy as np
from tqdm.notebook import tqdm
import multiprocessing

def process_audio_file(file_path, target_sr=22050, target_level=-23.0, output_dir=None):
    """Resample, DC-center and RMS-normalize one audio file, then write it out.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sample rate (Hz) of the written file.
        target_level: Target RMS level in dB, where 0 dB corresponds to an
            RMS amplitude of 1.0.
        output_dir: Directory for the result (created if missing). Only the
            base name of `file_path` is kept there, so inputs from different
            folders that share a file name write to the same output path.
            When omitted, the input file is overwritten in place.

    Returns:
        True on success; False if any step raised (the error is printed).
    """
    try:
        # Decide where the result goes before touching the audio
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            destination = os.path.join(output_dir, os.path.basename(file_path))
        else:
            destination = file_path

        # sr=None: load at the file's own sample rate, resample only if it differs
        samples, rate = librosa.load(file_path, sr=None)
        if rate != target_sr:
            samples = librosa.resample(samples, orig_sr=rate, target_sr=target_sr)
            rate = target_sr

        # Remove DC offset
        centered = samples - np.mean(samples)

        # Scale so the RMS matches the target level; 1e-8 avoids division by zero on silence
        current_rms = np.sqrt(np.mean(centered**2))
        desired_rms = 10**(target_level/20)
        scaled = centered * (desired_rms / (current_rms + 1e-8))

        # Peak-limit: if the gain pushed any sample past 0.99, rescale the whole signal
        peak = np.max(np.abs(scaled))
        if peak > 0.99:
            scaled = scaled / peak * 0.99

        sf.write(destination, scaled, rate)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return False
    return True

def normalize_dataset(dataset_dir, output_dir=None):
    """Normalize all WAV files under a directory (searched recursively).

    Each file is handed to `process_audio_file` with target_sr=22050 and
    target_level=-23.0, using one worker process per CPU.

    Args:
        dataset_dir: Root directory searched for `*.wav` files.
        output_dir: Passed through to `process_audio_file`; when None the
            input files are overwritten in place.

    Returns:
        None. A summary of how many files succeeded is printed.
    """
    from functools import partial  # local import keeps this cell self-contained

    wav_files = glob.glob(os.path.join(dataset_dir, "**", "*.wav"), recursive=True)
    print(f"Found {len(wav_files)} WAV files to process")

    if not wav_files:
        print("No WAV files found!")
        return

    worker = partial(
        process_audio_file,
        target_sr=22050,
        target_level=-23.0,
        output_dir=output_dir,
    )

    # pool.imap yields results one by one (in input order), so the progress bar
    # advances as files finish. pool.starmap returned only after every file was
    # done, which left the bar at 0% until the very end.
    with multiprocessing.Pool(processes=os.cpu_count()) as pool:
        results = list(tqdm(pool.imap(worker, wav_files), total=len(wav_files)))

    success_count = results.count(True)
    print(f"Successfully processed {success_count} of {len(wav_files)} files")

# Run normalization on your dataset
# Uncomment and run when needed:
# normalize_dataset('datasets', output_dir='datasets_normalized')


In [None]:
!python data/tools/prepare_filelist.py \
    --wavs datasets \
    --output data/filelists \
    --val-ratio 0.05

## 6. Model training with checkpoint saving

In [None]:
%%markdown
### Training Progress Monitor

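Run the next cell in a separate tab to monitor training progress and confirm that checkpoints are being written to the local `checkpoints` directory (backups to Google Drive are handled separately by the training cell).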


In [None]:
import time
import os
import glob
import sys

# Paths to monitor - always use local checkpoints directory
checkpoint_dir = 'checkpoints'


def _checkpoint_snapshot(directory):
    """Return (path, mtime, size_bytes) for every *.pt file in `directory`, newest first.

    Files that disappear between the glob and the stat call are skipped.
    """
    snapshot = []
    for path in glob.glob(f"{directory}/*.pt"):
        try:
            info = os.stat(path)
        except OSError:
            # e.g. the training run rotated this checkpoint away (--keep_last)
            continue
        snapshot.append((path, info.st_mtime, info.st_size))
    snapshot.sort(key=lambda entry: entry[1], reverse=True)
    return snapshot


# Monitor function
def monitor_training(interval=60, max_checks=None, directory=None):
    """Periodically print the newest checkpoint files until interrupted.

    Args:
        interval: Seconds to sleep between checks.
        max_checks: Stop after this many checks; None (default) loops until
            a KeyboardInterrupt, as before.
        directory: Directory to watch; None (default) uses the module-level
            `checkpoint_dir`.

    Returns:
        None. Status is printed; KeyboardInterrupt ends the loop cleanly.
    """
    watched_dir = checkpoint_dir if directory is None else directory
    checks_done = 0
    try:
        while max_checks is None or checks_done < max_checks:
            snapshot = _checkpoint_snapshot(watched_dir)

            # Print status
            print(f"\n=== Training Status: {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
            print(f"Found {len(snapshot)} checkpoint files in {watched_dir}")

            if snapshot:
                # Show the three most recently modified checkpoints
                print("\nMost recent checkpoints:")
                for rank, (path, mtime, size_bytes) in enumerate(snapshot[:3], start=1):
                    mod_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(mtime))
                    size_mb = size_bytes / (1024 * 1024)
                    print(f"{rank}. {os.path.basename(path)} - {size_mb:.2f} MB - Last modified: {mod_time}")
            else:
                print("No checkpoints found yet. Training may not have saved a checkpoint.")

            checks_done += 1
            if max_checks is not None and checks_done >= max_checks:
                break  # no point sleeping after the final check

            # Wait for next check
            print(f"\nNext check in {interval} seconds...")
            time.sleep(interval)
    except KeyboardInterrupt:
        print("\nMonitoring stopped")

# Run the monitor (uncomment to start)
# monitor_training(interval=60)


In [None]:
# Set up backup for important checkpoints if on Colab
import os, sys
if 'google.colab' in sys.modules:
    # Define paths
    local_checkpoint_dir = 'checkpoints'
    drive_backup_dir = '/content/drive/MyDrive/VITS_Azerbaijani_Checkpoints'
    
    # Make sure both directories exist
    os.makedirs(local_checkpoint_dir, exist_ok=True)
    os.makedirs(drive_backup_dir, exist_ok=True)
    
    print(f"Training will use local checkpoints folder. Best model will be backed up to Google Drive.")
    
    # Create a script to back up important checkpoints
    with open('backup_checkpoints.py', 'w') as f:
        f.write("""
import os, shutil, time
import glob

def backup_important_checkpoints():
    src_dir = 'checkpoints'
    dst_dir = '/content/drive/MyDrive/VITS_Azerbaijani_Checkpoints'
    
    # Check if best model exists and back it up
    best_model = os.path.join(src_dir, 'best.pt')
    if os.path.exists(best_model):
        shutil.copy2(best_model, os.path.join(dst_dir, 'best.pt'))
        print(f"Backed up best model to {dst_dir}")
    
    # Also back up latest checkpoint
    checkpoints = glob.glob(f"{src_dir}/*.pt")
    if checkpoints:
        latest = max(checkpoints, key=os.path.getmtime)
        if not latest.endswith('best.pt'):
            latest_name = os.path.basename(latest)
            shutil.copy2(latest, os.path.join(dst_dir, latest_name))
            print(f"Backed up latest checkpoint {latest_name} to {dst_dir}")

if __name__ == "__main__":
    while True:
        backup_important_checkpoints()
        time.sleep(300)  # Check every 5 minutes
""")
    
    # Start backup script in background
    !nohup python backup_checkpoints.py > backup_log.txt 2>&1 &
    print("Automatic checkpoint backup running in background")

# Run training with local checkpoint directory
!python train.py \
  --config config/base_vits.json \
  --batch_size 16 \
  --epochs 1000 \
  --checkpoint_dir checkpoints \
  --log_dir logs \
  --save_every 10 \
  --keep_last 3

## 7. Inference examples

In [None]:
import torch, IPython.display as ipd
from model.vits import VITSInference  # helper class provided in repo
import glob
import os, sys

# Always use the local checkpoints directory
checkpoint_dir = 'checkpoints'

# Candidate checkpoints; checkpoint_path stays None when there are none
checkpoint_files = glob.glob(f"{checkpoint_dir}/*.pt")
checkpoint_path = None

if checkpoint_files:
    # best.pt takes precedence; otherwise fall back to the most recently modified file
    best_checkpoint = os.path.join(checkpoint_dir, 'best.pt')
    checkpoint_path = (
        best_checkpoint
        if os.path.exists(best_checkpoint)
        else max(checkpoint_files, key=os.path.getmtime)
    )
    print(f"Using checkpoint: {os.path.basename(checkpoint_path)}")

    # Initialize the model
    tts = VITSInference(checkpoint=checkpoint_path, config='config/base_vits.json')

    # Basic synthesis, played back at 22050 Hz
    audio = tts.synthesize('Salam dünya! Bu VITS nümunəsidir.')
    ipd.display(ipd.Audio(audio, rate=22050))
else:
    print(f"No checkpoint files found in {checkpoint_dir}!")

### Voice cloning (zero-shot)

In [None]:
# Pick the reference voice: an uploaded file on Colab, otherwise the example file
default_reference_wav = 'datasets/02.wav'
reference_wav = None
if 'google.colab' in sys.modules:
    from google.colab import files
    print("Upload a reference voice file (.wav):")
    uploaded = files.upload()
    
    if uploaded:
        # Only the first uploaded file is used
        reference_wav = list(uploaded.keys())[0]
        print(f"Using uploaded file: {reference_wav}")
    else:
        reference_wav = default_reference_wav
        print(f"Using default file: {reference_wav}")
else:
    reference_wav = default_reference_wav

# Try voice cloning if the model and reference file are available.
# When it is skipped, say why instead of finishing silently.
if 'tts' not in globals():
    print("Skipping voice cloning: no model loaded (run the inference cell with a checkpoint first).")
elif not reference_wav or not os.path.exists(reference_wav):
    print(f"Skipping voice cloning: reference file not found: {reference_wav}")
else:
    cloned = tts.synthesize(
        'Mənim səsimlə danışan süni zəka!',
        speaker_ref=reference_wav)
    ipd.display(ipd.Audio(cloned, rate=22050))

## 8. Gradio demo

In [None]:
# If running on Colab, make sure to use a public URL
if 'google.colab' in sys.modules:
    !python app.py --share   # launches public URL
else:
    !python app.py   # launches on http://<your_ip>:7860