# datset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm
import shutil
import subprocess
import soundfile as sf
import librosa
from sklearn.model_selection import train_test_split
import requests
import tarfile

In [3]:
def download_dataset(url: str, save_path: str, chunk_size: int = 8192) -> None:
    """
    Download a file from URL with progress bar

    Args:
        url: Download URL
        save_path: Path to save the downloaded file
        chunk_size: Size of chunks for downloading
    """
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Send GET request to the URL
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise exception for bad status codes

    # Get total file size
    total_size = int(response.headers.get('content-length', 0))

    # Download with progress bar
    with open(save_path, 'wb') as file, \
         tqdm(
            desc=f"Downloading {os.path.basename(save_path)}",
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
         ) as progress_bar:
        for data in response.iter_content(chunk_size=chunk_size):
            size = file.write(data)
            progress_bar.update(size)

def extract_tgz(tgz_path: str, extract_path: str) -> None:
    """
    Extract .tgz file

    Args:
        tgz_path: Path to the .tgz file
        extract_path: Path where to extract the contents
    """
    print(f"\nExtracting {os.path.basename(tgz_path)}...")
    with tarfile.open(tgz_path, 'r:gz') as tar:
        # Get total number of members for progress bar
        members = tar.getmembers()
        total = len(members)

        # Extract with progress bar
        for i, member in enumerate(members):
            tar.extract(member, path=extract_path)
            percent = (i + 1) / total * 100
            print(f"Progress: {percent:.1f}%", end='\r')
    print("\nExtraction completed!")

# Main execution
if __name__ == "__main__":
    # Configuration
    dataset_url ="https://indicvoices.ai4bharat.org/backend/download_dataset/v2_Sanskrit_train.tgz?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MzA5MTAxNTcsImlhdCI6MTczMDczNzM1NywiZW1haWwiOiJhamVldGttaXNocmEyMDAyQGdtYWlsLmNvbSJ9.puApCu4kvIbCdepV5xBH88gyAxt1OOaKgZG3VYP0uEg"
    save_dir = "/content/drive/MyDrive/Dataset"
    tgz_filename = "v2_Sanskrit_train.tgz"

    # Full paths
    tgz_path = os.path.join(save_dir, tgz_filename)
    extract_path = os.path.join(save_dir, "extracted")

    try:
        # Download the dataset
        print("Starting download...")
        download_dataset(dataset_url, tgz_path)
        print("Download completed!")

        # Extract the dataset
        extract_tgz(tgz_path, extract_path)

    except Exception as e:
        print(f"An error occurred: {str(e)}")

Starting download...


Downloading v2_Sanskrit_train.tgz: 100%|██████████| 4.93G/4.93G [04:27<00:00, 19.8MiB/s]


Download completed!

Extracting v2_Sanskrit_train.tgz...
Progress: 100.0%
Extraction completed!


In [4]:
!git clone https://github.com/jaywalnut310/vits.git
%cd vits
!pip install -r requirements.txt
# Install other dependencies
!pip install pyngrok
!pip install soundfile
!pip install librosa

Cloning into 'vits'...
remote: Enumerating objects: 81, done.[K
remote: Total 81 (delta 0), reused 0 (delta 0), pack-reused 81 (from 1)[K
Receiving objects: 100% (81/81), 3.33 MiB | 17.78 MiB/s, done.
Resolving deltas: 100% (22/22), done.
/content/vits
Collecting Cython==0.29.21 (from -r requirements.txt (line 1))
  Downloading Cython-0.29.21-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting librosa==0.8.0 (from -r requirements.txt (line 2))
  Downloading librosa-0.8.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting matplotlib==3.3.1 (from -r requirements.txt (line 3))
  Downloading matplotlib-3.3.1.tar.gz (38.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.18.5 (from -r 

In [None]:
rm -rf /content/dataset/out/wavs

In [5]:
# Define paths
json_dir = '/content/drive/MyDrive/Dataset/extracted/Sanskrit/rv1b/train'
wav_dir = '/content/drive/MyDrive/Dataset/extracted/Sanskrit/rv1b/train'
output_dir = '/content/dataset/out'
metadata_file = os.path.join(output_dir, 'metadata.csv')

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
os.makedirs(os.path.join(output_dir, 'wavs'), exist_ok=True)

# Initialize a list to store metadata
metadata = []

# Process each JSON file
for json_file in tqdm(os.listdir(json_dir)):
    if json_file.endswith('.json'):
        with open(os.path.join(json_dir, json_file), 'r', encoding='utf-8') as f:
            data = json.load(f)
            base_audio_file = os.path.splitext(json_file)[0] + '.wav'
            for i, segment in enumerate(data['verbatim']):
                text = segment['text']
                start = segment['start']
                end = segment['end']
                speaker = data['speaker_id']
                segment_audio_file = f"{os.path.splitext(json_file)[0]}_{i}.wav"

                # Add entry to metadata
                metadata.append([segment_audio_file, text, speaker])

                # Extract the segment audio
                src_audio_path = os.path.join(wav_dir, base_audio_file)
                dst_audio_path = os.path.join(output_dir, 'wavs', segment_audio_file)
                # Use ffmpeg to extract the segment, with error handling
                try:
                    # Use subprocess.run for better error handling
                    subprocess.run(['ffmpeg', '-i', src_audio_path, '-ss', str(start), '-to', str(end), '-c', 'copy', dst_audio_path], check=True)
                except subprocess.CalledProcessError as e:
                    print(f"Error extracting audio for {segment_audio_file}: {e}")
                    continue  # Skip this segment if ffmpeg fails

# Save metadata to CSV
metadata_df = pd.DataFrame(metadata, columns=['id', 'text', 'speaker'])
metadata_df.to_csv(metadata_file, sep='|', index=False, header=False)

print("Dataset conversion completed.")

100%|██████████| 546/546 [03:04<00:00,  2.97it/s]

Dataset conversion completed.





In [6]:
# # Normalize audio files
# normalized_wavs_dir = os.path.join(output_dir, 'wavs_normalized')
# os.makedirs(normalized_wavs_dir, exist_ok=True)

# print("Normalizing audio files...")
# for idx, row in tqdm(metadata_df.iterrows(), total=len(metadata_df)):
#     filename = row['id']
#     filepath = os.path.join(output_dir, 'wavs', filename)
#     y, sr = librosa.load(filepath, sr=22050)
#     y = y / max(abs(y))
#     normalized_filepath = os.path.join(normalized_wavs_dir, filename)
#     sf.write(normalized_filepath, y, sr)

# Normalize audio files
normalized_wavs_dir = os.path.join(output_dir, 'wavs_normalized')
os.makedirs(normalized_wavs_dir, exist_ok=True)

print("Normalizing audio files...")
for idx, row in tqdm(metadata_df.iterrows(), total=len(metadata_df)):
    filename = row['id']
    filepath = os.path.join(output_dir, 'wavs', filename)

    # Check if the file exists before processing
    if not os.path.exists(filepath):
        print(f"Warning: File not found: {filepath}, skipping...")
        continue

    try:
        y, sr = librosa.load(filepath, sr=22050)
        y = y / max(abs(y))
        normalized_filepath = os.path.join(normalized_wavs_dir, filename)
        sf.write(normalized_filepath, y, sr)
    except (LibsndfileError, FileNotFoundError) as e:
        print(f"Error processing {filename}: {e}, skipping...")
        # Optionally, you could remove the problematic file:
        # os.remove(filepath)


Normalizing audio files...


100%|██████████| 1211/1211 [00:43<00:00, 27.66it/s]


In [7]:
# Update metadata paths
metadata_df['id'] = metadata_df['id'].apply(lambda x: os.path.join('wavs_normalized', x))
metadata_df.to_csv(metadata_file, sep='|', index=False, header=False)


In [8]:
# Split data
train_meta, temp_meta = train_test_split(metadata_df, test_size=0.2, random_state=42)
val_meta, test_meta = train_test_split(temp_meta, test_size=0.5, random_state=42)

# Save splits
train_meta.to_csv(os.path.join(output_dir, 'train_metadata.csv'), sep='|', index=False, header=False)
val_meta.to_csv(os.path.join(output_dir, 'val_metadata.csv'), sep='|', index=False, header=False)
test_meta.to_csv(os.path.join(output_dir, 'test_metadata.csv'), sep='|', index=False, header=False)

In [9]:
!pip install -U pip
!pip install TTS

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.3.1
Collecting TTS
  Downloading TTS-0.22.0-cp310-cp310-manylinux1_x86_64.whl.metadata (21 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting umap-learn>=0.5.1 (from TTS)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Colle

In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

In [2]:
output_path = "/content/dataset/out"


dataset_config = BaseDatasetConfig(
    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "", )
)

In [3]:
audio_config = VitsAudioConfig(
    sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
)

In [4]:
import pandas as pd
import re

# Load the metadata CSV file
metadata_df = pd.read_csv('/content/dataset/out/metadata.csv', sep='|', header=None, names=['id', 'text', 'speaker'])

# Remove English alphabets and other characters from the 'text' column
metadata_df['text'] = metadata_df['text'].astype(str).apply(lambda text: re.sub(r'[0-9a-zA-Z!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', text))

# Extract the cleaned 'text' column
text_data = metadata_df['text']

# Get all characters from the cleaned text data
all_characters = ''.join(text_data)

# Get unique characters
unique_characters = set(all_characters)

# Print the unique characters
print("Unique Characters (after removing English alphabets and others):")
print("".join(sorted(list(unique_characters))))

Unique Characters (after removing English alphabets and others):

 (),-.5:ABDEFIKLMNOPRSU[]_abcdefghiklmnoprstuvwxyँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसह़ऽािीुूृेैॉोौ्ज़फ़।‌


In [5]:
character_config = CharactersConfig(
    characters_class= "TTS.tts.models.vits.VitsCharacters",
    characters= "".join(sorted(list(unique_characters))),
    punctuations=" !,.?-",
    pad= "<PAD>",
    eos= "<EOS>",
    bos= "<BOS>",
    blank= "<BLNK>",
)

In [17]:
config = VitsConfig(
    audio=audio_config,
    characters=character_config,
    run_name="vits_mr",
    batch_size=16,
    eval_batch_size=4,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=0,
    epochs=1,
    text_cleaner="basic_cleaners",
    use_phonemes=False,
    phoneme_language="sa",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    print_step=25,
    print_eval=False,
    save_best_after=20,
    save_checkpoints=True,
    save_all_best=True,
    mixed_precision=True,
    max_text_len=150,  # change this if you have a larger VRAM than 16GB
    output_path='/content/drive/MyDrive/out/check',
    datasets=[dataset_config],
    cudnn_benchmark=False,
    test_sentences=[['हरिओं नमस्ते अहम् अ पर्वतवर्धिनी इति  अ अहम् अ '],
                    ['अ भवन्तः  अ एत् वित्तकोशः  अस्ति तत्र नूतनम् एकम् अ अकाउण्ट अ उद्घाटयितुम् इच्छामि  अ तदर्थम् अहम् दुरवाणीम्  कृतवती   ']
    ]
)

In [7]:
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [8]:
%cd monotonic_align
# os.makedirs('/content/vits/monotonic_align/monotonic_align')
!python setup.py build_ext --inplace
%cd ..

[Errno 2] No such file or directory: 'monotonic_align'
/content
python3: can't open file '/content/setup.py': [Errno 2] No such file or directory
/


In [13]:
import os
import pandas as pd
import soundfile as sf

def formatter(root_path, manifest_file, **kwargs):
    """Assumes metadata.csv has columns: 'id', 'text', 'speaker' """
    csv_file = os.path.join(root_path, manifest_file)
    items = []
    df = pd.read_csv(csv_file, sep='|', header=None, names=['id', 'text', 'speaker'])

    for index, row in df.iterrows():
        wav_file = os.path.join (root_path, row['id'])
        try:
            # Attempt to load the audio file to check for corruption
            sf.read(wav_file)
            text = row['text']
            speaker_name = row['speaker']
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
        except Exception as e:  # Catch any exception during audio loading
            print(f"Skipping corrupted audio file: {wav_file}. Error: {e}")
            # If an exception occurs, ensure items is not empty
            if not items:  # Check if items is empty
                items = [{"text": "", "audio_file": "", "speaker_name": "", "root_path": root_path}]
            continue  # Skip to the next file

    return items

In [14]:
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
formatter=formatter)

 | > Found 1211 files in /content/dataset/out


In [30]:
pip install tensorboard



In [18]:
# init model
model = Vits(config, ap, tokenizer, speaker_manager=None)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 2
 | > Num. of Torch Threads: 1
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/content/drive/MyDrive/out/check/vits_mr-November-06-2024_08+03AM-0000000
  self.scaler = torch.cuda.amp.GradScaler()

 > Model has 82369484 parameters


In [33]:
!pip install numpy==1.24.4 # Replace 1.24.4 with the desired version
!pip install tensorboard==2.12.0  # Replace 2.12.0 with the desired version

Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.0
    Uninstalling numpy-1.22.0:
      Successfully uninstalled numpy-1.22.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tts 0.22.0 requires numpy==1.22.0; python_version <= "3.10", but you have numpy 1.24.4 which is incompatible.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.
mizani 0.13.0 requires pandas>=2.2.0, but you have pandas 1.5.3 which is 

Collecting tensorboard==2.12.0
  Downloading tensorboard-2.12.0-py3-none-any.whl.metadata (1.8 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard==2.12.0)
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting tensorboard-plugin-wit>=1.6.0 (from tensorboard==2.12.0)
  Downloading tensorboard_plugin_wit-1.8.1-py3-none-any.whl.metadata (873 bytes)
Downloading tensorboard-2.12.0-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m112.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Downloading tensorboard_plugin_wit-1.8.1-py3-none-any.whl (781 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m781.3/781.3 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard-plugin-wit, google-auth-oauthlib, tensorboard
  Attempting uninstall: google-auth-oauthlib
    Found existing insta

In [19]:
trainer.fit()


[4m[1m > EPOCH: 0/1[0m
 --> /content/drive/MyDrive/out/check/vits_mr-November-06-2024_08+03AM-0000000

[1m > TRAINING (2024-11-06 08:03:14) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 1199
 | > Preprocessing samples
 | > Max text length: 150
 | > Min text length: 1
 | > Avg text length: 55.662921348314605
 | 
 | > Max audio length: 483350.0
 | > Min audio length: 5667.0
 | > Avg audio length: 129501.0149812734
 | > Num. instances discarded samples: 131
 | > Batch group size: 0.


  with autocast(enabled=False):  # use float32 for the criterion
  with autocast(enabled=False):
  with autocast(enabled=False):  # use float32 for the criterion

[1m   --> TIME: 2024-11-06 08:03:16 -- STEP: 0/67 -- GLOBAL_STEP: 0[0m
     | > loss_disc: 6.076951503753662  (6.076951503753662)
     | > loss_disc_real_0: 1.0085312128067017  (1.0085312128067017)
     | > loss_disc_real_1: 0.9940115809440613  (0.9940115809440613)
     | > loss_disc_real_2: 1.0333536863327026  (1.0333536863327026)
     | > loss_disc_real_3: 1.0469212532043457  (1.0469212532043457)
     | > loss_disc_real_4: 0.9976561665534973  (0.9976561665534973)
     | > loss_disc_real_5: 0.9956231117248535  (0.9956231117248535)
     | > loss_0: 6.076951503753662  (6.076951503753662)
     | > grad_norm_0: 0  (0)
     | > loss_gen: 6.076952934265137  (6.076952934265137)
     | > loss_kl: 205.46009826660156  (205.46009826660156)
     | > loss_feat: 0.42832493782043457  (0.42832493782043457)
     | > loss_mel: 206.094421386



> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 12
 | > Preprocessing samples
 | > Max text length: 148
 | > Min text length: 30
 | > Avg text length: 81.375
 | 
 | > Max audio length: 364566.0
 | > Min audio length: 83990.0
 | > Avg audio length: 219985.25
 | > Num. instances discarded samples: 4
 | > Batch group size: 0.


 ! Run is removed from /content/drive/MyDrive/out/check/vits_mr-November-06-2024_08+03AM-0000000
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1833, in fit
    self._fit()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1787, in _fit
    self.eval_epoch()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1643, in eval_epoch
    for cur_step, batch in enumerate(self.eval_loader):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 701, in __next__
    data = self._next_data()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1465, in _next_data
    return self._process_data(data)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1491, in _process_data
    data.reraise()
  File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 715, in reraise
    raise except

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1833, in fit
    self._fit()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1787, in _fit
    self.eval_epoch()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1643, in eval_epoch
    for cur_step, batch in enumerate(self.eval_loader):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 701, in __next__
    data = self._next_data()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1465, in _next_data
    return self._process_data(data)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1491, in _process_data
    data.reraise()
  File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 715, in reraise
    raise exception
IndexError: Caught IndexError in DataLoader worker process 1.
Original Traceback (most recent

TypeError: object of type 'NoneType' has no len()

In [None]:
import torch
import os

def save_checkpoint(model, config, checkpoint_dir,  epoch, is_best_model=False):
    """Saves a model checkpoint."""

    os.makedirs(checkpoint_dir, exist_ok=True)  # Ensure the directory exists

    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_{epoch}.pth")
    torch.save(model.state_dict(), checkpoint_path) # Save only model weights.

    if is_best_model:
        best_model_path = os.path.join(checkpoint_dir, "best_model.pth")
        torch.save(model.state_dict(), best_model_path)


# ... inside your training loop after each epoch or as needed ...

checkpoint_dir = config.output_path  # or wherever you want to save checkpoints

save_checkpoint(model, config, '/content/drive/MyDrive/out/check', 1)

In [None]:
import pandas as pd

dataset_name = "marathi"
origin_file_path = ["/content/dataset/out/metadata.csv"]

In [None]:
import pandas as pd

# Read and combine data from all files, handling potential quoting issues and specifying column names
data = pd.read_csv(
    "/content/dataset/out/metadata.csv",
    sep="|",
    header=None,
)
print("Number of lines:", len(data))
print(data.head())

Number of lines: 1336
                                       0                     1   
0  wavs_normalized/562949953542617_0.wav         कमल आनंद हिरे  \
1  wavs_normalized/562949953542617_1.wav      नितीन सुभाष गिरे   
2  wavs_normalized/562949953542617_2.wav  सत्तेन देवेन मेश्राम   
3  wavs_normalized/562949953542617_3.wav   सोनाली महेश परांजपे   
4  wavs_normalized/562949953542617_4.wav       नीता गणेश मुडे    

                   2  
0  S4259071700339560  
1  S4259071700339560  
2  S4259071700339560  
3  S4259071700339560  
4  S4259071700339560  


In [None]:
data = pd.concat(
    [data[2], data[1], data[0]],axis=1)
data.columns = ["spkid","text", "uttid"]
print(data.head())

               spkid                  text   
0  S4259071700339560         कमल आनंद हिरे  \
1  S4259071700339560      नितीन सुभाष गिरे   
2  S4259071700339560  सत्तेन देवेन मेश्राम   
3  S4259071700339560   सोनाली महेश परांजपे   
4  S4259071700339560       नीता गणेश मुडे    

                                   uttid  
0  wavs_normalized/562949953542617_0.wav  
1  wavs_normalized/562949953542617_1.wav  
2  wavs_normalized/562949953542617_2.wav  
3  wavs_normalized/562949953542617_3.wav  
4  wavs_normalized/562949953542617_4.wav  


In [None]:
spkid_to_idx = {spkid: idx for idx,
                spkid in enumerate(sorted(data.spkid.unique()))}
spkidx = data["spkid"].map(spkid_to_idx)

spkidx = pd.DataFrame(spkidx)
spkidx.columns = ["phonemes"]
data = pd.concat([spkidx, data], axis=1)
print(data.head())

   phonemes              spkid                  text   
0        94  S4259071700339560         कमल आनंद हिरे  \
1        94  S4259071700339560      नितीन सुभाष गिरे   
2        94  S4259071700339560  सत्तेन देवेन मेश्राम   
3        94  S4259071700339560   सोनाली महेश परांजपे   
4        94  S4259071700339560       नीता गणेश मुडे    

                                   uttid  
0  wavs_normalized/562949953542617_0.wav  
1  wavs_normalized/562949953542617_1.wav  
2  wavs_normalized/562949953542617_2.wav  
3  wavs_normalized/562949953542617_3.wav  
4  wavs_normalized/562949953542617_4.wav  


In [None]:
# Number of speakers
print("Number of speakers:", len(data.spkid.unique()))

Number of speakers: 172


In [None]:
data.to_csv(f"/content/vits/filelists/marathi.csv", sep="\t")

In [None]:
lang = "mr"

In [None]:
import pandas as pd

dataset_name = "marathi"
data = pd.read_csv(f"/content/vits/filelists/{dataset_name}.csv", sep="\t")
print(data.head())

   Unnamed: 0  phonemes              spkid                  text   
0           0        94  S4259071700339560         कमल आनंद हिरे  \
1           1        94  S4259071700339560      नितीन सुभाष गिरे   
2           2        94  S4259071700339560  सत्तेन देवेन मेश्राम   
3           3        94  S4259071700339560   सोनाली महेश परांजपे   
4           4        94  S4259071700339560       नीता गणेश मुडे    

                                   uttid  
0  wavs_normalized/562949953542617_0.wav  
1  wavs_normalized/562949953542617_1.wav  
2  wavs_normalized/562949953542617_2.wav  
3  wavs_normalized/562949953542617_3.wav  
4  wavs_normalized/562949953542617_4.wav  


In [None]:
pip install phonemizer

Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
Collecting segments (from phonemizer)
  Downloading segments-2.2.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-1.2.1-py3-none-any.whl.metadata (1.1 kB)
Collecting clldutils>=1.7.3 (from segments->phonemizer)
  Downloading clldutils-3.24.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting csvw>=1.5.6 (from segments->phonemizer)
  Downloading csvw-3.5.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting colorlog (from clldutils>=1.7.3->segments->phonemizer)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting bibtexparser>=2.0.0b4 (from clldutils>=1.7.3->segments->phonemizer)
  Downloading bibtexparser-2.0.0b7-py3-none-any.whl.metadata (5.6 kB)
Collecting pylatexenc (from clldutils>=1.7.3->segments->phonemizer)
  Downloading pylatexenc-2.10.tar.gz (162 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting isodate (from

In [None]:
" ".join(data["text"],)

'कमल आनंद हिरे नितीन सुभाष गिरे सत्तेन देवेन मेश्राम सोनाली महेश परांजपे नीता गणेश मुडे  एखाद्याच्या एखाद्याचा अवयव फॅक्चर झाला आहे आणि आता काय करायचं त्याला काई चालता येत नाई काई नाई [breathing] तर तोपर्यंत तर दवाखाण्यात नेईपर्यंत ते लोक तर त्याला एखादा कपडा घेतील आणि त्यानी बांधून ठेवतील दवाखाण्यात जाईपर्यंत  [breathing] आणि दवाखाण्यात गेल्यावर तिथे गेल्यावर त्याच्यावर उपचार करतील  पहिले तपासतील काय झालं कुठं फॅक्चर झालं वगेरे वगेरे आणि मंग त्या फॅक्चरला ते प्लास्टर लावतील प्लास्टर हे [umm] काई दिवसांचे किंवा काई महिन्याचे असते  तर काई दिवस ते ठेवलं तर बोटांची हालचाल नाई होत [umm] काई करता येत नै प्लॅस्टर ओलेही [noise] होऊ द्याव नाही लागत. [breathing] मंग अशा प्रकारे प्लॅस्टरचे  [noise] हे प्रक्रिया जी असते ती [breathing] बरे होण्याची प्रक्रिया ती प्लॅस्टरमधून होत असते आणि काही दिवस तर काही महिने तर लागतच असतात  तर अशी प्रक्रिया आहे बरे होन्यास आणि उपचार घेतल्यास.  न्यूयॉर्क सिटी  बेर्लिन मेक्सिको सिटी संघाई बँकॉक हॅलो [inhaling] उमंग एजन्सीमधून बोलत आहात का  हो सर माजं नाव दीपा लिंग

In [None]:
# # %cd /content/vits

# from text.cleaners import marathi_cleaners

# phonemes = marathi_cleaners(" ".join(data["text"],))

# phonemes = pd.DataFrame(phonemes)
# phonemes.columns = ["phonemes"]
# data = pd.concat([data, phonemes], axis=1)
# print(data.head())

In [None]:
# Load the data from the csv file
import pandas as pd
import os
import random

random.seed(42)

dataset_name = "marathi"
data: pd.DataFrame = pd.read_csv(f"/content/vits/filelists/{dataset_name}.csv", sep = "\t")
print(data.head())

   Unnamed: 0  phonemes              spkid                  text   
0           0        94  S4259071700339560         कमल आनंद हिरे  \
1           1        94  S4259071700339560      नितीन सुभाष गिरे   
2           2        94  S4259071700339560  सत्तेन देवेन मेश्राम   
3           3        94  S4259071700339560   सोनाली महेश परांजपे   
4           4        94  S4259071700339560       नीता गणेश मुडे    

                                   uttid  
0  wavs_normalized/562949953542617_0.wav  
1  wavs_normalized/562949953542617_1.wav  
2  wavs_normalized/562949953542617_2.wav  
3  wavs_normalized/562949953542617_3.wav  
4  wavs_normalized/562949953542617_4.wav  


In [None]:
# Support for DataFrames
def split_file_list(orig_data: pd.DataFrame, train_ratio=None, test_samples=None, max_samples=None):
    # Shuffle the data
    data = orig_data.sample(frac=1).reset_index(drop=True)

    if max_samples is not None:
        data = data[:max_samples]

    if test_samples is not None:
        train_set = data[:-test_samples]
        test_set = data[-test_samples:]
    elif train_ratio is not None:
        train_set_size = int(len(data) * train_ratio)
        train_set = data[:train_set_size]
        test_set = data[train_set_size:]

    else:
        raise ValueError("Either 'train_ratio' or 'test_samples' should be provided.")

    return train_set, test_set


# Example usage
train_data, val_data = split_file_list(data, train_ratio=0.8)
val_data, test_data = split_file_list(val_data, train_ratio=0.5)

In [None]:
i_dir = "/content/dataset/out/"
o_file_train = f"/content/vits/filelists/{dataset_name}_train_filelist.txt"
o_file_val = f"/content/vits/filelists/{dataset_name}_val_filelist.txt"
o_file_test = f"/content/vits/filelists/{dataset_name}_test_filelist.txt"


In [None]:
import os
import pandas as pd
import wave

def create_path_map(source_dir):
    path_map = {}
    for root, dirs, files in os.walk(source_dir):
        for file in files:
            if file.endswith(".wav"):
                path_map[file] = os.path.join(root, file)
    return path_map


def save_file_list(data, out_file_path, source_dir, path_map, link_name, cleaned_text=False):
    with open(out_file_path, "w") as file:
        for row in data.itertuples():
            uttid = os.path.basename(row.uttid)

            # Check if WAV file is valid
            try:
                with wave.open(path_map[uttid], 'rb') as wf:
                    # If it opens successfully, it's likely valid
                    pass
            except (wave.Error, EOFError, KeyError) as e:  # Added KeyError
                print(f"Skipping file: {path_map.get(uttid, uttid)} - Error: {e}")
                continue  # Skip to the next file

            path = path_map[uttid].replace(source_dir, link_name)
            spkidx = row.spkid
            info = row.text if not cleaned_text else row.phonemes  # Access 'text' or 'phonemes'

            file.write(f"{path}|{spkidx}|{info}\n")
            if row.Index % 5000 == 0:
                print(f"{row.Index}: {path}|{spkidx}|{info}")

    print(f"Saved to '{out_file_path}' ({len(data)} samples).")

def save_files(data, out_file_path, source_dir, path_map, link_name):
    save_file_list(data, out_file_path, source_dir, path_map, link_name)
    if "phonemes" in data.columns:
        out_file_path = out_file_path.replace(".txt", ".txt.cleaned")
        save_file_list(data, out_file_path, source_dir,
                       path_map, link_name, cleaned_text=True)

In [None]:
path_map = create_path_map(i_dir)
link_name = "PRIME"


save_files(train_data, o_file_train, i_dir, path_map, link_name)
save_files(val_data, o_file_val, i_dir, path_map, link_name)
save_files(test_data, o_file_test, i_dir, path_map, link_name)

0: PRIMEwavs_normalized/562949953635255_4.wav|S4258594100387228|कॅनडा 
Skipping file: 562949953500902_0.wav - Error: '562949953500902_0.wav'
Saved to '/content/vits/filelists/marathi_train_filelist.txt' (1068 samples).
0: PRIMEwavs_normalized/562949953635255_4.wav|S4258594100387228|62
Skipping file: 562949953500902_0.wav - Error: '562949953500902_0.wav'
Saved to '/content/vits/filelists/marathi_train_filelist.txt.cleaned' (1068 samples).
0: PRIMEwavs_normalized/c4b29e6f-b889-4383-8eb2-b0062c85f2e5_0_3.wav|S4259707900315768|हां
Saved to '/content/vits/filelists/marathi_val_filelist.txt' (134 samples).
0: PRIMEwavs_normalized/c4b29e6f-b889-4383-8eb2-b0062c85f2e5_0_3.wav|S4259707900315768|155
Saved to '/content/vits/filelists/marathi_val_filelist.txt.cleaned' (134 samples).
Saved to '/content/vits/filelists/marathi_test_filelist.txt' (134 samples).
Saved to '/content/vits/filelists/marathi_test_filelist.txt.cleaned' (134 samples).


In [None]:
# Create symlink to the dataset
!ln -s {i_dir} {link_name}

In [None]:
import numpy
import tensorflow
from torch.utils.tensorboard import SummaryWriter

print(numpy.__version__)
print(tensorflow.__version__)

writer = SummaryWriter()

1.26.4
2.17.0


In [None]:
import json

data = {
  "train": {
    "log_interval": 200,
    "eval_interval": 1000,
    "seed": 1234,
    "epochs": 10,
    "learning_rate": 2e-2,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "batch_size": 64,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 8192,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "training_files":"/content/vits/filelists/marathi_train_filelist.txt.cleaned",
    "validation_files":"/content/vits/filelists/marathi_val_filelist.txt.cleaned",
    "text_cleaners":["marathi_cleaner"],
    "max_wav_value": 32768.0,
    "sampling_rate": 22050,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "add_blank": true,
    "n_speakers": 127,
    "cleaned_text": true
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [8,8,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16,16,4,4],
    "n_layers_q": 3,
    "use_spectral_norm": false
  }
}


with open("/content/vits/configs/custom_base.json", "w") as f:
  json.dump(data, f, indent=2)  # Use indent for better formatting

print("custom_base.json created successfully!")

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!python train_ms.py -c configs/custom_base.json -m custom_base

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [None]:
import torch
import os

def save_checkpoint(model, config, checkpoint_dir,  epoch, is_best_model=False):
    """Saves a model checkpoint."""

    os.makedirs(checkpoint_dir, exist_ok=True)  # Ensure the directory exists

    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_{epoch}.pth")
    torch.save(model.state_dict(), checkpoint_path) # Save only model weights.

    if is_best_model:
        best_model_path = os.path.join(checkpoint_dir, "best_model.pth")
        torch.save(model.state_dict(), best_model_path)


# ... inside your training loop after each epoch or as needed ...

checkpoint_dir = config.output_path  # or wherever you want to save checkpoints

save_checkpoint(model, config, '/content/drive/MyDrive/out/check', 1)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

In [None]:
hps = utils.get_hparams_from_file("./configs/ljs_base.json")

In [None]:
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

_ = utils.load_checkpoint("pretrained_ljs.pth", net_g, None)

In [None]:
stn_tst = get_text("We propose VITS, Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech.", hps)
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))