In [None]:
!pip install torchaudio
!pip install -e .
!pip install datasets
!pip install huggingface_hub
!pip install wandb
!pip install cached_path

In [2]:
import os
import shutil
import sys
# Put the local F5-TTS checkout's `src/` directory on the import path so the
# `f5_tts` imports below can be resolved from it.
sys.path.append('/data_perm/F5-TTS_custom/src/')

from cached_path import cached_path
from f5_tts.model import CFM, UNetT, DiT, Trainer
from f5_tts.model.utils import get_tokenizer
from f5_tts.model.dataset import load_dataset
from importlib.resources import files
import csv
import json
import zipfile

  from .autonotebook import tqdm as notebook_tqdm
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.687 seconds.
Prefix dict has been built successfully.


Word segmentation module jieba initialized.



In [None]:
from huggingface_hub import hf_hub_download

# Archive to fetch and the directory it should be stored in.
filename = "all_mp3_data.zip"
local_dir = "/data_perm/"

# Download the archive from the Hugging Face Hub *dataset* repository.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
hf_hub_download(
    repo_id="MonoraAI/test",
    repo_type="dataset",
    filename=filename,
    local_dir=local_dir,
)

In [3]:
# Extract the downloaded archive into /data_perm/ (-qq keeps unzip quiet).
# NOTE(review): without -o, unzip asks before overwriting existing files, so
# re-running this cell after a previous extraction may stall — confirm.
!unzip -qq  /data_perm/all_mp3_data.zip -d /data_perm/

In [12]:
# Run the repository's prepare_csv_wavs.py script with two positional
# arguments: the extracted audio directory and the output dataset directory.
# Per the saved output of this cell, the script writes raw.arrow into the
# output directory and reports sample count, vocab size and total hours.
!python /data_perm/F5-TTS_custom/src/f5_tts/train/datasets/prepare_csv_wavs.py \
    /data_perm/all_mp3_data/ \
    /data_perm/F5-TTS_custom/data/turkish_data_char/

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.668 seconds.
Prefix dict has been built successfully.
Word segmentation module jieba initialized.


Saving to /data_perm/F5-TTS_custom/data/turkish_data_char ...
Writing to raw.arrow ...: 100%|█████████| 21905/21905 [00:09<00:00, 2400.35it/s]

For turkish_data_char, sample count: 21905
For turkish_data_char, vocab size is: 113
For turkish_data_char, total 94.01 hours


#### The cell above produces an `.arrow` file. To view its contents, use the cell below:

In [13]:
from datasets import Dataset

# Path to the Arrow file written by the dataset-preparation step
arrow_file_path = "/data_perm/F5-TTS_custom/data/turkish_data_char/raw.arrow"

# Load the dataset directly from the Arrow file
dataset = Dataset.from_file(arrow_file_path)

# Print the dataset summary and one sample
print(dataset)
print(dataset[1])  # Index 1 is the second sample (index 0 would be the first)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['audio_path', 'text', 'duration'],
    num_rows: 21905
})
{'audio_path': '/data_perm/all_mp3_data/wavs/CHP_sandıklara_sahip_çıkabilecek_mi_CHP_Ataşehir_Bld_Bşk_Adayı_Onursal_Adıgüzel__Fatih_Altaylı_0_1.mp3', 'text': ['S', 'o', 'h', 'b', 'e', 't', ' ', 'e', 't', 'm', 'e', ' ', 'v', 'e', ' ', 'y', 'a', 'p', 'm', 'a', 'y', 'ı', ' ', 'p', 'l', 'a', 'n', 'l', 'a', 'd', 'ı', ' ', 'k', 'l', 'a', 'r', 'ı', 'n', 'ı', ' ', 'ö', 'ğ', 'r', 'e', 'n', 'm', 'e', ' ', 'a', 'm', 'a', 'ç', 'l', 'ı', '.', ' ', 'A', 'm', 'a', ' ', 't', 'a', 'b', 'i', 'i', ' ', 'g', 'ö', 'r', 'e', 'v', ' ', 'b', 'a', 'ş', 'ı', ' ', 'n', 'd', 'a', 'k', 'i', ' ', 'b', 'e', 'l', 'e', 'd', 'i', 'y', 'e', ' ', 'b', 'a', 'ş', 'k', 'a', 'n', 'l', 'a', 'r', 'ı', ' ', 'y', 'l', 'a', ' ', 'a', 'd', 'a', 'y', 'l', 'a', 'r', ' ', 'a', 'r', 'a', 's', 'ı', ' ', 'n', 'd', 'a', ' ', 'd', 'a', ' ', 'e', 'l', 'b', 'e', 't', 't', 'e', ' ', 'b', 'i', 'r', ' ', 'a', 'y', 'r', 'ı', 'm', ' ', 'o', 'l', 'u', 'y', 'o',

## setup wandb

In [3]:
import getpass
import os

import wandb

# Weights & Biases run identification; these three values are handed to the
# Trainer further down in the notebook.
wandb_project = "F5_TTS_Turkish2"
wandb_run_name = "F5_TTS_Turkish_Run"
wandb_resume_id = None

# SECURITY: never hardcode the API key in the notebook. Take it from the
# environment when it is already set (e.g. exported before starting Jupyter)
# and otherwise prompt for it without echoing the input.
# The key that was previously committed in this cell must be treated as
# compromised and revoked/rotated in the W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass.getpass("Weights & Biases API key: ")

# Confirm that wandb sees a key without writing the secret into the saved
# cell output.
print("wandb API key detected:", bool(wandb.api.api_key))

[REDACTED: API key removed from saved output]


### Training settings

In [4]:
# ---------------------- Mel-spectrogram / vocoder ---------------------- #
# These six values are bundled into `mel_spec_kwargs` for both the model and
# the dataset loader.
mel_spec_type = "vocos"  # 'vocos' or 'bigvgan'
target_sample_rate = 24000
n_mel_channels = 100
n_fft = win_length = 1024
hop_length = 256

# ------------------------- Dataset / tokenizer ------------------------- #
dataset_name = "turkish_data"
tokenizer = "char"
tokenizer_path = None

# ------------------------------ Batching ------------------------------- #
# With batch_size_type "frame" the batch size is a per-GPU budget of audio
# frames (the training log reports "6400 audio frames per gpu").
batch_size_type = "frame"
batch_size_per_gpu = 6400
max_samples = 64
grad_accumulation_steps = 1

# ---------------------------- Optimisation ----------------------------- #
learning_rate = 1e-5
max_grad_norm = 1.0
epochs = 20
num_warmup_updates = 300
bnb_optimizer = False

# --------------------------- Checkpointing ----------------------------- #
save_per_updates = 2000
last_per_updates = 2000
keep_last_n_checkpoints = 4

# ------------------------ Run mode and logging ------------------------- #
# NOTE(review): `finetune`, `pretrain` and `tokenizer_path` are not read by
# any cell visible in this notebook — confirm whether they are still needed.
finetune = True
pretrain = None
log_samples = False
logger = "wandb"

In [5]:
# Checkpoint directory handed to the Trainer as `checkpoint_path`; the
# vocab.txt used for the tokenizer is read from this same directory.
checkpoint_path = "/data_perm/F5-TTS_custom/ckpts/turkish/"

In [6]:
# --------------------------- Model Settings ---------------------------- #
# Backbone class and the keyword arguments it is constructed with.

model_cls = DiT
model_cfg = {
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
}

# Build the character map and vocabulary size from the vocab file stored next
# to the checkpoints, using the "custom" tokenizer mode.
# NOTE(review): the saved output reports 2554 entries here, whereas dataset
# preparation reported a vocab size of 113 — presumably this is the
# pretrained model's vocabulary; confirm it is the intended one.
vocab_char_map, vocab_size = get_tokenizer(
    "/data_perm/F5-TTS_custom/ckpts/turkish/vocab.txt",
    "custom",
)

print("\nvocab :", vocab_size)
print("\nvocoder :", mel_spec_type)


vocab : 2554

vocoder : vocos


In [7]:
# Mel-spectrogram settings shared by the model and the dataset loader, so
# both see identical audio features.
mel_spec_kwargs = dict(
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    n_mel_channels=n_mel_channels,
    target_sample_rate=target_sample_rate,
    mel_spec_type=mel_spec_type,
)

# CFM model wrapping the configured backbone. The text embedding table is
# sized by `vocab_size` and the mel dimension by `n_mel_channels`.
model = CFM(
    transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
    mel_spec_kwargs=mel_spec_kwargs,
    vocab_char_map=vocab_char_map,
)

# Trainer configured entirely from the settings cells above.
trainer = Trainer(
    model,
    epochs,
    learning_rate,
    num_warmup_updates=num_warmup_updates,
    save_per_updates=save_per_updates,
    keep_last_n_checkpoints=keep_last_n_checkpoints,
    checkpoint_path=checkpoint_path,
    batch_size=batch_size_per_gpu,
    batch_size_type=batch_size_type,
    max_samples=max_samples,
    grad_accumulation_steps=grad_accumulation_steps,
    max_grad_norm=max_grad_norm,
    # Use the `logger` setting instead of a hardcoded "wandb" so that changing
    # the settings cell actually takes effect (the value is "wandb" today).
    logger=logger,
    wandb_project=wandb_project,
    wandb_run_name=wandb_run_name,
    wandb_resume_id=wandb_resume_id,
    log_samples=log_samples,
    last_per_updates=last_per_updates,
    bnb_optimizer=bnb_optimizer,
)

# Load the prepared training data with the same mel-spectrogram settings.
# NOTE(review): presumably `dataset_name` and `tokenizer` together select the
# prepared `turkish_data_char` directory — confirm against load_dataset.
train_dataset = load_dataset(dataset_name, tokenizer, dataset_type="CustomDataset", mel_spec_kwargs=mel_spec_kwargs)


[34m[1mwandb[0m: Currently logged in as: [33mmonoraai[0m ([33mmonoraai-monora[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Using logger: wandb
Loading dataset ...


In [1]:
# Start training. This needs the `trainer` and `train_dataset` objects built
# in the preceding cell, so run that cell first on a fresh kernel.
trainer.train(train_dataset)

Sorting with sampler... if slow, check whether dataset is provided with duration: 100%|██████████| 21905/21905 [00:00<00:00, 1591840.00it/s]
Creating dynamic batches with 6400 audio frames per gpu: 100%|██████████| 21905/21905 [00:00<00:00, 1919125.81it/s]


Saved last checkpoint at update 4000
