In [None]:
import gc
import json
import os
import shutil

import soundfile as sf
import torch
from huggingface_hub import notebook_login
from tqdm import tqdm
from transformers import VitsModel, AutoTokenizer

In [None]:
# Make the Hugging Face access token available to libraries that read the
# HF_TOKEN environment variable.
# On Colab the token comes from the notebook's secret store; anywhere else the
# google.colab package is missing, so fall back to a token that is already
# exported in the environment instead of crashing on the import.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    HF_TOKEN = userdata.get('HF_TOKEN')
else:
    HF_TOKEN = os.environ.get('HF_TOKEN')

# os.environ only accepts strings; skip the export when no token is configured.
if HF_TOKEN:
    os.environ['HF_TOKEN'] = HF_TOKEN

## Bemba synthesis

In [None]:
# Use the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Bemba MMS-TTS checkpoint onto that device, plus its tokenizer.
model = VitsModel.from_pretrained("facebook/mms-tts-bem").to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-bem")

In [None]:
# Show the first five lines of the Bemba/English text file.
# The synthesis cell below splits each line on " || " into transcript and translation.
!head -5 bem_en.txt

In [None]:
# Synthesize speech for every "<Bemba> || <English>" line of bem_en.txt with the
# `model`/`tokenizer` loaded above. Writes one WAV per usable line under
# bem_en/audio/ and a JSON manifest (file name, duration, transcript, translation).
base_path = "bem_en"
audio_path = f"{base_path}/audio"
dataset_json = f"{base_path}/bem_en.json"
os.makedirs(audio_path, exist_ok=True)
# Keep a copy of the source text beside the generated data. Unlike shelling out
# to `cp`, shutil.copy is portable and raises if the copy fails.
shutil.copy("bem_en.txt", base_path)
# Read the rate from the loaded checkpoint rather than hard-coding it, so the
# WAV header and the computed durations always match the generated waveform.
sample_rate = model.config.sampling_rate
dataset = []
skipped_lines = []  # 0-based indices of input lines that could not be used

# VITS sampling is stochastic; seed once so a re-run on the same setup
# reproduces the same audio.
torch.manual_seed(0)

# Read everything up front so the file is not held open during synthesis.
with open("bem_en.txt", encoding="utf-8") as f:
    text = f.readlines()

for i, line in tqdm(enumerate(text), total=len(text)):
    # A usable line has exactly one " || " separator and a non-empty transcript.
    # Blank or malformed lines are recorded and skipped instead of aborting the
    # whole run (and losing the manifest) with an unpacking error.
    fields = line.split(" || ")
    if len(fields) != 2 or not fields[0].strip():
        skipped_lines.append(i)
        continue
    bem_sentence = fields[0].strip()
    en_sentence = fields[1].strip()

    inputs = tokenizer(bem_sentence, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model(**inputs).waveform

    # First (only) item of the batch, moved to host memory for soundfile.
    audio_numpy = output[0].cpu().numpy()

    # Duration in seconds, rounded to milliseconds.
    duration = round(len(audio_numpy) / sample_rate, 3)

    # The file name keeps the input line index, so skipped lines leave gaps
    # but every clip can be traced back to its line in bem_en.txt.
    audio_filename = f"bem_{i}.wav"
    filename = f"{audio_path}/{audio_filename}"
    sf.write(filename, audio_numpy, sample_rate)

    # Add entry to dataset manifest
    dataset.append({
        "audio": audio_filename,
        "duration_sec": duration,
        "bem_transcript": bem_sentence,
        "en_translation": en_sentence
    })

# Dump all data to JSON file. ensure_ascii=False stores non-ASCII characters
# as UTF-8 text instead of \uXXXX escapes; parsers read both forms identically.
with open(dataset_json, 'w', encoding='utf-8') as json_file:
    json.dump(dataset, json_file, indent=4, ensure_ascii=False)

if skipped_lines:
    print(f"Skipped {len(skipped_lines)} blank/malformed line(s); "
          f"first indices: {skipped_lines[:10]}")

In [None]:
# Release the previous model before the next checkpoint is loaded.
# empty_cache() can only hand back memory that no live tensor occupies, so the
# references to the model and the last batch's tensors are dropped first, then
# garbage is collected, and only then is the CUDA cache emptied. One pass is
# enough; on a CPU-only runtime the empty_cache() call does nothing.
model = tokenizer = inputs = output = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Use the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Fon MMS-TTS checkpoint onto that device, plus its tokenizer.
model = VitsModel.from_pretrained("facebook/mms-tts-fon").to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-fon")

In [None]:
# Synthesize speech for every "<Fon> || <French>" line of fon_fr.txt with the
# `model`/`tokenizer` loaded above. Writes one WAV per usable line under
# fon_fr/audio/ and a JSON manifest (file name, duration, transcript, translation).
base_path = "fon_fr"
audio_path = f"{base_path}/audio"
dataset_json = f"{base_path}/fon_fr.json"
os.makedirs(audio_path, exist_ok=True)
# Keep a copy of the source text beside the generated data. Unlike shelling out
# to `cp`, shutil.copy is portable and raises if the copy fails.
shutil.copy("fon_fr.txt", base_path)
# Read the rate from the loaded checkpoint rather than hard-coding it, so the
# WAV header and the computed durations always match the generated waveform.
sample_rate = model.config.sampling_rate
dataset = []
skipped_lines = []  # 0-based indices of input lines that could not be used

# VITS sampling is stochastic; seed once so a re-run on the same setup
# reproduces the same audio.
torch.manual_seed(0)

# Read everything up front so the file is not held open during synthesis.
with open("fon_fr.txt", encoding="utf-8") as f:
    text = f.readlines()

for i, line in tqdm(enumerate(text), total=len(text)):
    # A usable line has exactly one " || " separator and a non-empty transcript.
    # Blank or malformed lines are recorded and skipped instead of aborting the
    # whole run (and losing the manifest) with an unpacking error.
    fields = line.split(" || ")
    if len(fields) != 2 or not fields[0].strip():
        skipped_lines.append(i)
        continue
    fon_sentence = fields[0].strip()
    fr_sentence = fields[1].strip()

    inputs = tokenizer(fon_sentence, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model(**inputs).waveform

    # First (only) item of the batch, moved to host memory for soundfile.
    audio_numpy = output[0].cpu().numpy()

    # Duration in seconds, rounded to milliseconds.
    duration = round(len(audio_numpy) / sample_rate, 3)

    # The file name keeps the input line index, so skipped lines leave gaps
    # but every clip can be traced back to its line in fon_fr.txt.
    audio_filename = f"fon_{i}.wav"
    filename = f"{audio_path}/{audio_filename}"
    sf.write(filename, audio_numpy, sample_rate)

    # Add entry to dataset manifest
    dataset.append({
        "audio": audio_filename,
        "duration_sec": duration,
        "fon_transcript": fon_sentence,
        "fr_translation": fr_sentence
    })

# Dump all data to JSON file. ensure_ascii=False stores non-ASCII characters
# as UTF-8 text instead of \uXXXX escapes; parsers read both forms identically.
with open(dataset_json, 'w', encoding='utf-8') as json_file:
    json.dump(dataset, json_file, indent=4, ensure_ascii=False)

if skipped_lines:
    print(f"Skipped {len(skipped_lines)} blank/malformed line(s); "
          f"first indices: {skipped_lines[:10]}")