In [2]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py38-none-any.whl.metadata (7.1 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting tzdata>=2022.1 (from pandas->datasets)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
Using ca

In [21]:
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
from datasets import load_dataset
from IPython.display import Audio, display

def load_speech_model(checkpoint="bilalfaye/speecht5_tts-wolof-v0.2", vocoder_checkpoint="microsoft/speecht5_hifigan"):
    """ Load the SpeechT5 model, processor, and vocoder for text-to-speech. """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processor = SpeechT5Processor.from_pretrained(checkpoint)
    model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint).to(device)
    vocoder = SpeechT5HifiGan.from_pretrained(vocoder_checkpoint).to(device)

    return processor, model, vocoder, device

# Load the model
processor, model, vocoder, device = load_speech_model()
# Load speaker embeddings (pretrained from CMU Arctic dataset)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

def generate_speech_from_text(text, speaker_embedding=speaker_embedding, processor=processor, model=model, vocoder=vocoder):
    """ Generates speech from input text using SpeechT5 and HiFi-GAN vocoder. """

    inputs = processor(text=text, return_tensors="pt", padding=True, truncation=True, max_length=model.config.max_text_positions)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    speech = model.generate(
        inputs["input_ids"],
        speaker_embeddings=speaker_embedding.to(model.device),
        vocoder=vocoder,
        num_beams=7,
        temperature=0.6,
        no_repeat_ngram_size=3,
        repetition_penalty=1.5,
    )

    speech = speech.detach().cpu().numpy()
    display(Audio(speech, rate=16000))

# Example usage French
text = "Bonjour, bienvenue dans le modèle de synthèse vocale Wolof et Français."
generate_speech_from_text(text)

# Example usage Wolof
text = "ñu ne ñoom ñooy nattukaay satélite yi"
generate_speech_from_text(text)

In [4]:
import librosa
import IPython.display as ipd
text = "ñu ne ñoom ñooy nattukaay satélite yi"
aud = generate_speech_from_text(text)

In [6]:
!pip install librosa

Collecting librosa
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting scipy>=1.6.0 (from librosa)
  Downloading scipy-1.10.1-cp38-cp38-macosx_12_0_arm64.whl.metadata (53 kB)
Collecting scikit-learn>=1.1.0 (from librosa)
  Downloading scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting joblib>=1.0 (from librosa)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pooch>=1.1 (from librosa)
  Using cached pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.3.7-cp38-cp38-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting lazy_loader>=0.1 (from librosa)
  Using cached lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.1.tar.gz (173 kB)
  Installing build dependencies ... [?25ld

In [17]:
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model_load_name = 'bilalfaye/nllb-200-distilled-600M-wolof-french'

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_load_name).to(device)
tokenizer = NllbTokenizer.from_pretrained(model_load_name)

def translate(
    text, src_lang='wol_Latn', tgt_lang='french_Latn',
    a=32, b=3, max_input_length=1024, num_beams=4, **kwargs
):
    """Turn a text or a list of texts into a list of translations"""
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(
        text, return_tensors='pt', padding=True, truncation=True,
        max_length=max_input_length
    )
    model.eval()
    result = model.generate(
        **inputs.to(model.device),
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        num_beams=num_beams, **kwargs
    )
    return tokenizer.batch_decode(result, skip_special_tokens=True)

# Example usage
print(translate("Ndax mën nga ko waxaat su la neexee?", src_lang="wol_Latn", tgt_lang="french_Latn")[0])
print(translate("Bonjour, où allez-vous?", src_lang="fra_Latn", tgt_lang="wol_Latn")[0])


Mën nga ko waxtaanaat su ko bëggee?
Nanga def, fan ngeen jëm?


In [18]:
print(translate("Selon de nombreux experts et fans de football, Sadio Mané est considéré comme l'un des meilleurs joueurs sénégalais de tous les temps. Il a remporté de nombreux trophées avec Liverpool, notamment la Ligue des Champions en 2019, et a été élu meilleur joueur africain de l'année en 2019 et 2020.Cependant, il est important de noter que d'autres joueurs sénégalais ont également eu des carrières exceptionnelles, tels que El Hadji Diouf, Khalilou Fadiga ou encore Henri Camara.Mais si je devais donner un nom, je dirais que Sadio Mané est actuellement considéré comme le meilleur joueur du Sénégal.", src_lang="fra_Latn", tgt_lang="wol_Latn")[0])

Saajo Maane ñu ngi ko jàppee ni kenn ci futbalkati Senegaal yi gën a xarañ ci àdduna, ndax jëloon na ay raaya yu bare ak Liverpool ak Senegaal, rawatina Ligg dee sàmpiyoŋ ak Kubu réewi Afrig.


In [25]:
generate_speech_from_text("Saajo Maane jàpp nañu ne kenn ci futbalkat yi gën a xarañ ci Senegaal.")

In [23]:
generate_speech_from_text("Waaye su ma waroon a joxe tur, dama doon wax ne Saajo Maane moo gën a xarañ ci futbal bu Senegaal.")