# Importing Libraries

In [14]:
!pip install transformers -q
!pip install -U datasets -q
!pip install timm -q
!pip install phonemizer -q
!pip install soundfile -q
!pip install torch -q
!pip install num2words -q

In [15]:
from transformers import pipeline
from datasets import load_dataset
from IPython.display import Audio as IPythonAudio
import soundfile as sf
import torch
from num2words import num2words
import re

# Convert Numbers in Text to Words

In [16]:
def convert_numbers_to_words(text, lang='ar'):
    """Spell out every standalone integer in ``text`` as words.

    Each run of digits delimited by word boundaries is replaced with the
    result of ``num2words`` for that integer. Digits embedded in a word
    (e.g. ``abc123``) are left untouched because there is no word boundary
    around them.

    Note: only whole digit runs are matched, so a decimal such as ``3.5`` is
    converted as two separate integers ("3" and "5") with the dot kept
    between them.

    Args:
        text: Input string that may contain digit sequences.
        lang: Language code forwarded to ``num2words``. Defaults to ``'ar'``
            (Arabic), which is what the text-to-speech step expects.

    Returns:
        A copy of ``text`` with each matched integer replaced by its spelled-out
        form. Text without digits is returned unchanged.
    """
    def replace_number(match):
        # The regex guarantees the match is digits only, so int() cannot fail.
        return num2words(int(match.group()), lang=lang)

    return re.sub(r'\b\d+\b', replace_number, text)

# Translation Pipeline

In [17]:
# Load the Helsinki-NLP English-to-Arabic translation model once, when this
# cell runs, so every later translation call reuses the same pipeline object.
translator = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-ar")

def translate_to_arabic(text_en):
    """Translate English text to Arabic with the module-level ``translator``.

    Args:
        text_en: The English source text.

    Returns:
        The ``translation_text`` field of the first result returned by the
        translation pipeline (called with ``max_length=512``).
    """
    results = translator(text_en, max_length=512)
    first_result = results[0]
    return first_result['translation_text']

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


# Text-to-Speech

In [18]:
# Cache for the loaded TTS pipeline and speaker embedding. It is filled on the
# first call so repeated calls do not reload the model and the embeddings
# dataset every time.
_TTS_RESOURCES = {}

# Sentence boundary: whitespace that follows '.', '!', '?' or the Arabic
# question mark (U+061F). The escape is used instead of the literal character
# so the pattern cannot be corrupted when the file is re-encoded.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?\u061F])\s+')


def _get_tts_resources():
    """Return the cached ``(synthesiser, speaker_embedding)`` pair, loading it on first use."""
    if not _TTS_RESOURCES:
        synthesiser = pipeline("text-to-speech", "MBZUAI/speecht5_tts_clartts_ar")
        embeddings_dataset = load_dataset("herwoww/arabic_xvector_embeddings", split="validation")
        # Entry 105 of the validation split supplies the speaker voice; unsqueeze adds
        # the leading dimension before the vector is handed to the pipeline.
        speaker_embedding = torch.tensor(embeddings_dataset[105]["speaker_embeddings"]).unsqueeze(0)
        # Store only after everything loaded, so a failed load is retried next call.
        _TTS_RESOURCES["synthesiser"] = synthesiser
        _TTS_RESOURCES["speaker_embedding"] = speaker_embedding
    return _TTS_RESOURCES["synthesiser"], _TTS_RESOURCES["speaker_embedding"]


def _split_into_chunks(text, max_chunk_len):
    """Greedily pack sentences (or words of over-long sentences) into chunks of at most ``max_chunk_len`` characters."""
    units = []
    for sentence in _SENTENCE_BOUNDARY.split(text.strip()):
        if len(sentence) <= max_chunk_len:
            units.append(sentence)
        else:
            # A single sentence longer than the limit is broken at whitespace so
            # it can still be packed into chunks that respect the limit.
            units.extend(sentence.split())

    chunks = []
    current_chunk = ""
    for unit in units:
        if not unit:
            continue
        candidate = f"{current_chunk} {unit}" if current_chunk else unit
        if len(candidate) <= max_chunk_len:
            current_chunk = candidate
        else:
            # Never emit an empty chunk (happens when the very first unit is
            # itself longer than the limit, e.g. one very long word).
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = unit
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def text_to_speech_arabic(text, filename="output.wav", max_chunk_len=220):
    """Synthesise Arabic speech for ``text`` and write it to a WAV file.

    Digits in the text are first spelled out with ``convert_numbers_to_words``.
    The text is then split into short chunks at sentence boundaries (to avoid
    the model dropping words on long inputs), each chunk is synthesised
    separately, and the audio of all chunks is concatenated in order.

    Args:
        text: Arabic text to speak.
        filename: Path of the audio file to write. Defaults to ``"output.wav"``.
        max_chunk_len: Maximum number of characters sent to the model in one
            call. Defaults to 220.

    Returns:
        An ``IPython.display.Audio`` object for ``filename`` with autoplay enabled.

    Raises:
        ValueError: If ``text`` is empty or whitespace only, or if
            ``max_chunk_len`` is smaller than 1.
    """
    # Validate before touching any model so bad input fails fast and cheaply.
    if not text or not text.strip():
        raise ValueError("text must contain at least one non-whitespace character")
    if max_chunk_len < 1:
        raise ValueError("max_chunk_len must be at least 1")

    synthesiser, speaker_embedding = _get_tts_resources()

    # Number processing
    text = convert_numbers_to_words(text)

    # Divide the text into short pieces (to avoid losing words)
    chunks = _split_into_chunks(text, max_chunk_len)

    # Sound generation
    audio_data = []
    sampling_rate = 16000  # fallback; replaced by the rate the pipeline reports
    for chunk in chunks:
        speech = synthesiser(chunk, forward_params={"speaker_embeddings": speaker_embedding})
        audio_data.extend(speech["audio"])
        sampling_rate = speech["sampling_rate"]

    sf.write(filename, audio_data, samplerate=sampling_rate)
    return IPythonAudio(filename, autoplay=True)



In [23]:
# English input sentence; the leading and trailing newlines are part of the input.
text_en = "\nI have two cats and 3 dogs and we all live in house near the river.\n"

# Step 1: translate the English sentence to Arabic.
arabic_text = translate_to_arabic(text_en)

# Step 2: synthesise the Arabic text and show the audio player in the cell output.
display(text_to_speech_arabic(arabic_text))

Device set to use cuda:0
