In [None]:
#Example pipeline

import speech_recognition as sr
from googletrans import Translator
from gtts import gTTS
import pygame

# Step 1: Speech Recognition
# Capture one utterance from the microphone.
recognizer = sr.Recognizer()
with sr.Microphone() as source:
    print("Speak something...")
    audio = recognizer.listen(source)

# Recognition only needs the captured audio, so it runs after the
# microphone has been released by the `with` block above.
source_text = recognizer.recognize_google(audio)

# Step 2: Translation
# English -> Spanish text translation.
translator = Translator()
target_text = translator.translate(source_text, src='en', dest='es').text

# Step 3: Text-to-Speech
# Render the Spanish text to an MP3 file, then play it back.
tts = gTTS(target_text, lang='es')
tts.save("output.mp3")

pygame.mixer.init()
try:
    pygame.mixer.music.load("output.mp3")
    pygame.mixer.music.play()
    # Poll about 10 times per second instead of spinning at 100% CPU
    # while the clip is playing.
    playback_clock = pygame.time.Clock()
    while pygame.mixer.music.get_busy():
        playback_clock.tick(10)
finally:
    # Shut the mixer down so "output.mp3" is released and this cell can be
    # re-run (tts.save would otherwise hit a locked file on some platforms).
    pygame.mixer.quit()




Creating a true end-to-end speech-to-speech (S2S) model is more complex than the pipeline-based approach, because it requires deep learning models that process audio directly. Below is a simple prototype of a complete speech-to-speech translation system that chains Whisper (for ASR), the Google Translate API (for translation), and Coqui TTS (for TTS).

In [None]:
# pip install openai-whisper coqui-tts googletrans==4.0.0-rc1 pyaudio
# (the PyPI package that provides `import whisper` is `openai-whisper`, not `whisper`)

import whisper
from googletrans import Translator
from TTS.api import TTS
import pyaudio
import wave
import os

# Step 1: Record Audio
def record_audio(filename="input.wav", duration=5, rate=44100, channels=1, chunk=1024):
    """Record audio from the input device and save it as a 16-bit PCM WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.
        rate: Sample rate in Hz.
        channels: Number of input channels to capture.
        chunk: Number of frames read from the stream per call.

    The audio stream and the PortAudio interface are always released, even if
    opening or reading the stream raises.
    """
    print("Recording...")
    sample_format = pyaudio.paInt16  # 16 bits per sample

    p = pyaudio.PyAudio()  # Create an interface to PortAudio
    try:
        # Query the sample width while the interface is still alive.
        sample_width = p.get_sample_size(sample_format)
        stream = p.open(format=sample_format, channels=channels,
                        rate=rate, input=True, frames_per_buffer=chunk)
        try:
            # Number of whole chunks that fit in the requested duration.
            num_chunks = int(rate / chunk * duration)
            frames = [stream.read(chunk) for _ in range(num_chunks)]
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # Save the recorded audio; the context manager closes the file on error too.
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(b"".join(frames))
    print("Recording complete.")

# Step 2: Transcribe Speech
def transcribe_audio(filename="input.wav"):
    """Transcribe an audio file with the Whisper "base" model.

    Args:
        filename: Path of the audio file to transcribe.

    Returns:
        The value stored under the "text" key of Whisper's transcription result.
    """
    print("Transcribing...")
    asr_model = whisper.load_model("base")  # loaded on every call
    transcript = asr_model.transcribe(filename)["text"]
    print("Transcription:", transcript)
    return transcript

# Step 3: Translate Text
def translate_text(text, src_lang="en", target_lang="es"):
    """Translate text between languages using googletrans.

    Args:
        text: Source text to translate.
        src_lang: Language code of ``text``.
        target_lang: Language code to translate into.

    Returns:
        The translated text, or an empty string when ``text`` is empty,
        ``None`` or whitespace-only (no request is made in that case).
    """
    # A silent recording yields an empty transcription; there is nothing to
    # send to the translation service, so skip the request.
    if not text or not text.strip():
        print("Nothing to translate.")
        return ""

    print("Translating...")
    translator = Translator()
    translation = translator.translate(text, src=src_lang, dest=target_lang)
    print("Translation:", translation.text)
    return translation.text

# Step 4: Synthesize Speech
def synthesize_speech(text, lang="es", output_file="output.wav", model_name=None):
    """Synthesize speech for ``text`` with Coqui TTS and write it to a file.

    Args:
        text: Text to speak.
        lang: Language code of ``text``.
        output_file: Path of the audio file to write.
        model_name: Coqui model identifier. When ``None``, a model is chosen
            from ``lang``: a dedicated Spanish model for ``"es"``, otherwise
            the multilingual YourTTS model.

    Raises:
        ValueError: If ``text`` is empty, or the selected multilingual model
            does not offer a language matching ``lang``.
    """
    if not text or not text.strip():
        raise ValueError("No text to synthesize.")

    print("Synthesizing speech...")
    if model_name is None:
        # NOTE(review): YourTTS does not appear to ship a Spanish voice, so
        # Spanish is routed to a dedicated model - confirm against the Coqui
        # model list.
        default_models = {"es": "tts_models/es/css10/vits"}
        model_name = default_models.get(
            lang, "tts_models/multilingual/multi-dataset/your_tts"
        )
    tts = TTS(model_name=model_name)

    # Multi-lingual / multi-speaker models reject calls that omit `language`
    # or `speaker`, so supply them whenever the loaded model needs them.
    extra_args = {}
    if tts.is_multi_lingual:
        available = tts.languages or []
        # Accept an exact match or a regional variant such as "fr-fr" for "fr".
        language = next(
            (code for code in available
             if code == lang or code.startswith(lang + "-")),
            None,
        )
        if language is None:
            raise ValueError(
                f"Model {model_name!r} does not support language {lang!r}; "
                f"available: {available}"
            )
        extra_args["language"] = language
    if tts.is_multi_speaker:
        # No speaker preference is exposed by this function; use the first voice.
        extra_args["speaker"] = tts.speakers[0]

    tts.tts_to_file(text=text, file_path=output_file, **extra_args)
    print(f"Audio saved to {output_file}")

# Step 5: Play Audio
def play_audio(filename="output.wav"):
    """Open an audio file with the operating system's default player.

    Works on Windows (``os.startfile``), macOS (``open``) and other platforms
    that provide ``xdg-open``. The filename is passed as a single argument and
    never through a shell, so paths containing spaces or shell metacharacters
    are handled safely.

    Args:
        filename: Path of the audio file to play.

    Raises:
        FileNotFoundError: If ``filename`` does not exist (or, on non-Windows
            platforms, if the launcher command itself is not installed).
    """
    import subprocess
    import sys

    if not os.path.isfile(filename):
        raise FileNotFoundError(f"Audio file not found: {filename}")

    print("Playing audio...")
    if sys.platform.startswith("win"):
        os.startfile(filename)
    elif sys.platform == "darwin":
        subprocess.run(["open", filename], check=False)
    else:
        subprocess.run(["xdg-open", filename], check=False)

# Main Function
if __name__ == "__main__":
    # Record audio from the microphone
    record_audio()

    # Transcribe speech to text
    source_text = transcribe_audio()

    # A silent or unintelligible recording produces an empty transcription;
    # stop here rather than sending empty text to translation and synthesis.
    if not source_text or not source_text.strip():
        print("No speech detected; nothing to translate.")
    else:
        # Translate the transcribed text
        translated_text = translate_text(source_text, src_lang="en", target_lang="es")

        # Synthesize speech from the translated text
        synthesize_speech(translated_text, lang="es")

        # Play the synthesized audio
        play_audio()
