In [None]:
!pip install groq gradio langdetect gtts

In [None]:
import base64
import mimetypes
import os
from getpass import getpass

import gradio as gr
from groq import Groq
from gtts import gTTS
from langdetect import detect

In [None]:
# Reuse a key that is already exported in the environment; otherwise prompt
# with getpass so the secret is not echoed into the saved cell output.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Enter your Groq API key: ").strip()
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
client = Groq(api_key=GROQ_API_KEY)

In [None]:
def process_text_input(text):
    """Answer a text prompt with the Groq chat model.

    Args:
        text: The user's message.

    Returns:
        A ``(response, lang)`` tuple. ``response`` is the model's reply, or a
        user-facing message when the input is blank or the request fails.
        ``lang`` is the language code detected for ``text`` (``"en"`` when
        detection fails), or ``None`` when no reply was produced.
    """
    if not text or not text.strip():
        return "Please provide text input.", None
    try:
        lang = detect(text)
    except Exception:
        # Detection can fail on text without usable features (e.g. only digits
        # or punctuation). The prompt is still answerable, so fall back to
        # English for the spoken reply instead of crashing the handler.
        lang = "en"
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="llama-3.3-70b-versatile",
            temperature=0.7,
            max_tokens=1024
        )
        response = chat_completion.choices[0].message.content
        return response, lang
    except Exception as e:
        return f"Error processing text: {str(e)}", None

In [None]:
def process_audio_input(audio):
    """Transcribe an audio file and answer the transcript with the chat model.

    Args:
        audio: Path to the recorded or uploaded audio file.

    Returns:
        A ``(response, lang)`` tuple. ``response`` is the model's reply, or a
        user-facing message when no audio/speech is available or a request
        fails. ``lang`` is the language code detected for the transcript
        (``"en"`` when detection fails), or ``None`` when no reply was produced.
    """
    if not audio:
        return "Please provide audio input.", None
    try:
        with open(audio, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(audio, file.read()),
                model="whisper-large-v3",
                response_format="text"
            )
        # A silent clip yields a blank transcript; report that directly rather
        # than sending an empty prompt on to detection and the chat model.
        if not transcription or not transcription.strip():
            return "No speech was detected in the audio.", None
        try:
            lang = detect(transcription)
        except Exception:
            # Undetectable language should not abort an answerable request.
            lang = "en"
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": transcription}],
            model="llama-3.3-70b-versatile",
            temperature=0.7,
            max_tokens=1024
        ).choices[0].message.content
        return response, lang
    except Exception as e:
        return f"Error processing audio: {str(e)}", None

In [None]:
def process_image_input(image, text_prompt):
    """Ask the vision model a question about an image.

    Args:
        image: Path to the uploaded image file.
        text_prompt: The user's question or instruction about the image.

    Returns:
        A ``(response, lang)`` tuple. ``response`` is the model's reply, or a
        user-facing message when an input is missing or the request fails.
        ``lang`` is the language code detected for ``text_prompt`` (``"en"``
        when detection fails), or ``None`` when no reply was produced.
    """
    if not image or not text_prompt:
        return "Please provide an image and a text prompt.", None
    try:
        with open(image, "rb") as img_file:
            image_data = base64.b64encode(img_file.read()).decode("utf-8")

        # Label the data URL with the file's real type (PNG, WebP, ...) instead
        # of always claiming JPEG; keep JPEG as the fallback for unknown types.
        mime_type = mimetypes.guess_type(image)[0]
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        chat_completion = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": text_prompt},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_data}"}}
                ]
            }],
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            temperature=0.7,
            max_tokens=1024
        )
        response = chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error processing image: {str(e)}", None

    try:
        lang = detect(text_prompt)
    except Exception:
        # The reply already exists; an undetectable prompt language should not
        # discard it, so default the speech language to English.
        lang = "en"
    return response, lang

In [1]:
def text_to_speech(text, lang):
    """Synthesize ``text`` to an MP3 file with gTTS.

    Args:
        text: The text to speak.
        lang: Language code; any code outside the allow-list is spoken as English.

    Returns:
        The path ``"response.mp3"`` on success, or ``None`` if synthesis or
        saving raised any exception.
    """
    tts_languages = ('en', 'es', 'fr', 'de', 'it', 'zh-cn', 'ja', 'ko')
    output_path = "response.mp3"
    try:
        voice_lang = lang if lang in tts_languages else 'en'
        gTTS(text=text, lang=voice_lang, slow=False).save(output_path)
    except Exception:
        # Best effort: the caller treats None as "no audio available".
        return None
    return output_path

In [None]:
def assistant(text_input, audio_input, image_input, image_prompt):
    """Route one request to the matching handler and add speech to the reply.

    Inputs are checked in priority order: text, then audio, then image plus
    prompt. Only the first one present is processed.

    Args:
        text_input: Typed message.
        audio_input: Path to an audio file.
        image_input: Path to an image file.
        image_prompt: Question about the image.

    Returns:
        A ``(response_text, audio_path)`` tuple. ``audio_path`` is ``None``
        when the handler reported no language or speech synthesis failed.
    """
    if text_input:
        reply, detected_lang = process_text_input(text_input)
    elif audio_input:
        reply, detected_lang = process_audio_input(audio_input)
    elif image_input and image_prompt:
        reply, detected_lang = process_image_input(image_input, image_prompt)
    else:
        # Nothing usable was supplied, so there is nothing to speak either.
        return "Please provide at least one input (text, audio, or image with prompt).", None

    speech_path = None
    if reply and detected_lang:
        speech_path = text_to_speech(reply, detected_lang)
    return reply, speech_path

In [None]:
# Gradio UI: four inputs are passed to `assistant`, which returns the reply
# text and an optional path to the spoken reply.
interface = gr.Interface(
    title="Multilingual AI Assistant Powered by Groq",
    description="Input text, audio, or an image with a prompt. Get responses in text and audio!",
    fn=assistant,
    # Order matches assistant(text_input, audio_input, image_input, image_prompt).
    inputs=[
        gr.Textbox(label="Type your message (e.g., 'Hola, ¿qué tal?')"),
        gr.Audio(type="filepath", label="Record or upload audio"),
        gr.Image(type="filepath", label="Upload an image"),
        gr.Textbox(label="Image prompt (e.g., 'What is this?')"),
    ],
    # Order matches the (response_text, audio_output) tuple returned by assistant.
    outputs=[
        gr.Textbox(label="Response"),
        gr.Audio(type="filepath", label="Listen to Response"),
    ],
    live=False,
)

In [None]:
# Start the app. share=True asks Gradio for a public share link, so anyone who
# has the link can send requests that are billed to the Groq API key set above.
interface.launch(share=True)