In [1]:
pip install -U openai-whisper

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
! pip install gradio==3.50.2

In [None]:
!pip install sentence_transformers

In [None]:
import whisper
import torch
from transformers import MarianMTModel, MarianTokenizer
import gradio as gr
from sentence_transformers import SentenceTransformer, util


In [None]:
# Load the Whisper "base" checkpoint once at module level; the same instance
# is reused by transcribe_speech() and detect_language() below.
whisper_model = whisper.load_model("base")

In [None]:
# Load the Sentence Transformer used for retrieval. The same model encodes
# both the document passages and each incoming query, so their embeddings
# can be compared with cosine similarity.
retriever_model = SentenceTransformer('all-MiniLM-L6-v2')



In [None]:
# Toy in-memory corpus for the retrieval step.
# Each entry is either a single passage (str) or a list of passages.
documents = {
    "doc1": "This is a document about artificial intelligence and machine learning.",
    "doc2": "This document describes the basics of deep learning and neural networks.",
    "doc3": "Here we discuss the impact of AI on different industries like healthcare, finance, and more.",
    "doc4": "The future of technology includes advancements in AI, quantum computing, and other fields.",
    "doc5": [
        "The future of AI is incredibly promising, marked by rapid advancements and transformative potential across various sectors.",
        "AI is expected to revolutionize industries such as healthcare, finance, transportation, and education by enabling more efficient and accurate decision-making processes.",
        "In healthcare, AI can aid in early diagnosis, personalized treatment plans, and even robotic surgeries.",
        "In finance, AI-driven algorithms can detect fraudulent activities and optimize trading strategies.",
        "The transportation sector is likely to see the widespread adoption of autonomous vehicles, improving safety and reducing congestion.",
        "Education will benefit from personalized learning experiences tailored to individual student needs."
    ]
}

# Flatten the corpus into one list of passages, preserving dict order.
# A list-valued entry contributes each of its passages individually, so this
# list is longer than `documents` and is what the embeddings are aligned with.
all_document_texts = []
for entry in documents.values():
    passages = entry if isinstance(entry, list) else [entry]
    all_document_texts.extend(passages)

# One embedding per flattened passage: row i corresponds to all_document_texts[i].
document_embeddings = retriever_model.encode(
    all_document_texts,
    convert_to_tensor=True,
)


In [None]:

# Speech-to-text helper backed by the shared Whisper model
def transcribe_speech(file_path):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        file_path: Path of the audio file handed to ``whisper_model.transcribe``.

    Returns:
        The ``"text"`` field of the transcription result.
    """
    return whisper_model.transcribe(file_path)["text"]

# Spoken-language identification using Whisper's language detector
def detect_language(audio_path):
    """Detect the language spoken in an audio file.

    The audio is loaded, passed through ``whisper.pad_or_trim``, converted to
    a log-mel spectrogram on the model's device, and given to
    ``whisper_model.detect_language``. The language code with the highest
    probability wins.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A readable language name for the known codes below; otherwise the raw
        language code. In both cases the result is passed through
        ``str.capitalize()``.
    """
    # Known language codes and their display names.
    code_to_name = {
        'en': 'English', 'es': 'Spanish', 'fr': 'French', 'de': 'German',
        'hi': 'Hindi', 'ja': 'Japanese', 'ru': 'Russian', 'ar': 'Arabic',
        'te': 'Telugu', 'zh': 'Chinese', 'pt': 'Portuguese'
    }

    waveform = whisper.pad_or_trim(whisper.load_audio(audio_path))
    spectrogram = whisper.log_mel_spectrogram(waveform)
    spectrogram = spectrogram.to(whisper_model.device)

    _tokens, language_probs = whisper_model.detect_language(spectrogram)
    best_code = max(language_probs, key=lambda code: language_probs.get(code))

    if best_code in code_to_name:
        label = code_to_name[best_code]
    else:
        # Unknown code: fall back to the code itself.
        label = best_code
    return label.capitalize()



In [None]:
# Function to load translation model and tokenizer
def load_translation_model(language):
    """Load the MarianMT model and tokenizer for a target language.

    Every call loads a fresh model/tokenizer pair via ``from_pretrained``;
    nothing is cached by this function.

    Args:
        language: Target language name, e.g. ``"Hindi"`` or ``"English"``.
            Must be one of the keys of the checkpoint table below.

    Returns:
        A ``(model, tokenizer)`` tuple for the selected checkpoint.

    Raises:
        ValueError: If no checkpoint is registered for ``language``.
    """
    model_name = {
        # English -> X checkpoints: these expect English source text.
        "Hindi": "Helsinki-NLP/opus-mt-en-hi",
        "Spanish": "Helsinki-NLP/opus-mt-en-es",
        "Japanese": "Helsinki-NLP/opus-mt-en-jap",
        "German": "Helsinki-NLP/opus-mt-en-de",
        "Russian": "Helsinki-NLP/opus-mt-en-ru",
        "Arabic": "Helsinki-NLP/opus-mt-en-ar",
        # NOTE(review): confirm this checkpoint is published on the Hub.
        "Telugu": "Helsinki-NLP/opus-mt-en-te",
        "French": "Helsinki-NLP/opus-mt-en-fr",
        "Italian": "Helsinki-NLP/opus-mt-en-it",
        # Multilingual -> English. The previous "opus-mt-xx-en" id is not a
        # published checkpoint, so selecting English always failed to load.
        "English": "Helsinki-NLP/opus-mt-mul-en"
    }

    if language not in model_name:
        raise ValueError(f"Translation model for {language} not available.")

    checkpoint = model_name[language]
    translation_model = MarianMTModel.from_pretrained(checkpoint)
    translation_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    return translation_model, translation_tokenizer



In [None]:
# Function to translate text
def translate_text(text, model, tokenizer):
    """Translate ``text`` with a model/tokenizer pair.

    Args:
        text: Source text to translate.
        model: Object exposing ``generate(**inputs)`` (e.g. a MarianMT model).
        tokenizer: Callable tokenizer that also exposes ``decode``.

    Returns:
        The decoded translation of the first generated sequence, or ``""``
        when ``text`` is empty or whitespace-only.
    """
    # Nothing to translate (e.g. silent audio): skip the model entirely
    # instead of decoding whatever it generates for an empty prompt.
    if not text or not text.strip():
        return ""

    # truncation=True caps the input at the tokenizer's maximum length so a
    # long transcription cannot exceed what the model accepts. Text beyond
    # that limit is dropped rather than crashing generation.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        translated_tokens = model.generate(**inputs)
    translation = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translation



In [None]:

# Function to retrieve a document based on the query
def retrieve_document(query):
    """Return the stored passage most similar to ``query``.

    The query is embedded with ``retriever_model`` and compared against
    ``document_embeddings`` by cosine similarity.

    Args:
        query: Text to search the document store with.

    Returns:
        The passage from ``all_document_texts`` with the highest similarity.
    """
    query_embedding = retriever_model.encode(query, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]
    top_score_idx = scores.argmax().item()
    # document_embeddings was built from the flattened all_document_texts,
    # so the winning index must be looked up in that same list. Indexing
    # list(documents.values()) instead returns the wrong entry, or raises
    # IndexError, because list-valued documents expand into several rows.
    return all_document_texts[top_score_idx]



In [None]:
# Gradio callback: transcription, language detection, translation, retrieval
def process_audio(audio, target_language):
    """Run the full pipeline for one uploaded audio file.

    Args:
        audio: Path of the audio file to process.
        target_language: Language name understood by ``load_translation_model``.

    Returns:
        A 4-tuple ``(transcription, detected_language, translated_text,
        retrieved_document)``.

    Raises:
        ValueError: Propagated from ``load_translation_model`` when the
            target language has no registered checkpoint.
    """
    # NOTE(review): the transcription is translated whatever the detected
    # language is, while most registered checkpoints are named "en-*".
    # Presumably non-English audio needs extra handling; confirm.
    spoken_text = transcribe_speech(audio)
    source_language = detect_language(audio)

    model, tokenizer = load_translation_model(target_language)

    return (
        spoken_text,
        source_language,
        translate_text(spoken_text, model, tokenizer),
        retrieve_document(spoken_text),
    )

# Create the Gradio interface
# The two inputs map positionally onto process_audio(audio, target_language),
# and the four outputs map positionally onto the tuple it returns.
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        # type="filepath": the callback receives a path, which is what
        # transcribe_speech() and detect_language() hand to Whisper.
        # The `source=` keyword matches the gradio==3.50.2 pin installed above.
        gr.Audio(source="upload", type="filepath"),
        # Choices must be keys known to load_translation_model().
        # "Telugu" is registered there but is not offered here.
        gr.Dropdown(["Hindi", "Spanish", "Japanese", "German", "Russian", "Arabic", "French", "Italian", "English"], label="Target Language")
    ],
    outputs=[
        # Order mirrors process_audio's return value:
        # transcription, detected language, translation, retrieved document.
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Detected Language"),
        gr.Textbox(label="Translation"),
        gr.Textbox(label="Retrieved Document")
    ],
    title="Multilingual Speech Recognition, Translation, and Document Retrieval",
    description="Upload an audio file in any language, select a target language to get the transcription, translation, and retrieve a document based on the transcription."
)

# Launch the Gradio interface
iface.launch()