<a href="https://colab.research.google.com/github/DebasishTripathy13/unimeds/blob/main/Whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Hindi to English Speech-to-Text using Whisper on Google Colab
# This notebook transcribes Hindi audio and translates it to English

# Step 1: Install required packages
!pip install openai-whisper
!pip install gradio
!pip install librosa
!pip install soundfile

# Step 2: Import necessary libraries
import whisper
import gradio as gr
import numpy as np
import torch
import librosa
from IPython.display import Audio, display
import os

# Step 3: Load Whisper model
# Available models: tiny, base, small, medium, large
# Larger models are more accurate but slower
print("Loading Whisper model...")
model_size = "base"  # Change to "medium" or "large" for better accuracy
model = whisper.load_model(model_size)
print(f"Loaded {model_size} model successfully!")

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 4: Define transcription function
def transcribe_hindi_to_english(audio_file):
    """
    Transcribe Hindi audio to English text using Whisper

    Args:
        audio_file: Path to audio file or audio array

    Returns:
        dict: Contains transcribed and translated text
    """
    try:
        # Load and preprocess audio
        if isinstance(audio_file, str):
            # If it's a file path
            audio = whisper.load_audio(audio_file)
        else:
            # If it's already an audio array (from Gradio)
            audio = audio_file

        # Pad or trim audio to 30 seconds (Whisper's input length)
        audio = whisper.pad_or_trim(audio)

        # Convert to log-mel spectrogram
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Detect language (optional - we know it's Hindi)
        _, probs = model.detect_language(mel)
        detected_lang = max(probs, key=probs.get)
        print(f"Detected language: {detected_lang}")

        # Transcribe with translation to English
        options = whisper.DecodingOptions(
            language="hi",  # Hindi language code
            task="translate",  # This translates to English
            fp16=False if device == "cpu" else True
        )

        result = whisper.decode(model, mel, options)

        return {
            "detected_language": detected_lang,
            "confidence": f"{max(probs.values()):.2%}",
            "english_translation": result.text.strip()
        }

    except Exception as e:
        return {
            "error": f"Transcription failed: {str(e)}",
            "detected_language": "Unknown",
            "confidence": "0%",
            "english_translation": ""
        }

# Step 5: Test with sample audio (if you have one)
def test_transcription(audio_path):
    """Test the transcription function"""
    if os.path.exists(audio_path):
        result = transcribe_hindi_to_english(audio_path)
        print("Transcription Results:")
        print(f"Detected Language: {result.get('detected_language', 'N/A')}")
        print(f"Confidence: {result.get('confidence', 'N/A')}")
        print(f"English Translation: {result.get('english_translation', 'N/A')}")
        if 'error' in result:
            print(f"Error: {result['error']}")
    else:
        print(f"Audio file not found: {audio_path}")

# Step 6: Create Gradio interface for easy usage
def create_gradio_interface():
    """Create a web interface using Gradio"""

    def process_audio(audio_input):
        if audio_input is None:
            return "Please upload an audio file.", "", ""

        # Get sample rate and audio data
        sample_rate, audio_data = audio_input

        # Convert to float32 and normalize
        if audio_data.dtype == np.int16:
            audio_data = audio_data.astype(np.float32) / 32768.0
        elif audio_data.dtype == np.int32:
            audio_data = audio_data.astype(np.float32) / 2147483648.0

        # Resample to 16kHz (Whisper's expected sample rate)
        if sample_rate != 16000:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

        # Handle stereo audio (convert to mono)
        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)

        result = transcribe_hindi_to_english(audio_data)

        if 'error' in result:
            return result['error'], "", ""

        return (
            result.get('english_translation', 'No translation available'),
            result.get('detected_language', 'Unknown'),
            result.get('confidence', '0%')
        )

    # Create Gradio interface
    interface = gr.Interface(
        fn=process_audio,
        inputs=gr.Audio(
            sources=["microphone", "upload"],
            type="numpy",
            label="Upload Hindi Audio or Record"
        ),
        outputs=[
            gr.Textbox(label="English Translation", lines=5),
            gr.Textbox(label="Detected Language"),
            gr.Textbox(label="Confidence")
        ],
        title="Hindi to English Speech Translator",
        description="Upload Hindi audio or record directly to get English translation using Whisper AI",
        examples=[
            # Add example audio files here if you have them
        ]
    )

    return interface

# Step 7: Launch the interface
print("Creating Gradio interface...")
interface = create_gradio_interface()

# Launch with public sharing (optional)
# Set share=True to get a public URL, share=False for local only
print("Launching interface...")
interface.launch(share=True, debug=True)

# Step 8: Upload and test audio file

# Method 1: Upload file through Colab interface
from google.colab import files
import io

def upload_and_test():
    """Upload audio file and test transcription"""
    print("Please select your Hindi audio file...")
    uploaded = files.upload()

    for filename in uploaded.keys():
        print(f"\nTesting file: {filename}")
        result = transcribe_hindi_to_english(filename)

        print("="*50)
        print("TRANSCRIPTION RESULTS:")
        print("="*50)
        print(f"Detected Language: {result.get('detected_language', 'N/A')}")
        print(f"Confidence: {result.get('confidence', 'N/A')}")
        print(f"English Translation:")
        print(f"'{result.get('english_translation', 'No translation available')}'")

        if 'error' in result:
            print(f"Error: {result['error']}")

        print("="*50)

# Method 2: Direct file path testing
def test_audio_file(file_path):
    """
    Test a specific audio file
    Usage: test_audio_file("my_hindi_audio.wav")
    """
    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' not found!")
        print("Make sure the file is uploaded to Colab or check the path.")
        return

    print(f"Processing: {file_path}")
    result = transcribe_hindi_to_english(file_path)

    print("="*50)
    print("TRANSCRIPTION RESULTS:")
    print("="*50)
    print(f"File: {file_path}")
    print(f"Detected Language: {result.get('detected_language', 'N/A')}")
    print(f"Confidence: {result.get('confidence', 'N/A')}")
    print(f"English Translation:")
    print(f"'{result.get('english_translation', 'No translation available')}'")

    if 'error' in result:
        print(f"Error: {result['error']}")

    print("="*50)
    return result

# Method 3: Test with sample audio (if available)
def create_sample_audio():
    """Create a sample audio file for testing (text-to-speech)"""
    try:
        from gtts import gTTS
        import pygame

        # Create sample Hindi text
        hindi_text = "नमस्ते, मैं एक परीक्षण संदेश हूं"

        tts = gTTS(text=hindi_text, lang='hi')
        tts.save("sample_hindi.mp3")

        print("Sample Hindi audio created: sample_hindi.mp3")
        print("You can now test with: test_audio_file('sample_hindi.mp3')")

    except ImportError:
        print("gTTS not installed. Install with: !pip install gtts pygame")

# Step 9: Batch processing function (bonus)
def batch_transcribe(folder_path):
    """
    Process multiple audio files in a folder
    """
    results = []
    audio_extensions = ['.wav', '.mp3', '.m4a', '.flac', '.ogg']

    for filename in os.listdir(folder_path):
        if any(filename.lower().endswith(ext) for ext in audio_extensions):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing: {filename}")
            result = transcribe_hindi_to_english(file_path)
            results.append({
                'filename': filename,
                'result': result
            })

    return results

print("\n" + "="*50)
print("SETUP COMPLETE!")
print("="*50)
print("The Whisper model is loaded and ready to use.")
print("\nTESTING OPTIONS:")
print("="*50)
print("1. Web Interface: Use the Gradio interface above")
print("2. Upload & Test: Run upload_and_test()")
print("3. Direct File: Run test_audio_file('filename.wav')")
print("4. Create Sample: Run create_sample_audio()")
print("\nEXAMPLE USAGE:")
print("="*50)
print("# Upload file through Colab interface:")
print("upload_and_test()")
print("")
print("# Test specific file:")
print("test_audio_file('my_hindi_audio.wav')")
print("")
print("# Create sample audio for testing:")
print("create_sample_audio()")
print("\nSupported audio formats: WAV, MP3, M4A, FLAC, OGG")
print("The model will translate Hindi speech directly to English text.")

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->openai-whisper)
  Downloading nvidia_cudnn_cu12-9.1.0

100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 47.3MiB/s]


Loaded base model successfully!
Using device: cpu
Creating Gradio interface...
Launching interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://64c6f8039c3a10ed49.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Detected language: te
Detected language: la
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://64c6f8039c3a10ed49.gradio.live

SETUP COMPLETE!
The Whisper model is loaded and ready to use.

TESTING OPTIONS:
1. Web Interface: Use the Gradio interface above
2. Upload & Test: Run upload_and_test()
3. Direct File: Run test_audio_file('filename.wav')
4. Create Sample: Run create_sample_audio()

EXAMPLE USAGE:
# Upload file through Colab interface:
upload_and_test()

# Test specific file:
test_audio_file('my_hindi_audio.wav')

# Create sample audio for testing:
create_sample_audio()

Supported audio formats: WAV, MP3, M4A, FLAC, OGG
The model will translate Hindi speech directly to English text.


In [3]:
# Simple Hindi-to-English Whisper Frontend
# Just upload a file and get results

import whisper
import gradio as gr
import torch
import librosa
import numpy as np
import soundfile as sf
import os

# Load Whisper model (only once)
if 'model' not in globals():
    print("🤖 Loading Whisper model...")
    try:
        model = whisper.load_model("base")  # Start with base model for stability
        print("✅ Model loaded successfully!")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        print("Trying to reload...")
        model = whisper.load_model("base", device="cpu")  # Force CPU if GPU fails
        print("✅ Model loaded on CPU!")

def transcribe_audio(audio_file):
    """
    Simple transcription function
    """
    if audio_file is None:
        return "❌ Please upload an audio file"

    try:
        # Get the file path
        file_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

        print(f"🎯 Processing: {os.path.basename(file_path)}")

        # Load and preprocess audio
        audio = whisper.load_audio(file_path)
        audio = whisper.pad_or_trim(audio)

        # Create mel spectrogram
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Detect language
        _, probs = model.detect_language(mel)
        detected_language = max(probs, key=probs.get)
        confidence = max(probs.values())

        # Transcribe and translate to English
        options = whisper.DecodingOptions(
            language="hi",      # Hindi input
            task="translate",   # Translate to English
            fp16=torch.cuda.is_available()
        )

        result = whisper.decode(model, mel, options)

        # Format results
        translation = result.text.strip()

        return f"""
🌐 **English Translation:**
{translation}

📊 **Detection Info:**
• Detected Language: {detected_language}
• Confidence: {confidence:.1%}
• Status: ✅ Success
        """

    except Exception as e:
        return f"❌ **Error:** {str(e)}"

# Create Gradio interface with file upload
def create_simple_interface():
    """Create a simple upload and transcribe interface"""

    def process_uploaded_file(audio_file):
        """Process the uploaded audio file"""
        if audio_file is None:
            return "❌ Please upload an audio file"

        try:
            # audio_file is the path to the uploaded file
            print(f"🎯 Processing uploaded file: {os.path.basename(audio_file)}")

            # Method 1: Use Whisper's built-in transcribe function (more reliable)
            result = model.transcribe(
                audio_file,
                language="hi",  # Hindi
                task="translate",  # Translate to English
                fp16=False,  # Disable FP16 to avoid GPU issues
                verbose=False
            )

            # Extract results
            translation = result["text"].strip()
            detected_language = result.get("language", "hi")

            return f"""🌐 ENGLISH TRANSLATION:
"{translation}"

📊 DETECTION INFO:
• File: {os.path.basename(audio_file)}
• Detected Language: {detected_language}
• Status: ✅ Success"""

        except Exception as e:
            # Fallback method if the above fails
            try:
                print("Trying alternative processing method...")

                # Load audio manually
                audio = whisper.load_audio(audio_file)
                audio = whisper.pad_or_trim(audio)

                # Create mel spectrogram
                mel = whisper.log_mel_spectrogram(audio)

                # Move to CPU if GPU causes issues
                if torch.cuda.is_available():
                    try:
                        mel = mel.to("cuda")
                        model_device = "cuda"
                    except:
                        mel = mel.to("cpu")
                        model_device = "cpu"
                else:
                    mel = mel.to("cpu")
                    model_device = "cpu"

                # Detect language
                _, probs = model.detect_language(mel)
                detected_language = max(probs, key=probs.get)
                confidence = max(probs.values())

                # Transcribe with safer options
                options = whisper.DecodingOptions(
                    language="hi",
                    task="translate",
                    fp16=False,  # Disable FP16
                    temperature=0.0
                )

                result = whisper.decode(model, mel, options)
                translation = result.text.strip()

                return f"""🌐 ENGLISH TRANSLATION:
"{translation}"

📊 DETECTION INFO:
• File: {os.path.basename(audio_file)}
• Detected Language: {detected_language}
• Confidence: {confidence:.1%}
• Processing: {model_device.upper()}
• Status: ✅ Success (Fallback method)"""

            except Exception as e2:
                return f"""❌ ERROR: {str(e)}

🔧 TROUBLESHOOTING:
• Original error: {str(e)}
• Fallback error: {str(e2)}
• Try a different audio file format
• Make sure the file is a valid audio file
• Audio should be less than 10 minutes for best results"""

    # Create the interface
    interface = gr.Interface(
        fn=process_uploaded_file,
        inputs=gr.Audio(
            label="📁 Upload Hindi Audio File",
            type="filepath",  # This gives us the file path
            sources=["upload"]  # Only allow file upload, not microphone
        ),
        outputs=gr.Textbox(
            label="🎯 Results",
            lines=10,
            show_copy_button=True,
            placeholder="Upload an audio file to see the English translation here..."
        ),
        title="🎵 Hindi → English Speech Translator",
        description="""
        **Simple Steps:**
        1. Click the upload area below
        2. Select your Hindi audio file (.mp3, .wav, .m4a, etc.)
        3. Wait for processing
        4. See the English translation!

        **Supported formats:** MP3, WAV, M4A, FLAC, OGG
        """,
        theme=gr.themes.Soft(),
        allow_flagging="never"
    )

    return interface

# Simple upload function for Colab
def upload_and_transcribe():
    """Simple upload function for Google Colab"""
    from google.colab import files

    print("📁 Select your Hindi audio file...")
    uploaded = files.upload()

    if not uploaded:
        print("❌ No file uploaded")
        return

    # Process the first uploaded file
    filename = list(uploaded.keys())[0]
    print(f"\n🎯 Processing: {filename}")

    try:
        # Load audio
        audio = whisper.load_audio(filename)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Detect language
        _, probs = model.detect_language(mel)
        detected_language = max(probs, key=probs.get)
        confidence = max(probs.values())

        # Transcribe
        options = whisper.DecodingOptions(
            language="hi",
            task="translate",
            fp16=torch.cuda.is_available()
        )

        result = whisper.decode(model, mel, options)

        # Display results
        print("\n" + "="*60)
        print("🎯 TRANSCRIPTION RESULTS")
        print("="*60)
        print(f"📁 File: {filename}")
        print(f"🗣️  Detected Language: {detected_language}")
        print(f"📊 Confidence: {confidence:.1%}")
        print(f"\n🌐 ENGLISH TRANSLATION:")
        print("-" * 40)
        print(f'"{result.text.strip()}"')
        print("="*60)

        return result.text.strip()

    except Exception as e:
        print(f"❌ Error: {e}")
        return None

# Direct transcription function
def transcribe_file(file_path):
    """Direct transcription function"""
    try:
        print(f"🎯 Transcribing: {file_path}")

        # Load and process
        audio = whisper.load_audio(file_path)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Transcribe
        options = whisper.DecodingOptions(
            language="hi",
            task="translate",
            fp16=torch.cuda.is_available()
        )

        result = whisper.decode(model, mel, options)

        print("✅ Transcription completed!")
        print(f"Result: \"{result.text.strip()}\"")

        return result.text.strip()

    except Exception as e:
        print(f"❌ Error: {e}")
        return None

print("🎯 SIMPLE WHISPER FRONTEND READY!")
print("="*50)
print("USAGE OPTIONS:")
print("1. Web Interface (Auto-launched) - Upload files in browser")
print("2. upload_and_transcribe() - Upload in Colab")
print("3. transcribe_file('path/to/file.mp3') - Direct function")

# Launch the simple interface
print("\n🚀 Launching web interface...")
print("📱 The interface will open with a public URL you can share!")
interface = create_simple_interface()
interface.launch(
    share=True,      # Creates public URL
    debug=False,     # Clean output
    quiet=True       # Less verbose
)

🎯 SIMPLE WHISPER FRONTEND READY!
USAGE OPTIONS:
1. Web Interface (Auto-launched) - Upload files in browser
2. upload_and_transcribe() - Upload in Colab
3. transcribe_file('path/to/file.mp3') - Direct function

🚀 Launching web interface...
📱 The interface will open with a public URL you can share!
* Running on public URL: https://d54f018318c599a60e.gradio.live


