<a href="https://colab.research.google.com/github/DebasishTripathy13/unimeds/blob/main/Whispershort.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Hindi to English Speech-to-Text using Faster-Whisper on Google Colab
# This notebook transcribes Hindi audio and translates it to English with improved accuracy and large file handling.

# Install required packages (all in one go)
# Each line below is an IPython shell escape (`!`), so this cell only runs inside a notebook.
# NOTE(review): torch and librosa are imported below but are not installed here —
# presumably they are preinstalled in the Colab runtime; confirm.
!pip install --quiet faster-whisper
!pip install --quiet gradio # Kept installed because it is part of the previous script, though not used in this direct execution
!pip install --quiet soundfile
!pip install --quiet gtts
!pip install --quiet pydub # For audio manipulation (optional); NOTE(review): pydub is not imported in the visible cells

import faster_whisper
import gradio as gr # Still imported, but not used for direct execution
import torch
import librosa
import numpy as np
import os
from IPython.display import Audio, display # For playing audio directly in Colab
from google.colab import files # IMPORTANT for uploading!
import io
from gtts import gTTS
import soundfile as sf # Explicitly import soundfile for sf.write

# --- Configuration ---
# Whisper checkpoint name handed to faster-whisper. "small" keeps download size
# and CPU latency manageable; larger checkpoints are generally more accurate.
MODEL_SIZE = "small"

# Prefer half precision on a CUDA GPU when torch can see one; otherwise run
# 8-bit quantised inference on the CPU.
if torch.cuda.is_available():
    COMPUTE_TYPE = "float16"
    DEVICE = "cuda"
    print(f"Using GPU ({torch.cuda.get_device_name(0)}) with {COMPUTE_TYPE} precision.")
else:
    COMPUTE_TYPE = "int8"
    DEVICE = "cpu"
    print(f"Using CPU with {COMPUTE_TYPE} precision.")

# Load Faster-Whisper model (this will download if not cached)
print(f"Loading Faster-Whisper model: {MODEL_SIZE}...")
try:
    model = faster_whisper.WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)
    print(f"Successfully loaded {MODEL_SIZE} model on {DEVICE}!")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Attempting to load on CPU as a fallback...")
    # Update BOTH settings so later code that reports DEVICE / COMPUTE_TYPE
    # describes the model that is actually loaded (previously COMPUTE_TYPE
    # stayed "float16" after a GPU failure).
    DEVICE = "cpu"
    COMPUTE_TYPE = "int8"
    model = faster_whisper.WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)
    print(f"Successfully loaded {MODEL_SIZE} model on CPU!")

# --- Transcription Function ---
def transcribe_hindi_to_english(audio_path_or_array, language="hi"):
    """
    Transcribe speech and translate it to English using Faster-Whisper.

    Args:
        audio_path_or_array: Either a path to an audio file (str) or a
            ``(sample_rate, samples)`` tuple such as a Gradio numpy audio
            input. Tuple input is converted to mono 16 kHz float audio and
            written to a temporary WAV file, which is deleted before returning.
        language (str): Source language code passed to the model
            (default: "hi" for Hindi).

    Returns:
        dict: On success, the keys "detected_language", "confidence"
        (a percentage string) and "english_translation". On failure the same
        keys hold placeholder values and an additional "error" key describes
        the problem; exceptions are reported this way rather than propagated.
    """
    import tempfile  # local import: only needed for the tuple-input path

    temp_audio_file = None
    try:
        if isinstance(audio_path_or_array, tuple):
            sample_rate, audio_data = audio_path_or_array
            audio_data = np.asarray(audio_data)

            # Scale integer PCM samples to floats in roughly [-1, 1].
            if np.issubdtype(audio_data.dtype, np.signedinteger):
                audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
            elif audio_data.dtype == np.uint8:
                audio_data = (audio_data.astype(np.float32) - 128) / 128.0

            # Down-mix BEFORE resampling. Multi-channel input is laid out as
            # (samples, channels); resampling first would operate on the last
            # axis (the channels) and corrupt the signal.
            if audio_data.ndim > 1:
                audio_data = np.mean(audio_data, axis=1)

            if sample_rate != 16000:
                audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
                sample_rate = 16000

            # Reserve a unique path so overlapping calls cannot clobber each
            # other's audio; the file is removed in the `finally` clause.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                temp_audio_file = tmp.name
            sf.write(temp_audio_file, audio_data, sample_rate)
            audio_source = temp_audio_file
        else:
            audio_source = audio_path_or_array  # It's already a file path

        print(f"Starting transcription for: {audio_source}")

        segments, info = model.transcribe(
            audio=audio_source,
            language=language,
            task="translate",
            beam_size=5,
            vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 500},
            without_timestamps=False
        )

        print(f"Detected language: {info.language} with probability {info.language_probability:.4f}")

        # NOTE(review): for word-level timing, faster-whisper exposes
        # `segment.words` when transcribe() is called with word_timestamps=True
        # — confirm against the installed version before relying on it.
        full_translation = " ".join(segment.text.strip() for segment in segments)

        return {
            "detected_language": info.language,
            "confidence": f"{info.language_probability:.2%}",
            "english_translation": full_translation.strip(),
        }

    except Exception as e:
        # Best-effort API: report the failure in the result instead of raising.
        print(f"Error during transcription: {e}")
        return {
            "error": f"Transcription failed: {str(e)}",
            "detected_language": "Unknown",
            "confidence": "0%",
            "english_translation": ""
        }
    finally:
        # Runs on both the success and the error path.
        if temp_audio_file is not None and os.path.exists(temp_audio_file):
            os.remove(temp_audio_file)

# --- Helper Functions for Direct Testing in Colab (These are included in your main script) ---
def create_sample_hindi_audio(filename="sample_hindi.mp3", text="नमस्ते, मैं एक परीक्षण संदेश हूं। यह लंबी ऑडियो फाइल के लिए है।"):
    """Synthesize `text` as Hindi speech with gTTS and save it to `filename`.

    An inline audio player for the saved file is displayed in the notebook.

    Returns:
        The filename on success, or None if any step raised (the error is
        printed rather than propagated).
    """
    try:
        speech = gTTS(text=text, lang='hi')
        speech.save(filename)
        print(f"✅ Sample Hindi audio created: '{filename}'")
        player = Audio(filename, autoplay=False)
        display(player)
    except Exception as err:
        print(f"❌ Error creating sample audio: {err}")
        return None
    return filename

def _print_transcription_result(source_name, result):
    """Print the result banner shared by the Colab test helpers.

    Args:
        source_name: File name or path shown on the "File" line.
        result (dict): Dict returned by transcribe_hindi_to_english; when it
            contains an "error" key only the error message is printed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("🎯 TRANSCRIPTION RESULTS")
    print(rule)
    if 'error' in result:
        print(f"Error: {result['error']}")
    else:
        print(f"📁 File: {source_name}")
        print(f"🗣️ Detected Language: {result.get('detected_language', 'N/A')}")
        print(f"📊 Confidence: {result.get('confidence', 'N/A')}")
        print(f"\n🌐 ENGLISH TRANSLATION:")
        print("-" * 40)
        print(f"'{result.get('english_translation', 'No translation available')}'")
    print(rule)


def upload_and_test_audio():
    """Prompt for a file upload in Colab, translate it, and print the result.

    Returns:
        The result dict from transcribe_hindi_to_english, or None when no
        file was uploaded.
    """
    print("\n📁 Please select your Hindi audio MP3 file...")
    uploaded = files.upload()
    if not uploaded:
        print("❌ No file uploaded.")
        return None
    # Only the first uploaded file is processed; any others are ignored.
    filename = next(iter(uploaded))
    print(f"\n🎯 Processing uploaded file: '{filename}'")
    result = transcribe_hindi_to_english(filename)
    _print_transcription_result(filename, result)
    return result


def test_audio_file_directly(file_path):
    """Translate an audio file that already exists on disk and print the result.

    Args:
        file_path: Path to the audio file.

    Returns:
        The result dict from transcribe_hindi_to_english, or None when
        `file_path` does not exist.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: File '{file_path}' not found!")
        print("Make sure the file is uploaded to Colab or check the path.")
        return None

    print(f"\n🎯 Processing file: '{file_path}'")
    result = transcribe_hindi_to_english(file_path)
    _print_transcription_result(file_path, result)
    return result

# --- Main Execution Block (for direct use without Gradio interface launch) ---
if __name__ == "__main__":
    # Startup banner: reports the active model settings and how to run a file.
    divider = "=" * 70
    banner_lines = (
        "\n" + divider,
        "🚀 Hindi to English Speech Translator Ready for Direct Use!",
        divider,
        f"Model: {MODEL_SIZE}, Device: {DEVICE}, Compute Type: {COMPUTE_TYPE}",
        "\n💡 To run your downloaded audio file, execute `test_audio_file_directly('your_file_name.mp3')` in a new cell.",
        divider,
    )
    for banner_line in banner_lines:
        print(banner_line)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.7/39.7 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hUsing CPU with int8 precision.
Loading Faster-Whisper model: small...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/484M [00:00<?, ?B/s]

Successfully loaded small model on cpu!

🚀 Hindi to English Speech Translator Ready for Direct Use!
Model: small, Device: cpu, Compute Type: int8

💡 To run your downloaded audio file, execute `test_audio_file_directly('your_file_name.mp3')` in a new cell.


In [4]:
# Standalone upload cell: opens Colab's file-upload widget and keeps whatever
# files.upload() returns in the notebook-global `uploaded`.
from google.colab import files

print("Please select your Hindi audio MP3 file...")
uploaded = files.upload()

# After running this, a "Choose Files" button will appear.
# Click it, navigate to your MP3 file, and select it.
# Once uploaded, it will show a message like:
# 'your_audio.mp3' (audio/mpeg) - 123456 bytes uploaded.

Please select your Hindi audio MP3 file...


Saving my_hindi_audio.mp3 to my_hindi_audio.mp3


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing in the visible cells reads from /content/drive —
# confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [7]:
# Translate the uploaded file; the name must match the file saved by the
# upload cell ('my_hindi_audio.mp3').
test_audio_file_directly('my_hindi_audio.mp3')



🎯 Processing file: 'my_hindi_audio.mp3'
Starting transcription for: my_hindi_audio.mp3
Detected language: hi with probability 1.0000

🎯 TRANSCRIPTION RESULTS
📁 File: my_hindi_audio.mp3
🗣️ Detected Language: hi
📊 Confidence: 100.00%

🌐 ENGLISH TRANSLATION:
----------------------------------------
'Once upon a time, there was a farmer in a village He was very fond of raising animals He had a lot of buffaloes He used to sell milk and run his own house One day, the farmer took care of a frog and a monkey After a long time The farmer thought of playing with the frog and monkey The farmer made a lot of small pots in his field Then he put soil on those pots and closed them In one pot, the farmer hid a carrot The farmer asked the farmer and monkey to look for the carrot The farmer was very confident and he believed in himself He started digging each pot and started looking for the carrot But the monkey was very negative and lazy He thought that there are so many pots in the field But the carr

{'detected_language': 'hi',
 'confidence': '100.00%',
 'english_translation': "Once upon a time, there was a farmer in a village He was very fond of raising animals He had a lot of buffaloes He used to sell milk and run his own house One day, the farmer took care of a frog and a monkey After a long time The farmer thought of playing with the frog and monkey The farmer made a lot of small pots in his field Then he put soil on those pots and closed them In one pot, the farmer hid a carrot The farmer asked the farmer and monkey to look for the carrot The farmer was very confident and he believed in himself He started digging each pot and started looking for the carrot But the monkey was very negative and lazy He thought that there are so many pots in the field But the carrot is in one pot Who will look for it? The monkey went to the pot and slept The farmer was looking for the carrot The farmer looked at all the pots But the carrot was not found in any pot There was only one pot left The 

# New Section