<a href="https://colab.research.google.com/github/DebasishTripathy13/unimeds/blob/main/Whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
print("Installing required packages...")
!pip install --quiet faster-whisper
!pip install --quiet gradio
!pip install --quiet soundfile
!pip install --quiet gtts
!pip install --quiet pydub
!pip install --humanize
!pip install --quiet git+https://github.com/guillaumekln/faster-whisper.git
!pip install --quiet transformers datasets sacremoses librosa psutil humanize
!pip install --quiet huggingface_hub


print("Installation complete. Please restart the runtime (Runtime -> Restart runtime) and then run the next cell.")

In [None]:
import shutil
import os

# Hugging Face hub cache entry of the model this notebook currently uses.
cache_path = "/root/.cache/huggingface/hub/models--vasista22--whisper-hindi-small"

# Only touch the filesystem when the cache entry is really there.
if not os.path.exists(cache_path):
    print("Cache directory not found. No action needed.")
else:
    print(f"Removing corrupted cache at: {cache_path}")
    # Deletes the whole directory tree below cache_path.
    shutil.rmtree(cache_path)
    print("Cache removed successfully. Please run your transcription code again.")

In [None]:
import time
import os
from google.colab import files
import faster_whisper
import soundfile as sf
import io
import torch
import psutil  # Import psutil for memory usage
import humanize  # pip install humanize to make memory output readable
import librosa # To get audio duration

# --- 2. Audio File Upload ---
print("\nPlease select your Hindi audio MP3 file...")
uploaded = files.upload()

if not uploaded:
    print("No file uploaded. Please upload an audio file.")
    # `exit()` is a helper the `site` module adds for interactive shells and is
    # not meant for program code; raising SystemExit stops this cell explicitly.
    raise SystemExit(1)

# Only the first uploaded file is processed; any others are ignored.
audio_filename = next(iter(uploaded))
print(f"'{audio_filename}' uploaded successfully.")

# --- 3. Define the Speech-to-Text (STT) Function with Measurements ---

def _gpu_memory_in_use():
    """Return the bytes currently in use on the active CUDA device (0 without CUDA).

    The reading is device-wide (total minus free), so it also covers memory
    allocated outside PyTorch's caching allocator. `torch.cuda.memory_allocated()`
    only counts tensors PyTorch itself allocated, which makes it unsuitable for
    observing a model that is not a PyTorch module.
    """
    if not torch.cuda.is_available():
        return 0
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    return total_bytes - free_bytes


def process_audio_with_whisper(audio_path: str):
    """
    Translate a Hindi audio file to English with faster-whisper and measure the run.

    Args:
        audio_path (str): The path to the audio file.

    Returns:
        tuple: A tuple containing:
            - str: The translated (English) text, segments joined by spaces.
            - float: Time taken for the translation in seconds, including the
              decoding of every segment.
            - dict: Performance metrics: load/translation times, humanized RAM
              readings, VRAM readings when running on CUDA, and the audio
              duration / real-time factor when the duration could be determined.
    """
    print(f"\nStarting transcription and translation for: {audio_path}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Release PyTorch's cached blocks first so they do not distort the baseline.
    if device == "cuda":
        torch.cuda.empty_cache()

    # Baseline readings before the model is loaded.
    process = psutil.Process(os.getpid())
    initial_ram = process.memory_info().rss
    initial_vram = _gpu_memory_in_use()

    model_size = "large-v2"
    print(f"Loading Faster-Whisper model ({model_size})...")
    load_started = time.perf_counter()
    model = faster_whisper.WhisperModel(model_size, device=device, compute_type="int8")
    model_load_time = time.perf_counter() - load_started
    print(f"Model loaded in {model_load_time:.2f} seconds.")

    # Readings after model loading.
    ram_after_load = process.memory_info().rss
    vram_after_load = _gpu_memory_in_use()

    # Determine the audio duration *outside* the timed section so that decoding
    # the file a second time does not inflate the translation time.
    try:
        try:
            audio_duration = librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only know the previous keyword name.
            audio_duration = librosa.get_duration(filename=audio_path)
    except Exception as e:
        print(f"Could not get audio duration: {e}")
        audio_duration = None

    # `transcribe` returns a lazy generator: the decoding work happens while the
    # segments are iterated, so the loop has to sit inside the timed section.
    translation_started = time.perf_counter()
    segments, info = model.transcribe(audio_path, beam_size=5, language="hi", task="translate")

    print("\nTranslation results (English text):")
    segment_texts = []
    for segment in segments:
        print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
        segment_texts.append(segment.text)
    translation_time = time.perf_counter() - translation_started
    translated_text = " ".join(segment_texts)

    # Readings after all segments have been decoded.
    ram_after_transcribe = process.memory_info().rss
    vram_after_transcribe = _gpu_memory_in_use()

    # Fall back to the duration reported by faster-whisper when librosa failed.
    if audio_duration is None:
        audio_duration = getattr(info, "duration", None)

    # --- Performance Metrics Dictionary ---
    # NOTE: the "peak_*" keys hold the reading taken after the run finished,
    # not a sampled maximum; the key names are kept for existing callers.
    metrics = {
        "model_load_time": model_load_time,
        "translation_time": translation_time,
        "total_processing_time": model_load_time + translation_time,
        "initial_ram_usage": humanize.naturalsize(initial_ram),
        "peak_ram_usage": humanize.naturalsize(ram_after_transcribe),
        "ram_consumed_by_model": humanize.naturalsize(ram_after_load - initial_ram),
        "ram_consumed_by_transcription": humanize.naturalsize(ram_after_transcribe - ram_after_load),
    }

    if device == "cuda":
        metrics["initial_vram_usage"] = humanize.naturalsize(initial_vram)
        metrics["peak_vram_usage"] = humanize.naturalsize(vram_after_transcribe)
        metrics["vram_consumed_by_model"] = humanize.naturalsize(vram_after_load - initial_vram)
        metrics["vram_consumed_by_transcription"] = humanize.naturalsize(vram_after_transcribe - vram_after_load)

    # A missing or zero duration would make the real-time factor meaningless.
    if audio_duration:
        metrics["audio_duration"] = audio_duration
        metrics["real_time_factor"] = translation_time / audio_duration

    return translated_text.strip(), translation_time, metrics

# --- 4. Execute the STT Process and Display Results ---
# Runs the translation once and prints a markdown-flavoured report. Any failure,
# whether in the processing or in the reporting, ends up in the handler below.
print("\n--- Starting Audio Processing (Hindi to English Translation) ---")
try:
    transcribed_text, time_taken, performance_metrics = process_audio_with_whisper(audio_filename)

    print("\n--- Summary of Run ---")
    print(f"**Translated Text (English):**\n{transcribed_text}")

    # Timing section: both values are plain floats in seconds.
    print("\n**Performance Metrics:**")
    for _title, _field in (
        ("Total Translation Time", "translation_time"),
        ("Total Processing Time (including model load)", "total_processing_time"),
    ):
        print(f"- **{_title}:** {performance_metrics[_field]:.2f} seconds")

    # Duration-based figures exist only when the duration could be determined.
    if 'audio_duration' in performance_metrics:
        print(f"- **Audio Duration:** {performance_metrics['audio_duration']:.2f} seconds")
        print(f"- **Real-Time Factor (RTF):** {performance_metrics['real_time_factor']:.2f} (a value < 1 is faster than real-time)")

    # RAM and VRAM rows share one layout; only the memory kind differs.
    _memory_rows = (
        ("Initial {kind} Usage", "initial_{key}_usage"),
        ("Peak {kind} Usage", "peak_{key}_usage"),
        ("{kind} Consumed by Model", "{key}_consumed_by_model"),
        ("{kind} Consumed During Transcription", "{key}_consumed_by_transcription"),
    )

    print("\n**Memory Usage:**")
    for _title, _field in _memory_rows:
        print(f"- **{_title.format(kind='RAM')}:** {performance_metrics[_field.format(key='ram')]}")

    # The VRAM entries are present only for runs on a CUDA device.
    if 'peak_vram_usage' in performance_metrics:
        print("\n**VRAM (GPU Memory) Usage:**")
        for _title, _field in _memory_rows:
            print(f"- **{_title.format(kind='VRAM')}:** {performance_metrics[_field.format(key='vram')]}")

    print("\n**Note on Accuracy/Precision (for Translation):** To measure translation accuracy (e.g., BLEU score), you need to provide a 'ground truth' (a manually verified correct English translation) of your audio. Without it, these metrics cannot be calculated.")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please ensure your audio file is valid, necessary packages are installed, and your Colab runtime is set to GPU.")

In [None]:
import time
import os
from google.colab import files
import faster_whisper
import soundfile as sf
import io
import torch
import psutil
import humanize
import librosa
import getpass

from huggingface_hub import login
from transformers import pipeline

# ==============================
# 1. Hugging Face Authentication (Optional)
# ==============================
def setup_huggingface_login():
    """Interactively offer a Hugging Face login and report whether it succeeded.

    Asks on stdin whether a token should be used; if so, reads the token without
    echo and passes it to `login`. The module-level `hf_token` is reset to None
    and set to the stripped token only after a successful login.

    Returns:
        bool: True when the login succeeded, False when it was declined, the
        token was empty, or `login` raised.
    """
    global hf_token

    print("=== Hugging Face Authentication ===")
    print("Get your token from: https://huggingface.co/settings/tokens")

    answer = input("\nDo you want to login with a Hugging Face token? (y/n): ").lower().strip()
    hf_token = None

    # Anything other than an explicit yes means: carry on anonymously.
    if answer not in ('y', 'yes'):
        print("Continuing without Hugging Face authentication...")
        return False

    entered = getpass.getpass("Enter your Hugging Face access token: ").strip()
    if not entered:
        print("Empty token provided. Continuing without authentication...")
        return False

    try:
        login(token=entered)
    except Exception as e:
        # A rejected token is not fatal; the notebook continues unauthenticated.
        print(f"❌ Login failed: {str(e)}")
        print("Continuing without authentication...")
        return False

    hf_token = entered
    print("✅ Successfully logged in to Hugging Face!")
    return True

# Run the optional login once; the flag records whether it succeeded.
hf_authenticated = setup_huggingface_login()

# ==============================
# 2. Audio Upload
# ==============================
print("\n" + "="*50)
print("AUDIO FILE UPLOAD")
print("="*50)
print("Please select your Hindi audio MP3 file...")
uploaded = files.upload()

if not uploaded:
    print("❌ No file uploaded. Please upload an audio file.")
    # `exit()` is a helper the `site` module adds for interactive shells and is
    # not meant for program code; raising SystemExit stops this cell explicitly.
    raise SystemExit(1)

# Only the first uploaded file is processed; any others are ignored.
audio_filename = next(iter(uploaded))
print(f"✅ '{audio_filename}' uploaded successfully.")

# ==============================
# 3. Transcription with Collabora Hindi Model
# ==============================
def process_audio_with_collabora(audio_path: str):
    """Transcribe a Hindi audio file on the CPU with the Collabora small model.

    Args:
        audio_path (str): Path to the audio file to transcribe.

    Returns:
        tuple: `(text, transcription_time, metrics, info)` on success, where
        `text` is the stripped transcript, `transcription_time` is in seconds,
        `metrics` is a dict of run statistics and `info` is the second value
        returned by `model.transcribe`. On any error the problem is printed and
        `(None, None, None, None)` is returned instead of raising.
    """
    model_name = "collabora/faster-whisper-small-hindi"

    print(f"\n{'='*50}")
    print("TRANSCRIPTION PROCESS")
    print(f"{'='*50}")
    print(f"🎵 Processing audio file: {audio_path}")
    print(f"🤖 Using model: {model_name}")

    process = psutil.Process(os.getpid())
    initial_ram = process.memory_info().rss
    print(f"\n📊 Initial RAM: {humanize.naturalsize(initial_ram)}")

    device = "cpu"
    print(f"   Device: {device}")

    start_model_load_time = time.time()
    try:
        # Load Collabora's Hindi Whisper model (SMALL)
        model = faster_whisper.WhisperModel(model_name,
                                            device=device,
                                            compute_type="int8")
        model_load_time = time.time() - start_model_load_time
        print(f"✅ Model loaded in {model_load_time:.2f} seconds.")

        # The duration is informational only: determine it before the timer
        # starts, and never let a failure here abort the transcription.
        audio_duration = None
        try:
            try:
                audio_duration = librosa.get_duration(path=audio_path)
            except TypeError:
                # Older librosa releases only know the previous keyword name.
                audio_duration = librosa.get_duration(filename=audio_path)
        except Exception as e:
            print(f"\n⚠️ Could not determine audio duration: {e}")
        else:
            print(f"\n🎵 Audio duration: {audio_duration:.2f} seconds")

        start_transcription_time = time.time()
        segments, info = model.transcribe(
            audio_path,
            language="hi",
            beam_size=5,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=500)
        )

        # `segments` is a lazy generator; building the list performs the actual
        # decoding, so it has to happen before the timer is stopped.
        segment_texts = [segment.text for segment in segments]
        transcription_time = time.time() - start_transcription_time
        transcribed_text = " ".join(segment_texts)

        metrics = {
            "model_name": model_name,
            "device": device,
            "model_load_time": model_load_time,
            "transcription_time": transcription_time,
            "total_time": model_load_time + transcription_time,
            # Number of segments, not the number of words in the transcript.
            "segments_processed": len(segment_texts)
        }
        if audio_duration is not None:
            metrics["audio_duration"] = audio_duration

        return transcribed_text.strip(), transcription_time, metrics, info
    except Exception as e:
        print(f"❌ Error during transcription: {str(e)}")
        return None, None, None, None

# Transcribe the uploaded file. `result` is a 4-tuple; when the transcription
# fails, every element of it is None.
result = process_audio_with_collabora(audio_filename)

# ==============================
# 4. Translation (Always Run)
# ==============================
def _split_for_translation(text, max_chars=400):
    """Split *text* into pieces of at most roughly *max_chars* characters.

    The text is first cut after sentence terminators (including the Devanagari
    danda). A sentence that is still too long, e.g. an unpunctuated transcript,
    is cut further at word boundaries. A single word longer than *max_chars*
    is kept whole.
    """
    import re

    pieces = []
    for sentence in re.split(r"(?<=[।!?.])\s+", text.strip()):
        current = ""
        for word in sentence.split():
            candidate = f"{current} {word}" if current else word
            if current and len(candidate) > max_chars:
                pieces.append(current)
                current = word
            else:
                current = candidate
        if current:
            pieces.append(current)
    return pieces


def translate_hindi_to_english(text):
    """Translate Hindi *text* to English with the Helsinki-NLP opus-mt-hi-en model.

    The input is translated piece by piece instead of as one string, so a long
    transcript is not pushed through the model as a single over-long sequence.

    Args:
        text: The Hindi text to translate.

    Returns:
        str: The English translation, pieces joined by single spaces; an empty
        string when *text* is empty or contains only whitespace.
    """
    print("\n🌐 Translating via Hugging Face Transformers...")

    # Nothing to translate: skip loading the model altogether.
    if not text or not text.strip():
        return ""

    translator = pipeline("translation", model="Helsinki-NLP/opus-mt-hi-en", device=-1)
    pieces = _split_for_translation(text)
    # truncation=True is a safety net for a piece that is still too long.
    outputs = translator(pieces, max_length=512, truncation=True)

    translated = []
    for item in outputs:
        # Tolerate a nested list per input as well as a plain dict per input.
        if isinstance(item, list):
            item = item[0]
        translated.append(item['translation_text'].strip())
    return " ".join(translated)

# ==============================
# 5. Save and Download Results
# ==============================
# A None transcript is how process_audio_with_collabora signals a failure.
if result[0] is not None:
    transcribed_text, transcription_time, metrics, info = result

    print(f"\n{'='*50}")
    print("TRANSCRIBED TEXT (Hindi)")
    print(f"{'='*50}")
    print(transcribed_text)

    english_translation = translate_hindi_to_english(transcribed_text)
    print(f"\n{'='*50}")
    print("TRANSLATED TEXT (English)")
    print(f"{'='*50}")
    print(english_translation)

    # The timestamp keeps repeated runs from overwriting each other's output.
    output_filename = f"transcription_translation_smallhindi_{int(time.time())}.txt"
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write("Model: collabora/faster-whisper-small-hindi\n")
        f.write(f"Audio File: {audio_filename}\n")
        f.write(f"Transcription Time: {transcription_time:.2f} seconds\n")
        if info:
            f.write(f"Detected Language: {info.language}\n")
            f.write(f"Language Probability: {info.language_probability:.2f}\n")
        f.write("\n" + "="*50 + "\n")
        f.write("TRANSCRIBED TEXT (Hindi)\n")
        f.write("="*50 + "\n")
        f.write(transcribed_text + "\n")
        f.write("\n" + "="*50 + "\n")
        f.write("TRANSLATED TEXT (English)\n")
        f.write("="*50 + "\n")
        f.write(english_translation + "\n")

    print(f"\n💾 Saved to: {output_filename}")
    try:
        files.download(output_filename)
    except Exception as e:
        # The download stays best-effort, but the failure is reported instead of
        # vanishing; a bare `except:` would also swallow KeyboardInterrupt.
        print(f"⚠️ Automatic download failed ({e}). The file is still available at: {output_filename}")
else:
    print("❌ Transcription failed. Please check your audio file and try again.")

print(f"\n{'='*50}")
print("PROCESS COMPLETED")
print(f"{'='*50}")
