<a href="https://colab.research.google.com/github/DebasishTripathy13/unimeds/blob/main/LongAudioOutput.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Setup cell: install the third-party packages for this notebook.
# Lines starting with `!` are IPython/Colab shell escapes, not plain Python,
# so this cell only runs inside a notebook kernel.
print("Installing required packages...")
# `--quiet` trims pip's log output. Each package is installed by its own pip
# invocation, in the order listed.
!pip install --quiet faster-whisper
!pip install --quiet gradio
!pip install --quiet soundfile
!pip install --quiet gtts
!pip install --quiet pydub

# Freshly installed packages may not be importable by the running kernel,
# hence the request to restart before running the next cell.
print("Installation complete. Please restart the runtime (Runtime -> Restart runtime) and then run the next cell.")

Installing required packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.7/39.7 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m115.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstallation complete. Please restart the runtime (Runtime -> Restart runtime) and then run the next cell.


In [14]:
import time
import os
from google.colab import files
import faster_whisper
import soundfile as sf
import io # To handle in-memory audio data if needed
import torch # Make sure torch is imported for the device check


# --- 2. Audio File Upload ---
# Ask the user for an audio file through Colab's upload widget. The code below
# treats `uploaded` as a mapping keyed by uploaded file name.
print("\nPlease select your Hindi audio MP3 file...")
uploaded = files.upload()

if not uploaded:
    print("No file uploaded. Please upload an audio file.")
    # Stop this cell without calling the interactive `exit()` helper.
    # NOTE(review): in an IPython/Colab kernel `exit()` asks the kernel itself
    # to shut down; raising SystemExit only halts the current cell - confirm
    # this is the desired behaviour.
    raise SystemExit

# Only the first uploaded file is processed; any additional files are ignored.
audio_filename = next(iter(uploaded))
print(f"'{audio_filename}' uploaded successfully.")

# --- 3. Define the Speech-to-Text (STT) Function with Measurements ---

def process_audio_with_whisper(
    audio_path: str, *, model_size: str = "large-v2"
) -> tuple[str, float, dict]:
    """
    Translate a Hindi audio file to English text with faster-whisper and time it.

    Args:
        audio_path (str): The path to the audio file.
        model_size (str): Name of the Whisper model to load. Defaults to
            "large-v2", the value this notebook has always used. Smaller
            models run faster but may translate less accurately.

    Returns:
        tuple: A tuple containing:
            - str: The translated (English) text, with the segment texts
                   joined by single spaces and outer whitespace stripped.
            - float: Seconds spent translating, measured from the
                     `transcribe` call until the last segment was consumed.
                     Model loading time is reported separately via print and
                     is not included.
            - dict: Placeholder for quality metrics. Always empty here,
                    because computing e.g. a BLEU score requires a
                    human-verified reference translation.
    """
    print(f"\nStarting transcription and translation for: {audio_path}")

    # Load the model (this can take some time the first time it's run).
    print(f"Loading Faster-Whisper model ({model_size})...")
    load_started = time.perf_counter()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = faster_whisper.WhisperModel(model_size, device=device, compute_type="int8")
    model_load_time = time.perf_counter() - load_started
    print(f"Model loaded in {model_load_time:.2f} seconds.")

    # task="translate" makes Whisper emit English text; the source language is
    # pinned to Hindi instead of relying on auto-detection.
    translation_started = time.perf_counter()
    segments, _ = model.transcribe(audio_path, beam_size=5, language="hi", task="translate")

    # `segments` is a lazy generator: the actual decoding happens while it is
    # iterated. The stop timestamp must therefore be taken AFTER this loop,
    # otherwise only the (near-instant) generator setup would be measured.
    print("\nTranslation results (English text):")
    pieces = []
    for segment in segments:
        print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
        pieces.append(segment.text)
    translation_time = time.perf_counter() - translation_started

    # --- Accuracy and Precision (Conceptual - Requires Ground Truth) ---
    # Translation quality metrics such as BLEU or TER need a human-verified
    # English reference for the audio. None is available here, so the dict is
    # returned empty; a caller with a reference could compute a score (for
    # example with the sacrebleu package) and add it under a key of its choice.
    metrics = {}

    return " ".join(pieces).strip(), translation_time, metrics

# --- 4. Execute the STT Process and Display Results ---
# Run the translation on the uploaded file and print a short report. Any
# failure is reported as a message instead of a traceback, since this is the
# outermost level of the notebook cell.
print("\n--- Starting Audio Processing (Hindi to English Translation) ---")
try:
    (
        transcribed_text,
        time_taken,
        performance_metrics,
    ) = process_audio_with_whisper(audio_filename)

    print("\n--- Summary of Run ---")
    print(f"**Total Translation Time:** {time_taken:.2f} seconds")
    print(f"**Translated Text (English):**\n{transcribed_text}")

    if not performance_metrics:
        # No metrics were produced: explain what would be needed to get them.
        print(
            "\n**Note on Accuracy/Precision (for Translation):** "
            "To measure translation accuracy (e.g., BLEU score), "
            "you need to provide a 'ground truth' "
            "(a manually verified correct English translation) of your audio. "
            "Without it, these metrics cannot be calculated."
        )
    else:
        print("\n**Performance Metrics (Requires Ground Truth for Accuracy/Precision):**")
        for metric, value in performance_metrics.items():
            print(f"- {metric.upper()}: {value:.4f}")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please ensure your audio file is valid, necessary packages are installed, and your Colab runtime is set to GPU.")


Please select your Hindi audio MP3 file...


Saving टुनि और बिल्ली  बच्चों की कहानियां I DADIMAA KI KAHANIYA  Hindi Fairy Tales - Ssoftoons Horror And Crime Hindi.mp3 to टुनि और बिल्ली  बच्चों की कहानियां I DADIMAA KI KAHANIYA  Hindi Fairy Tales - Ssoftoons Horror And Crime Hindi.mp3
'टुनि और बिल्ली  बच्चों की कहानियां I DADIMAA KI KAHANIYA  Hindi Fairy Tales - Ssoftoons Horror And Crime Hindi.mp3' uploaded successfully.

--- Starting Audio Processing (Hindi to English Translation) ---

Starting transcription and translation for: टुनि और बिल्ली  बच्चों की कहानियां I DADIMAA KI KAHANIYA  Hindi Fairy Tales - Ssoftoons Horror And Crime Hindi.mp3
Loading Faster-Whisper model (large-v2)...
Model loaded in 9.57 seconds.

Translation results (English text):
[0.00s -> 1.92s]  Shampoo.
[3.56s -> 5.88s]  I came flying from the king's palace.
[6.12s -> 11.96s]  But now I'll have to make some arrangements to stay somewhere.
[12.52s -> 15.32s]  No. I won't stay at a human's house.
[15.40s -> 17.52s]  I'll go to the forest.
[17.60s -> 19.60s] 