<a href="https://colab.research.google.com/github/Fonyuy-pounds/Python-starter/blob/master/TextCaption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio openai-whisper torchaudio pydub ffmpeg-python
!apt-get install ffmpeg

In [None]:
from IPython import get_ipython
from IPython.display import display
# %%
!pip install gradio openai-whisper torchaudio pydub ffmpeg-python
!apt-get install ffmpeg
# %%
from IPython.display import display
import gradio as gr
import whisper
import torch
import tempfile
from pydub import AudioSegment
import os
from typing import Optional

# Install the Python dependencies and the ffmpeg system package through the
# notebook shell (get_ipython() is only available inside IPython/Jupyter).
get_ipython().system('pip install gradio openai-whisper torchaudio pydub ffmpeg-python')
get_ipython().system('apt-get install ffmpeg')

# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Single shared Whisper model ("medium" checkpoint) used by every handler below.
model = whisper.load_model("medium", device=device)

def transcribe_audio(audio_path: str, language: Optional[str] = None) -> str:
    """Transcribe an audio file with the module-level Whisper ``model``.

    Args:
        audio_path: Path to the audio file to transcribe.
        language: Language code to decode with (e.g. ``"en"``). When ``None``
            the language is detected by Whisper and printed.

    Returns:
        The transcribed text, or a string starting with
        ``"Error during transcription: "`` if anything went wrong.
    """
    try:
        # model.transcribe() walks the whole file in consecutive 30-second
        # windows. The previous pad_or_trim() + single decode() call only ever
        # covered the first 30 seconds, silently truncating longer recordings.
        result = model.transcribe(audio_path, language=language, fp16=False)

        if language is None:
            print(f"Detected language: {result['language']}")

        # decode() returned stripped text; keep that contract for callers.
        return result["text"].strip()
    except Exception as e:
        # Handlers display the returned string directly, so report the failure
        # as text instead of raising.
        return f"Error during transcription: {str(e)}"

def process_uploaded_file(file, language):
    """Transcribe an uploaded audio or video file.

    Args:
        file: The value delivered by the upload component. Either a path
            string (the "filepath" mode of ``gr.File``) or a file-like wrapper
            exposing the path as ``.name``; ``None`` when nothing was uploaded.
        language: Language code, or ``"auto"`` to let Whisper detect it.

    Returns:
        The transcription text, ``"No file uploaded"`` when ``file`` is
        ``None``, or a string starting with ``"Error processing file: "`` on
        failure.
    """
    if file is None:
        return "No file uploaded"

    converted_path = None
    try:
        # Work from the path of the upload instead of calling file.read():
        # a path string has no read() method, which made every upload fail.
        if isinstance(file, (str, os.PathLike)):
            source_path = os.fspath(file)
        else:
            source_path = file.name

        if source_path.lower().endswith(('.mp4', '.mov', '.avi')):
            # Extract the audio track of video containers into a temporary WAV.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                converted_path = tmp_file.name
            AudioSegment.from_file(source_path).export(converted_path, format="wav")
            audio_path = converted_path
        else:
            audio_path = source_path

        return transcribe_audio(audio_path, language if language != "auto" else None)
    except Exception as e:
        return f"Error processing file: {str(e)}"
    finally:
        # Only remove the file created here; the upload itself is left to
        # whoever produced it. Runs on success and on failure alike.
        if converted_path is not None and os.path.exists(converted_path):
            os.unlink(converted_path)

def live_transcribe(audio):
    """Transcribe a microphone recording.

    Args:
        audio: The recording as delivered by the audio component: a path to
            the recorded file (the component in this file is declared with
            ``type="filepath"``), a ``(sample_rate, samples)`` tuple, or
            ``None`` when nothing was recorded.

    Returns:
        The transcription text (language auto-detected), or
        ``"No audio detected"`` when ``audio`` is ``None``.
    """
    if audio is None:
        return "No audio detected"

    # Path input: the recording is already a file on disk, so hand it over
    # directly. Indexing the path with audio[1] would yield a single character.
    if isinstance(audio, (str, os.PathLike)):
        return transcribe_audio(os.fspath(audio))

    # Tuple input: write a real WAV container (header included) so the file
    # can be decoded. NOTE(review): assumes `samples` is a NumPy array of
    # integer PCM shaped (n,) or (n, channels) — confirm against the caller.
    import wave

    sample_rate, samples = audio
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        temp_path = f.name
    try:
        with wave.open(temp_path, "wb") as wav_file:
            wav_file.setnchannels(1 if samples.ndim == 1 else samples.shape[1])
            wav_file.setsampwidth(samples.dtype.itemsize)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(samples.tobytes())
        return transcribe_audio(temp_path)
    finally:
        # Always remove the temporary file, even if writing or transcription fails.
        os.unlink(temp_path)

 # Create Gradio interface
with gr.Blocks(title="Audio/Video Transcription", theme=gr.themes.Soft()) as demo:
    # Page header shown above the tabs.
    gr.Markdown("""
    # 🎤 Audio/Video Transcription Tool
    Upload an audio/video file or use your microphone for live transcription
    """)

    with gr.Tabs():
        # Tab 1: transcribe an uploaded audio or video file.
        with gr.Tab("File Upload"):
            accepted_extensions = [".mp3", ".wav", ".ogg", ".flac", ".mp4", ".mov", ".avi"]
            file_input = gr.File(
                label="Upload Audio or Video File",
                file_types=accepted_extensions,
            )

            # "auto" is mapped to automatic language detection by the handler.
            language_choices = ["auto", "en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "hi"]
            language = gr.Dropdown(
                label="Language (select 'auto' for automatic detection)",
                choices=language_choices,
                value="auto",
            )

            file_output = gr.Textbox(label="Transcription", lines=10)
            file_button = gr.Button("Transcribe", variant="primary")

        # Tab 2: record from the microphone, then transcribe the recording.
        with gr.Tab("Live Transcription"):
            gr.Markdown("Record your voice and transcribe it in real-time")
            # The microphone is selected through the `sources` list parameter.
            live_audio = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Speak now",
            )
            live_output = gr.Textbox(label="Transcription", lines=10)
            live_button = gr.Button("Transcribe Recording", variant="primary")

    # Wire each button to its handler.
    file_button.click(
        fn=process_uploaded_file,
        inputs=[file_input, language],
        outputs=[file_output],
    )
    live_button.click(
        fn=live_transcribe,
        inputs=[live_audio],
        outputs=[live_output],
    )

# Start the app with debug output and a public share link.
demo.launch(debug=True, share=True)