In [1]:
!pip install -q google-generativeai pydub python-dotenv gradio

In [2]:
import io

import google.generativeai as genai
from IPython.display import Audio
from pydub import AudioSegment

# Name of the Gemini model that every GenerativeModel in this notebook is built from.
GEMINI = "gemini-2.5-pro-exp-03-25"

# %% [Authentication]
from google.colab import auth, userdata

# Sign the Colab user in.
# NOTE(review): the Gemini client below is configured with an API key only;
# confirm whether this interactive sign-in is still required.
auth.authenticate_user()

# Read the key from the Colab secret named "GOOGLE_API_KEY" and hand it to the
# Gemini client library.
genai.configure(
    api_key=userdata.get("GOOGLE_API_KEY"),
)

In [3]:
#  [Audio Processing]
def audio_to_text(audio_path: str) -> str:
    """Transcribe an audio file with the Gemini model named by ``GEMINI``.

    The input is decoded with pydub, normalised to 16 kHz / mono / 16-bit PCM,
    encoded as WAV and sent inline to the model together with a transcription
    prompt.

    The WAV bytes are produced in memory instead of going through a fixed
    ``temp.wav`` in the working directory. This avoids leaking file handles,
    leaving a stray file behind, and two overlapping requests overwriting each
    other's audio.

    Args:
        audio_path: Path to the audio file to transcribe; it is opened with
            ``AudioSegment.from_file``.

    Returns:
        The text of the model's response.
    """
    audio = AudioSegment.from_file(audio_path)

    # Normalise to 16 kHz, single channel, 2 bytes per sample before encoding.
    wav_audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)

    # Encode to WAV in memory; export() accepts a file-like object as target.
    wav_buffer = io.BytesIO()
    wav_audio.export(wav_buffer, format="wav")
    wav_bytes = wav_buffer.getvalue()

    model = genai.GenerativeModel(GEMINI)

    # The request is a list of parts: the instruction text followed by the
    # inline audio blob with its MIME type.
    response = model.generate_content(
        [
            """Transcribe this audio with:
            1. Speaker diarization (SPK1, SPK2)
            2. Exact timestamps (HH:MM:SS)
            3. Non-verbal cues [laughter], [pause]
            4. Clean filler words (um, ah)
            """,
            {
                "mime_type": "audio/wav",
                "data": wav_bytes,
            },
        ]
    )

    return response.text

In [4]:
# Transcript summarization
import google.generativeai as genai
from google.colab import userdata

def summarize_transcript(transript):
    """Ask the Gemini model named by ``GEMINI`` to summarise a transcript.

    The client is (re)configured with the "GOOGLE_API_KEY" Colab secret on
    every call, then a single prompt embedding the transcript is sent.

    Args:
        transript: The transcription text to embed in the prompt.

    Returns:
        The text of the model's response.
    """
    # Configure the client and build the model handle.
    genai.configure(api_key=userdata.get("GOOGLE_API_KEY"))
    summarizer = genai.GenerativeModel(GEMINI)

    # Instruction sent to the model: summary plus Capex / positives / negatives.
    request_text = f"""The following is a transcription of an audio file:\n\n{transript}\n\n
    Please summarize the conversation and identify the Capex expenditure, positive and negative about the company."""

    return summarizer.generate_content(request_text).text

In [5]:
def process_audio(audio_path):
    """Transcribe an audio file and summarise the resulting transcript.

    Args:
        audio_path: Path to the audio file. May be ``None`` or empty when no
            file was supplied (e.g. the button was clicked before an upload).

    Returns:
        A ``(transcript, summary, audio_path)`` tuple. The path is passed back
        so the caller can show an audio player. When no path was supplied the
        tuple is ``("", <message asking for a file>, None)`` and no model call
        is made.
    """
    print("path process_audio: ", audio_path)

    # Without this guard a missing path is handed to the audio decoder and
    # fails there with an unrelated error; answer with a clear message instead.
    if not audio_path:
        return "", "No audio file provided. Please upload an audio file first.", None

    # Audio to text conversion
    transcript = audio_to_text(audio_path)

    # Summarization
    summary = summarize_transcript(transcript)

    # Return audio path to display player
    return transcript, summary, audio_path

In [6]:
# # Upload audio file
# from google.colab import files
# uploaded = files.upload()
# audio_file = next(iter(uploaded))
# print("path: ", audio_file)
# print(process_audio(audio_file))

# Audio(audio_file)

In [None]:
import gradio as gr

# UI layout: upload an audio file, press the button, then read the transcript
# and summary side by side and replay the processed file.
with gr.Blocks() as interface:
    gr.Markdown("## Audio Transcription & Summarization")

    with gr.Row():
        audio_input = gr.Audio(label="Upload Audio File", type="filepath")

    with gr.Row():
        transcribe_btn = gr.Button("Transcribe & Summarize")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Transcript")
            transcript_output = gr.Textbox(lines=10, show_label=False)

        with gr.Column(scale=1, variant="panel"):
            gr.Markdown("### Summary")
            summary_output = gr.Markdown()

    with gr.Row():
        # Read-only player for the audio path that process_audio returns.
        audio_player = gr.Audio(
            label="Processed Audio", type="filepath", interactive=False
        )

    # process_audio returns (transcript, summary, audio_path); the outputs
    # list must provide one component per returned value, in the same order.
    transcribe_btn.click(
        fn=process_audio,
        inputs=audio_input,
        outputs=[transcript_output, summary_output, audio_player],
    )

# share=True serves the app on a temporary public URL; debug=True keeps this
# cell running so errors and logs stay visible in the notebook.
interface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://87a8bf2f017253b70f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
