# Chunking Audio


In [None]:
# 1. Chunking Audio
import shutil
import os
from pydub import AudioSegment
from pydub.silence import detect_silence

def chunk_audio_at_silence(input_file, target_chunk_size=14000, silence_thresh=-40, min_silence_len=500,
                           output_dir="chunks", search_window=5000):
    """Split an MP3 file into chunks of about ``target_chunk_size`` ms, cutting at silence.

    Each chunk is at least ``target_chunk_size`` ms long (except the last one).
    After that point, up to ``search_window`` ms of audio is scanned for
    silence; if any is found the cut is placed at the start of the first
    silent range, otherwise the cut falls exactly at ``target_chunk_size``.

    WARNING: ``output_dir`` is deleted and recreated on every call, so any
    files already inside it are lost.

    Args:
        input_file: Path to the MP3 file to split.
        target_chunk_size: Minimum chunk length in milliseconds; must be > 0.
        silence_thresh: Passed to ``detect_silence`` as ``silence_thresh``.
        min_silence_len: Passed to ``detect_silence`` as ``min_silence_len``.
        output_dir: Directory that receives ``chunk_NNN.mp3`` files.
        search_window: How many milliseconds past the target length to scan
            for silence; must be >= 0.

    Returns:
        List of exported chunk file paths, in playback order.

    Raises:
        ValueError: If ``target_chunk_size`` is not positive or
            ``search_window`` is negative.
    """
    # A non-positive chunk size can leave end_time == start_time, so the
    # loop below would never advance. Reject it before touching any files.
    if target_chunk_size <= 0:
        raise ValueError("target_chunk_size must be a positive number of milliseconds")
    if search_window < 0:
        raise ValueError("search_window must be a non-negative number of milliseconds")

    audio = AudioSegment.from_mp3(input_file)
    total_length = len(audio)

    # Start from an empty output directory so chunks from earlier runs
    # cannot be mixed with the new ones.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    exported_files = []
    start_time = 0

    while start_time < total_length:
        approx_end = start_time + target_chunk_size

        if approx_end >= total_length:
            # Remaining audio is shorter than a full chunk: take it all.
            end_time = total_length
        else:
            # Look for silence just after the approximate end point.
            search_segment = audio[approx_end:approx_end + search_window]
            silence_ranges = detect_silence(
                search_segment,
                min_silence_len=min_silence_len,
                silence_thresh=silence_thresh
            )
            if silence_ranges:
                # Ranges are relative to search_segment; shift to absolute time.
                end_time = approx_end + silence_ranges[0][0]
            else:
                end_time = approx_end

        chunk = audio[start_time:end_time]
        output_file = os.path.join(output_dir, f"chunk_{len(exported_files) + 1:03d}.mp3")
        chunk.export(output_file, format="mp3")
        print(f"Exported {output_file} ({len(chunk)/1000:.1f}s)")
        exported_files.append(output_file)

        start_time = end_time

    return exported_files

# Usage: split sample.mp3 into silence-aligned chunks under ./chunks
input_file = "sample.mp3"
chunk_audio_at_silence(input_file)

Exported chunks/chunk_001.mp3 (14.0s)
Exported chunks/chunk_002.mp3 (14.0s)
Exported chunks/chunk_003.mp3 (14.0s)
Exported chunks/chunk_004.mp3 (14.0s)
Exported chunks/chunk_005.mp3 (4.0s)


# 2. Gemini-2.5-Flash Transcript

In [14]:
# 2. Generate Transcript With Gemini-2.5-Flash
import time
from google import genai
from google.genai import types


# Read the API key from the environment instead of committing it to the
# notebook. SECURITY: a key that has been pasted into source should be
# treated as leaked and revoked.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable before running this cell."
    )

gen_client = genai.Client(api_key=api_key)
cfg = types.GenerateContentConfig(
    automatic_function_calling=types.AutomaticFunctionCallingConfig(
        disable=True
    )
)

MODEL_NAME = "gemini-2.5-flash-preview-05-20"
# Pause between consecutive requests (presumably to stay under an API rate
# limit - confirm against the quota of the key in use).
REQUEST_DELAY_SECONDS = 6

# The prompt is identical for every chunk, so build it once.
prompt = """
    Transcribe the given Speech. Do the diarization and timestamping as well and your output should be in this format:

    S1 (Start_time - End_time): [Transcription for first speaker]
    S2 (Start_time - End_time): [Transcription for second speaker]
    ...
    """

# Collect only the MP3 chunks, in name order (chunk_001, chunk_002, ...),
# ignoring stray files such as OS metadata.
directory_path = "chunks"
file_names = sorted(
    filename
    for filename in os.listdir(directory_path)
    if filename.lower().endswith(".mp3")
    and os.path.isfile(os.path.join(directory_path, filename))
)

# NOTE: each chunk is transcribed independently, so the timestamps in the
# model output restart at 00:00 for every chunk and speaker labels are not
# guaranteed to be consistent across chunks.
transcripts = []
for index, file_name in enumerate(file_names):
    audio_path = os.path.join(directory_path, file_name)
    myfile = gen_client.files.upload(file=audio_path)

    response = gen_client.models.generate_content(
        model=MODEL_NAME,
        contents=[prompt, myfile],
        config=cfg,
    )

    # response.text can be None (e.g. an empty or blocked response); treat
    # that as an empty transcript instead of failing on concatenation.
    text = response.text or ""
    transcripts.append(text)
    print(text)

    # No need to wait after the final request.
    if index < len(file_names) - 1:
        time.sleep(REQUEST_DELAY_SECONDS)

# Join with newlines so the last line of one chunk cannot run into the
# first line of the next.
output = "\n".join(transcripts)

print("Final Transcription: ", output)


S1 (00:00 - 00:03): So I'm just going to be asking you to do some talking.
S2 (00:03 - 00:03): Okay.
S1 (00:04 - 00:07): So, how do you think your speech is these days?
S2 (00:07 - 00:13): Uh it's good, but um um
S1 (00:01 - 00:02): will be better.
S1 (00:04 - 00:08): Um, I can, uh,
S1 (00:11 - 00:13): a little I can read a little bit.
S1 (00:00 - 00:01): operate bad.
S1 (00:01 - 00:02): Um,
S1 (00:03 - 00:04): Oh no.
S1 (00:05 - 00:06): Um,
S1 (00:07 - 00:13): I have trouble with uh, and uh, and


KeyboardInterrupt: 