## The Splitting functions

The function **split_audio_by_integer** is the only one you need to call directly; `_ms_to_hhmmssms` is an internal helper it uses to format timestamps.

In [1]:
import os
from pydub import AudioSegment

def _ms_to_hhmmssms(ms):
    """Convert milliseconds to HH:MM:SS.mmm format."""
    s, ms = divmod(int(ms), 1000)
    m, s = divmod(s, 60)
    h, m = divmod(m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

def split_audio_by_integer(audio_path, output_dir, x, audio_format="mp3"):
    """
    Splits audio into consecutive chunks of x seconds and saves each chunk.

    The last chunk holds whatever audio remains and may therefore be shorter
    than x seconds. Chunk files are named ``<fileID>_chunkNNN.<audio_format>``
    where ``fileID`` is the input file name without its extension and ``NNN``
    is a 1-based, zero-padded index.

    Args:
        audio_path (str): Path to the audio file (e.g., '001.mp3').
        output_dir (str): Folder where chunks will be saved (created if missing).
        x (int/float): Target chunk length in seconds. Must amount to at
            least one whole millisecond.
        audio_format (str): Output audio format (default: 'mp3').

    Returns:
        List of chunk info dicts, one per exported chunk, with the keys
        'chunk_name', 'start_time', 'end_time' (both HH:MM:SS.mmm strings)
        and 'duration_s' (float seconds).

    Raises:
        ValueError: If x truncates to zero or fewer milliseconds. Without
            this check the chunk start would never advance and the function
            would export files forever.
    """
    x_ms = int(x * 1000)  # target chunk length in milliseconds
    # Validate before touching the filesystem so a bad call has no side effects.
    if x_ms <= 0:
        raise ValueError(
            f"x must be at least 0.001 seconds (one millisecond), got {x!r}"
        )

    os.makedirs(output_dir, exist_ok=True)

    # Extract file ID from filename (without extension)
    file_id = os.path.splitext(os.path.basename(audio_path))[0]

    # Load audio; its length is treated as a duration in milliseconds
    audio = AudioSegment.from_file(audio_path)
    total_duration_ms = len(audio)

    chunks_info = []

    # Walk the audio in x_ms steps; chunk indices start at 1
    chunk_starts = range(0, total_duration_ms, x_ms)
    for chunk_idx, chunk_start_ms in enumerate(chunk_starts, start=1):
        # Clamp the final chunk to the end of the audio
        chunk_end_ms = min(chunk_start_ms + x_ms, total_duration_ms)

        # Extract audio slice
        audio_slice = audio[chunk_start_ms:chunk_end_ms]

        # Naming: fileID_chunkXXX
        chunk_name = f"{file_id}_chunk{chunk_idx:03d}"
        audio_out_path = os.path.join(output_dir, f"{chunk_name}.{audio_format}")

        # Save audio
        audio_slice.export(audio_out_path, format=audio_format)

        # Store chunk info
        chunk_info = {
            "chunk_name": chunk_name,
            "start_time": _ms_to_hhmmssms(chunk_start_ms),
            "end_time": _ms_to_hhmmssms(chunk_end_ms),
            "duration_s": (chunk_end_ms - chunk_start_ms) / 1000.0,
        }
        chunks_info.append(chunk_info)

        print(f"[INFO] Saved {audio_out_path} ({chunk_info['duration_s']:.2f}s)")

    print(f"[INFO] Total chunks created: {len(chunks_info)}")
    return chunks_info


In [3]:
# NOTE(review): absolute, machine-specific input path; adjust before running elsewhere.
mp3_dir = r"C:\Users\ACER\Desktop\old documents\ASR\metadata\chunks\audio"
output_dir = "chunks"
seconds = 60  # specify the chunk length in seconds

# os.listdir returns every directory entry in arbitrary order, including
# sub-folders and non-audio files. Keep only regular .mp3 files and sort
# them so the run order is reproducible.
mp3_files = sorted(
    entry
    for entry in os.listdir(mp3_dir)
    if entry.lower().endswith(".mp3")
    and os.path.isfile(os.path.join(mp3_dir, entry))
)

# Split every input file into fixed-length chunks written to output_dir.
for audio_file in mp3_files:
    audio_path = os.path.join(mp3_dir, audio_file)
    split_audio_by_integer(audio_path, output_dir, seconds)
    print(f"{audio_file} has been processed.")


[INFO] Saved chunks\001_chunk001_chunk001.mp3 (60.00s)
[INFO] Saved chunks\001_chunk001_chunk002.mp3 (0.40s)
[INFO] Total chunks created: 2
001_chunk001.mp3 has been processed.
[INFO] Saved chunks\001_chunk002_chunk001.mp3 (60.00s)
[INFO] Saved chunks\001_chunk002_chunk002.mp3 (4.76s)
[INFO] Total chunks created: 2
001_chunk002.mp3 has been processed.
[INFO] Saved chunks\001_chunk003_chunk001.mp3 (40.64s)
[INFO] Total chunks created: 1
001_chunk003.mp3 has been processed.
[INFO] Saved chunks\002_chunk001_chunk001.mp3 (60.00s)
[INFO] Saved chunks\002_chunk001_chunk002.mp3 (0.04s)
[INFO] Total chunks created: 2
002_chunk001.mp3 has been processed.
[INFO] Saved chunks\002_chunk002_chunk001.mp3 (60.00s)
[INFO] Saved chunks\002_chunk002_chunk002.mp3 (2.08s)
[INFO] Total chunks created: 2
002_chunk002.mp3 has been processed.
[INFO] Saved chunks\002_chunk003_chunk001.mp3 (60.00s)
[INFO] Saved chunks\002_chunk003_chunk002.mp3 (2.80s)
[INFO] Total chunks created: 2
002_chunk003.mp3 has been pro

KeyboardInterrupt: 