In [None]:
import itertools
import os
from math import ceil

import librosa
from pydub import AudioSegment
from pydub.utils import make_chunks
from pydub.silence import detect_silence
from pydub.silence import split_on_silence
from pydub.silence import detect_nonsilent

#Input audio path example (path to the audio file itself, not its folder)
#/Users/michael/Desktop/test_demo.m4a



In [None]:
# Tunable settings. All lengths are in milliseconds.
#
# target_length: desired length of every exported chunk. A chunk shorter than this
#     may be padded by retaining some of the natural silence that follows it.
# min_silence_len: minimum length a pause must have to be treated as silence
#     (long, unnecessary pauses are what get removed).
# silence_thresh: anything quieter than this dBFS level is considered silence.
#     The lower the threshold, the less readily audio is classified as silence.
# silence_threshold_per_chunk: when a chunk is shorter than target_length, it is only
#     padded with natural silence if it is longer than this value. For example, with
#     the default of half the target length, a chunk under 2.5 secs is considered too
#     small and keeps no silence at all.

target_length = 5000                               # 5 seconds
min_silence_len = 3000                             # 3 seconds
silence_thresh = -40                               # dBFS
channels = 1                                       # global variable: mono output
frame_rate = 22050                                 # global variable: sampling rate in Hz
target_dBFS = -20.0                                # loudness every chunk is normalized to
silence_threshold_per_chunk = target_length // 2   # global variable


In [None]:
# Private helper shared by every place that has to cut a non silent span down to size.
def _append_target_length_chunks(nonsilent_ranges, span_start, span_end, target_length, loop_label):
    """Append [start, end] pieces covering span_start..span_end to nonsilent_ranges.

    Every piece except the last is exactly target_length long; the last piece ends at
    span_end and is at most target_length long. The running list is printed after each
    append, prefixed with loop_label for intermediate pieces and "end: " for the last.
    """
    chunk_start = span_start
    while chunk_start + target_length < span_end:
        nonsilent_ranges.append([chunk_start, chunk_start + target_length])
        chunk_start += target_length
        print(loop_label + str(nonsilent_ranges))

    nonsilent_ranges.append([chunk_start, span_end])
    print("end: " + str(nonsilent_ranges))
    print("-------------------------------")


#Monkey patched
#Added functionality of splicing the audio into chunks with the specified target length
def detect_nonsilent_modified(audio_segment, min_silence_len=1000, silence_thresh=-16, target_length=1000, seek_step=1):
    """Return the non silent [start, end] ranges of audio_segment, none longer than target_length.

    Silence is located with detect_silence(audio_segment, min_silence_len, silence_thresh,
    seek_step). Each stretch between two silent ranges becomes one range, or several
    consecutive ranges of target_length (plus a shorter remainder) when it is too long.

    Parameters:
        audio_segment: object accepted by detect_silence that also supports len().
        min_silence_len: forwarded to detect_silence.
        silence_thresh: forwarded to detect_silence.
        target_length: maximum length of a returned range, same unit as len(audio_segment).
        seek_step: forwarded to detect_silence.

    Returns:
        A list of [start, end] lists in ascending order; [] when the whole segment is silent.

    Raises:
        ValueError: if target_length is not positive.

    Progress is reported with print() while ranges are being spliced.
    """
    if target_length <= 0:
        raise ValueError("target_length must be a positive number")

    silent_ranges = detect_silence(audio_segment, min_silence_len, silence_thresh, seek_step)
    len_seg = len(audio_segment)

    # if there is no silence, the whole thing is nonsilent; it still has to respect
    # target_length, otherwise callers receive one range covering the entire recording
    if not silent_ranges:
        if len_seg <= target_length:
            return [[0, len_seg]]
        nonsilent_ranges = []
        print("Non silent chunk is over target length. Splicing....")
        print("-------------------------------")
        _append_target_length_chunks(nonsilent_ranges, 0, len_seg, target_length, "loop:")
        return nonsilent_ranges

    # short circuit when the whole audio segment is silent
    if silent_ranges[0][0] == 0 and silent_ranges[0][1] == len_seg:
        return []

    # Walk the silent ranges; the audio between the previous silence and this one is non silent
    prev_end_i = 0
    nonsilent_ranges = []

    for start_i, end_i in silent_ranges:
        if start_i - prev_end_i < target_length:
            nonsilent_ranges.append([prev_end_i, start_i])
        else:
            print("Non silent chunk is over target length. Splicing....")
            print("-------------------------------")
            _append_target_length_chunks(nonsilent_ranges, prev_end_i, start_i, target_length, "loop:")
        prev_end_i = end_i

    # Audio after the last silence. Its own length decides how many pieces it is cut
    # into (the length of the previous non silent stretch must not be reused here).
    if prev_end_i != len_seg:
        print("Final non silent chunk.")
        print("-------------------------------")
        _append_target_length_chunks(nonsilent_ranges, prev_end_i, len_seg, target_length, "loop: ")

    # A recording that starts with silence produces an empty leading range; drop it
    if nonsilent_ranges and nonsilent_ranges[0] == [0, 0]:
        nonsilent_ranges.pop(0)

    return nonsilent_ranges



In [None]:
#Monkey patch
# Adds the functionality of keeping natural silence within the specified length
def split_on_silence_modified(audio_segment, min_silence_len=1000, silence_thresh=-16, target_length=1000,
                     seek_step=1):
    """Split audio_segment into chunks of at most target_length, keeping some natural silence.

    The non silent ranges come from detect_nonsilent_modified. A range shorter than
    target_length has its end extended towards target_length, but only when the range
    is longer than the notebook-level setting silence_threshold_per_chunk; shorter
    ranges keep no silence.

    Parameters:
        audio_segment: sliceable object with len(), passed on to detect_nonsilent_modified.
        min_silence_len, silence_thresh, seek_step: forwarded to detect_nonsilent_modified.
        target_length: length each chunk is padded towards.

    Returns:
        A list of slices of audio_segment, one per range, clipped to the segment bounds.

    Progress is reported with print().
    """

    #keep natural silence if the chunk is longer than the threshold (2500 when target length = 5000)
    #For example, a chunk with 3000ms of sound is extended by 2000ms of the silence that
    #follows it, while a chunk with 2000ms of sound is left as it is.
    def keep_silence(start, end, target_length):
        audio_len_per_chunk = end - start
        # Never negative: a chunk that is already longer than target_length must not
        # have its end pulled backwards, which would silently cut audio away.
        silence_length_per_chunk = max(0, target_length - audio_len_per_chunk)

        if audio_len_per_chunk > silence_threshold_per_chunk:
            print("Chunk > threshold. Retaining silence length of: " + str(silence_length_per_chunk))
            return silence_length_per_chunk

        print("Chunk < threshold. Removed silence length of: " + str(silence_length_per_chunk))
        return 0

    #TODO - the last chunk may still be padded with trailing silence it does not need

    #outputs the ranges into array
    # detect_nonsilent_modified is called by name so this function does not depend on
    # the cell that rebinds detect_nonsilent having been run first
    output_ranges = [
        [start, end + int(keep_silence(start, end, target_length))]
        for start, end in detect_nonsilent_modified(
            audio_segment, min_silence_len, silence_thresh, target_length, seek_step
        )
    ]

    print("-------------------------------")
    print("Final output")
    print(output_ranges)

    #checks for overlaps between neighbouring ranges; an overlap is resolved by moving
    #both boundaries to the midpoint
    for range_i, range_ii in zip(output_ranges, output_ranges[1:]):
        last_end = range_i[1]
        next_start = range_ii[0]
        if next_start < last_end:
            print("next start is less than last end")
            range_i[1] = (last_end + next_start) // 2
            print(range_i[1])
            range_ii[0] = range_i[1]

    segment_len = len(audio_segment)
    return [
        audio_segment[max(start, 0):min(end, segment_len)]
        for start, end in output_ranges
    ]

In [None]:
# Define a function to normalize a chunk to a target amplitude.
def match_target_amplitude(aChunk, target_dBFS):
    '''Normalize given audio chunk.

    aChunk: object exposing a dBFS attribute and an apply_gain(gain) method.
    target_dBFS: loudness the chunk should end up with.

    Returns the result of aChunk.apply_gain(target_dBFS - aChunk.dBFS). A chunk whose
    dBFS is negative infinity is returned unchanged, because the gain needed to reach
    any target would be infinite.
    '''
    current_dBFS = aChunk.dBFS
    if current_dBFS == float("-inf"):
        return aChunk

    change_in_dBFS = target_dBFS - current_dBFS
    return aChunk.apply_gain(change_in_dBFS)

# Load your audio.
# Surrounding whitespace is stripped because pasted paths often carry a trailing space or newline.
audio_path = input("Input audio path").strip() #path refers to the file
audio_dir = os.path.dirname(audio_path) #dir refers to the folder
loaded_audio = AudioSegment.from_file(audio_path, format="m4a")

# convert stereo to mono channel, and normalize sampling rate.
loaded_audio = loaded_audio.set_channels(channels)
loaded_audio = loaded_audio.set_frame_rate(frame_rate)

#Get the base audio name.
base_name = os.path.basename(audio_path) #outputs 'file.ext'
base_name_wo_ext = os.path.splitext(base_name)[0] #outputs 'file' only

#make a new directory, next to the audio file, to store exported audio chunks
# os.path.join keeps the folder relative when audio_path has no directory part;
# plain concatenation would turn '' + '/' + name into a folder at the filesystem root.
new_dir = os.path.join(audio_dir, base_name_wo_ext)
try:
    os.makedirs(new_dir, exist_ok=False) #Will not remake directory if directory exists
    print("New folder sucesfully created in: " + new_dir)
except FileExistsError:
    # Only the "already there" case is expected; any other OSError (e.g. permissions)
    # is a real failure and is allowed to surface instead of being misreported.
    print("File already exists. Skipped making directory")

In [None]:
# Point the notebook-level names detect_nonsilent and split_on_silence at the modified
# versions defined above. Only these two names in the notebook are rebound; the
# functions inside the pydub package itself are not replaced.
detect_nonsilent, split_on_silence = detect_nonsilent_modified, split_on_silence_modified


In [None]:
#Splits the loaded audio, passing the adjustable values from the settings cell as arguments
chunks = split_on_silence(loaded_audio, min_silence_len, silence_thresh, target_length)

In [None]:
#Exports your segmented audio, one wav file per chunk, numbered from 1
for chunk_number, chunk in enumerate(chunks, start=1):

    # Bring the whole chunk to the target loudness before writing it out.
    normalized_chunk = match_target_amplitude(chunk, target_dBFS)

    chunk_length = len(chunk)
    export_stem = new_dir + "/" + base_name_wo_ext

    if chunk_length == target_length:
        # Exactly target_length: export under the plain name
        print("Exporting chunk as: " + base_name_wo_ext + "-{0}.wav.".format(chunk_number))
        export_path = export_stem + "-{0}.wav".format(chunk_number)
    elif chunk_length < target_length:
        # Shorter than target_length: export as "leftover"
        print("Exporting chunk as: " + base_name_wo_ext + "-{0}-leftover.wav.".format(chunk_number))
        export_path = export_stem + "-{0}-leftover.wav".format(chunk_number)
    else:
        # Longer than target_length is not expected; export it under a name that stands out
        print('something weird happen')
        export_path = export_stem + "-{0}-weird.wav".format(chunk_number)

    normalized_chunk.export(export_path, format = "wav")
