In [1]:
pip install numpy pydub librosa soundfile 

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import numpy as np
import librosa
import soundfile as sf
from scipy.signal import find_peaks

In [2]:
def extract_mfccs(audio, sr, n_mfcc=20):
    """Return the MFCC matrix that ``librosa.feature.mfcc`` computes for ``audio``.

    Args:
        audio: Audio signal, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        n_mfcc: Number of coefficients requested from librosa.

    Returns:
        The array returned by ``librosa.feature.mfcc``, unmodified.
    """
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

def extract_pitch(audio, sr, hop_length=512):
    """Return, for every frame, the largest pitch value reported by ``piptrack``.

    Only the pitch array from ``librosa.core.piptrack`` is used; the magnitude
    array it also returns is discarded, so the maximum is taken over all pitch
    candidates of a frame regardless of their strength.

    Args:
        audio: Audio signal, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        hop_length: Hop size (in samples) between analysis frames.

    Returns:
        The maximum of the pitch array along its first axis.
    """
    pitch_candidates = librosa.core.piptrack(y=audio, sr=sr, hop_length=hop_length)[0]
    return np.max(pitch_candidates, axis=0)

def compute_energy(audio, frame_length=1024, hop_length=512):
    """Return the frame-wise RMS energy of ``audio``.

    Args:
        audio: Audio signal, forwarded to librosa as ``y``.
        frame_length: Analysis window length in samples.
        hop_length: Hop size (in samples) between analysis frames.

    Returns:
        The first row of the array produced by ``librosa.feature.rms``.
    """
    rms_frames = librosa.feature.rms(
        y=audio,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return rms_frames[0]

def extract_spectral_centroid(audio, sr, hop_length=512):
    """Return the frame-wise spectral centroid of ``audio``.

    Args:
        audio: Audio signal, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        hop_length: Hop size (in samples) between analysis frames.

    Returns:
        The first row of the array produced by
        ``librosa.feature.spectral_centroid``.
    """
    centroid_frames = librosa.feature.spectral_centroid(
        y=audio,
        sr=sr,
        hop_length=hop_length,
    )
    return centroid_frames[0]

def extract_spectral_bandwidth(audio, sr, hop_length=512):
    """Return the frame-wise spectral bandwidth of ``audio``.

    Args:
        audio: Audio signal, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        hop_length: Hop size (in samples) between analysis frames.

    Returns:
        The first row of the array produced by
        ``librosa.feature.spectral_bandwidth``.
    """
    bandwidth_frames = librosa.feature.spectral_bandwidth(
        y=audio,
        sr=sr,
        hop_length=hop_length,
    )
    return bandwidth_frames[0]

def extract_zero_crossing_rate(audio, frame_length=1024, hop_length=512):
    """Return the frame-wise zero-crossing rate of ``audio``.

    Args:
        audio: Audio signal, forwarded to librosa as ``y``.
        frame_length: Analysis window length in samples.
        hop_length: Hop size (in samples) between analysis frames.

    Returns:
        The first row of the array produced by
        ``librosa.feature.zero_crossing_rate``.
    """
    zcr_frames = librosa.feature.zero_crossing_rate(
        y=audio,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return zcr_frames[0]

In [3]:
def feature_based_segmentation(audio, sr, threshold_ratio=0.15, hop_length=256, n_mfcc=20):
    """Locate segmentation points as peaks of the short-time energy envelope.

    Peak detection runs on the RMS energy only. Pitch, spectral centroid and
    spectral bandwidth are intentionally not computed: they did not contribute
    to the detection signal, and mixing arrays framed with different hop sizes
    fails on a shape mismatch.

    Args:
        audio: Audio signal (1-D array of samples).
        sr: Sampling rate of ``audio``. Unused by the energy-only detector;
            kept so existing callers keep working.
        threshold_ratio: Fraction of the maximum energy a peak must reach.
        hop_length: Hop size (in samples) used to frame the energy envelope.
            The returned indices are frame indices at this hop size, so any
            frame-to-sample conversion must use the same value.
        n_mfcc: Unused; kept for backward compatibility of the signature.

    Returns:
        Array of frame indices (as returned by ``scipy.signal.find_peaks``)
        where the energy has a local maximum of at least
        ``threshold_ratio * max(energy)``.
    """
    # Frame the energy with the caller's hop so the indices match `hop_length`.
    energy = compute_energy(audio, hop_length=hop_length)

    # np.max raises on an empty array; an empty envelope simply has no peaks.
    if energy.size == 0:
        return np.array([], dtype=int)

    # Dynamic threshold relative to the loudest frame of this recording.
    threshold = threshold_ratio * np.max(energy)
    peaks, _ = find_peaks(energy, height=threshold)

    return peaks

In [8]:
def save_audio_segments_based_on_peaks(audio, sr, peaks, output_folder, file_name, hop_length=512):
    """Cut ``audio`` at the given peak frames and write each piece as a WAV file.

    Every peak starts a new segment that runs up to the next peak; the segment
    started by the last peak runs to the end of ``audio``. Audio before the
    first peak is not written. Files are named
    ``<file_name without extension>_segment_<k>.wav`` with ``k`` starting at 1.

    Args:
        audio: Audio signal (sliceable sequence of samples).
        sr: Sampling rate, used for the WAV header and the duration values.
        peaks: Frame indices marking segment starts.
        output_folder: Destination directory; created if missing.
        file_name: Name of the source file; its stem prefixes the outputs.
        hop_length: Hop size (in samples) used to convert frames to samples.
            Must match the hop size the peaks were detected with.

    Returns:
        Tuple ``(segments, count, durations)`` where ``segments`` is a list of
        ``(start_sample, end_sample)`` integer pairs, ``count`` is its length
        and ``durations`` holds each segment's length in seconds.
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

    stem = os.path.splitext(file_name)[0]
    total_samples = len(audio)

    # Plain ints keep the returned tuples and durations free of NumPy scalars.
    starts = [int(s) for s in librosa.frames_to_samples(peaks, hop_length=hop_length)]

    segments = []
    segment_durations = []

    # Appending the audio length as a closing boundary lets one loop handle
    # both the inner segments and the final one; with no peaks the zip is empty.
    boundaries = starts + [total_samples]
    for index, (start_sample, end_sample) in enumerate(zip(boundaries, boundaries[1:]), start=1):
        output_path = os.path.join(output_folder, f"{stem}_segment_{index}.wav")
        sf.write(output_path, audio[start_sample:end_sample], sr)
        segments.append((start_sample, end_sample))
        segment_durations.append(float((end_sample - start_sample) / sr))

    return segments, len(segments), segment_durations

In [9]:
def process_folder_with_feature_based_segmentation(input_folder, output_folder, threshold_ratio=0.1, hop_length=512, n_mfcc=13):
    """Segment every ``.wav`` file in ``input_folder`` and save the pieces.

    For each file the audio is loaded at its native sampling rate, peaks are
    detected with ``feature_based_segmentation`` and the resulting segments are
    written to ``output_folder`` by ``save_audio_segments_based_on_peaks``.
    Progress is reported with ``print``.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory receiving the segment files; created if missing.
        threshold_ratio: Forwarded to ``feature_based_segmentation``.
        hop_length: Hop size in samples, used both for peak detection and for
            converting the detected frames back to sample positions.
        n_mfcc: Forwarded to ``feature_based_segmentation``.
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure output folder exists
    print(input_folder)

    # os.listdir order is arbitrary; sorting makes the run order reproducible.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith(".wav"):
            continue

        file_path = os.path.join(input_folder, file_name)
        audio, sr = librosa.load(file_path, sr=None)  # sr=None keeps the native rate

        peaks = feature_based_segmentation(
            audio,
            sr,
            threshold_ratio=threshold_ratio,
            hop_length=hop_length,
            n_mfcc=n_mfcc,
        )
        print(f"Peaks for {file_name}: {peaks}")

        # The same hop_length must be used here, otherwise the frame indices
        # are converted to the wrong sample positions.
        segments, ans, segment_durations = save_audio_segments_based_on_peaks(
            audio, sr, peaks, output_folder, file_name, hop_length=hop_length
        )
        print(f"File: {file_name} - Segments: {ans}")
        print(f"Segment Durations for {file_name}: {segment_durations}")

In [10]:


def read_folders(directory_path):
    """Return the paths of the immediate subdirectories of ``directory_path``.

    Each returned path is ``directory_path`` joined with the entry name, in the
    order produced by ``os.listdir``. Entries that are not directories are
    skipped.
    """
    candidate_paths = (
        os.path.join(directory_path, name) for name in os.listdir(directory_path)
    )
    return [path for path in candidate_paths if os.path.isdir(path)]

In [11]:

# Root directory whose immediate subfolders are collected for processing.
directory_to_scan = "/kaggle/working/segments"

# Discovered subfolders, followed by one extra dataset folder added by hand.
folders = read_folders(directory_to_scan) + ["/kaggle/input/dataset-nlp/Dataset/Hindi"]
print("Folders in the directory:", folders)

Folders in the directory: ['/kaggle/working/segments/0RmUBH81UE6b1jFc', '/kaggle/working/segments/0V7yPA2pAqrJrW7Y', '/kaggle/working/segments/0a8GyoWn0KRgXxTx', '/kaggle/working/segments/0jdhCdy6wPFMjRXl', '/kaggle/working/segments/traffic_0000', '/kaggle/working/segments/education_0002', '/kaggle/working/segments/0ZCBRzDx7vigtAuy', '/kaggle/working/segments/education_0003', '/kaggle/working/segments/0RDwoEVIUAhVPibN', '/kaggle/working/segments/0ypaM0qtYiXXTroM', '/kaggle/working/segments/education_0001', '/kaggle/input/dataset-nlp/Dataset/Hindi']


In [12]:


# Run the segmentation pipeline on every entry of `folders`.
# (No subset slicing is applied here: the loop iterates the whole list.)
for folder in folders:
    input_folder = folder
    # The last path component names the output folder, prefixed with "L2_",
    # under /kaggle/working/L2segments.
    output_folder_name = os.path.basename(folder)
    output_folder = os.path.join("/kaggle/working/L2segments", f"L2_{output_folder_name}")
    
    process_folder_with_feature_based_segmentation(input_folder, output_folder)

/kaggle/working/segments/0RmUBH81UE6b1jFc
Peaks for segment_9.wav: [  4   8  11  28  35  41  46  50  52  56  65  74  80  85  89  96 101 134
 138 147 153 162 167 174 180 187 236 243 248]
File: segment_9.wav - Segments: 29
Segment Durations for segment_9.wav: [0.128, 0.096, 0.544, 0.224, 0.192, 0.16, 0.128, 0.064, 0.128, 0.288, 0.288, 0.192, 0.16, 0.128, 0.224, 0.16, 1.056, 0.128, 0.288, 0.192, 0.288, 0.16, 0.224, 0.192, 0.224, 1.568, 0.224, 0.16, 0.193]
Peaks for segment_39.wav: [  4  17  23  31  39  44  61  67  79  85  89  96 102 105 110 132 137 141
 149 155 160]
File: segment_39.wav - Segments: 21
Segment Durations for segment_39.wav: [0.416, 0.192, 0.256, 0.256, 0.16, 0.544, 0.192, 0.384, 0.192, 0.128, 0.224, 0.192, 0.096, 0.16, 0.704, 0.16, 0.128, 0.256, 0.192, 0.16, 0.136]
Peaks for segment_43.wav: [  4   8  15  22  27  31  36  42  46  56  64  71  95 108 114 118 127]
File: segment_43.wav - Segments: 17
Segment Durations for segment_43.wav: [0.128, 0.224, 0.224, 0.16, 0.128, 0.16, 0