In [1]:
%pip install numpy pydub librosa soundfile 

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\aksha\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [2]:
import os
import numpy as np
import librosa
import soundfile as sf
from scipy.signal import find_peaks

In [3]:
def extract_mfccs(audio, sr, n_mfcc=20):
    """Compute MFCCs for an audio signal via ``librosa.feature.mfcc``.

    Args:
        audio: Audio samples, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        n_mfcc: Number of coefficients to request (default 20).

    Returns:
        The value returned by ``librosa.feature.mfcc`` for these arguments.
    """
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

def extract_pitch(audio, sr, hop_length=512):
    """Return one pitch value per frame from ``librosa.core.piptrack``.

    Args:
        audio: Audio samples, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        hop_length: Hop between analysis frames, in samples (default 512).

    Returns:
        The maximum of piptrack's pitch matrix taken along axis 0, i.e. the
        largest pitch candidate in each column. The magnitudes that piptrack
        also returns are ignored, so this is the highest candidate rather than
        the strongest one.
        NOTE(review): presumably axis 0 indexes frequency bins and axis 1
        indexes frames; confirm against the librosa documentation.
    """
    pitch_candidates, _magnitudes = librosa.core.piptrack(
        y=audio, sr=sr, hop_length=hop_length
    )
    # Collapse axis 0 so only the largest candidate per column remains.
    return np.max(pitch_candidates, axis=0)

def compute_energy(audio, frame_length=1024, hop_length=512):
    """Compute the frame-wise RMS energy of an audio signal.

    Args:
        audio: Audio samples, forwarded to ``librosa.feature.rms`` as ``y``.
        frame_length: Analysis window length in samples (default 1024).
        hop_length: Hop between successive frames in samples (default 512).

    Returns:
        Row 0 of the array returned by ``librosa.feature.rms``.
    """
    rms_matrix = librosa.feature.rms(
        y=audio, frame_length=frame_length, hop_length=hop_length
    )
    # Only the first row of the result is kept.
    return rms_matrix[0]

def extract_spectral_centroid(audio, sr, hop_length=512):
    """Compute the frame-wise spectral centroid of an audio signal.

    Args:
        audio: Audio samples, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        hop_length: Hop between successive frames in samples (default 512).

    Returns:
        Row 0 of the array returned by ``librosa.feature.spectral_centroid``.
    """
    centroid_matrix = librosa.feature.spectral_centroid(
        y=audio, sr=sr, hop_length=hop_length
    )
    return centroid_matrix[0]

def extract_spectral_bandwidth(audio, sr, hop_length=512):
    """Compute the frame-wise spectral bandwidth of an audio signal.

    Args:
        audio: Audio samples, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        hop_length: Hop between successive frames in samples (default 512).

    Returns:
        Row 0 of the array returned by ``librosa.feature.spectral_bandwidth``.
    """
    bandwidth_matrix = librosa.feature.spectral_bandwidth(
        y=audio, sr=sr, hop_length=hop_length
    )
    return bandwidth_matrix[0]

def extract_zero_crossing_rate(audio, frame_length=1024, hop_length=512):
    """Compute the frame-wise zero-crossing rate of an audio signal.

    Args:
        audio: Audio samples, forwarded to librosa as ``y``.
        frame_length: Analysis window length in samples (default 1024).
        hop_length: Hop between successive frames in samples (default 512).

    Returns:
        Row 0 of the array returned by ``librosa.feature.zero_crossing_rate``.
    """
    zcr_matrix = librosa.feature.zero_crossing_rate(
        y=audio, frame_length=frame_length, hop_length=hop_length
    )
    return zcr_matrix[0]

In [4]:
def feature_based_segmentation(audio, sr, threshold_ratio=0.15, hop_length=256, n_mfcc=20):
    """Find candidate segment boundaries as peaks of the RMS-energy envelope.

    The previous implementation also computed pitch, spectral centroid and
    spectral bandwidth and blended them with energy, but then overwrote the
    blend with energy alone. That extra work was discarded, and it raised a
    shape error whenever ``hop_length`` differed from the fixed hop of 512 used
    for energy. Only energy is computed now, using the caller's ``hop_length``.

    Args:
        audio: Audio samples.
        sr: Sampling rate. Unused by the energy-only detector; kept so existing
            callers keep working.
        threshold_ratio: A peak is kept only if its height is at least
            ``threshold_ratio * max(energy)``.
        hop_length: Hop between energy frames, in samples. The returned indices
            are frame numbers at this hop, so convert them to samples with the
            same ``hop_length``.
        n_mfcc: Unused; kept for interface compatibility.

    Returns:
        1-D integer array of frame indices at which the energy envelope peaks
        above the threshold. Empty if there are no frames or no peaks.
    """
    energy = np.asarray(compute_energy(audio, hop_length=hop_length))

    # np.max raises on an empty array; with no frames there is nothing to segment.
    if energy.size == 0:
        return np.array([], dtype=int)

    # Threshold relative to the loudest frame of this particular signal.
    threshold = threshold_ratio * np.max(energy)
    peaks, _ = find_peaks(energy, height=threshold)
    return peaks

In [5]:
def save_audio_segments_based_on_peaks(audio, sr, peaks, output_folder, file_name, hop_length=512):
    """Cut ``audio`` at the given peak frames and write each piece to a WAV file.

    Every peak starts a segment that runs to the next peak; the last peak's
    segment runs to the end of ``audio``. Samples before the first peak are not
    written. Files are named ``<stem>_segment_<k>.wav`` with ``k`` counting
    from 1, where ``<stem>`` is ``file_name`` without its extension.

    Args:
        audio: Audio samples; sliced by sample index.
        sr: Sampling rate, used for durations and passed to ``sf.write``.
        peaks: Frame indices marking segment starts.
        output_folder: Directory for the segment files (created if missing).
        file_name: Name of the source file; its stem prefixes the output names.
        hop_length: Hop used to convert frame indices to sample offsets.

    Returns:
        Tuple ``(segments, count, durations, segment_list)``: the
        ``(start_sample, end_sample)`` pairs, how many there are, each
        segment's duration in seconds, and the sliced audio for each segment.
    """
    os.makedirs(output_folder, exist_ok=True)

    starts = librosa.frames_to_samples(peaks, hop_length=hop_length)
    n_starts = len(starts)
    total_samples = len(audio)

    bounds = []
    durations = []
    clips = []

    for idx in range(n_starts):
        begin = starts[idx]
        # The last segment has no following peak, so it extends to the end of the signal.
        finish = starts[idx + 1] if idx + 1 < n_starts else total_samples
        clip = audio[begin:finish]

        durations.append((finish - begin) / sr)
        target = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}_segment_{idx + 1}.wav")
        sf.write(target, clip, sr)
        bounds.append((begin, finish))
        clips.append(clip)

    return bounds, len(bounds), durations, clips

In [6]:
def process_folder_with_feature_based_segmentation(input_folder, output_folder, data, label, threshold_ratio=0.1, hop_length=512, n_mfcc=13):
    """Segment every WAV file in ``input_folder`` and record the pieces in ``data``.

    For each file this loads the audio at its native sampling rate, detects
    peak frames, writes the segments to ``output_folder`` and appends one
    ``{'segment_list': ..., 'label': label}`` dict to ``data``.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory for the segment files (created if missing).
        data: List mutated in place; one dict is appended per input file.
        label: Label stored with every appended dict.
        threshold_ratio: Forwarded to ``feature_based_segmentation``.
        hop_length: Frame hop in samples. It is used both for peak detection
            and for converting peak frames back to sample offsets, so the two
            steps always agree.
        n_mfcc: Forwarded to ``feature_based_segmentation``.

    Returns:
        None. Results are reported through ``data`` and the files written.
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure output folder exists
    print(input_folder)

    # os.listdir order is arbitrary; sort so rows are appended in a reproducible order.
    for file_name in sorted(os.listdir(input_folder)):
        # Match the extension case-insensitively so files such as "clip.WAV" are not skipped.
        if not file_name.lower().endswith(".wav"):
            continue

        file_path = os.path.join(input_folder, file_name)
        audio, sr = librosa.load(file_path, sr=None)  # sr=None keeps the file's own sampling rate

        peaks = feature_based_segmentation(audio, sr, threshold_ratio, hop_length, n_mfcc)
        print(f"Peaks for {file_name}: {peaks}")

        # Pass hop_length through: peaks are frame indices at this hop, and the
        # saver previously always converted them with its own default of 512.
        _, segment_count, segment_durations, segment_list = save_audio_segments_based_on_peaks(
            audio, sr, peaks, output_folder, file_name, hop_length=hop_length
        )
        print(f"File: {file_name} - Segments: {segment_count}")
        print(f"Segment Durations for {file_name}: {segment_durations}")
        data.append({'segment_list': segment_list, 'label': label})

In [7]:


def read_folders(directory_path):
    """List the immediate subdirectories of ``directory_path``.

    Args:
        directory_path: Directory whose entries are inspected (non-recursive).

    Returns:
        A list of paths, each formed by joining ``directory_path`` with an entry
        name, for every entry that is a directory. The order follows
        ``os.listdir``.
    """
    candidates = (os.path.join(directory_path, name) for name in os.listdir(directory_path))
    return [path for path in candidates if os.path.isdir(path)]

In [8]:

# Collect every subfolder of the first-level segmentation output.
directory_to_scan = "segmented_files_output_latest"
folders = read_folders(directory_to_scan)

# Append a specific folder manually. Build the path with os.path.join so the
# separator matches the platform instead of hard-coding a Windows backslash.
folders.append(os.path.join("Dataset", "Hindi"))
print("Folders in the directory:", folders)

Folders in the directory: ['segmented_files_output_latest\\0a8GyoWn0KRgXxTx', 'segmented_files_output_latest\\0jdhCdy6wPFMjRXl', 'segmented_files_output_latest\\0RDwoEVIUAhVPibN', 'segmented_files_output_latest\\0RmUBH81UE6b1jFc', 'segmented_files_output_latest\\0V7yPA2pAqrJrW7Y', 'segmented_files_output_latest\\0ypaM0qtYiXXTroM', 'segmented_files_output_latest\\0ZCBRzDx7vigtAuy', 'segmented_files_output_latest\\education_0001', 'segmented_files_output_latest\\education_0002', 'segmented_files_output_latest\\education_0003', 'segmented_files_output_latest\\traffic_0000', 'Dataset\\Hindi']


In [9]:
import pandas as pd

# Folder names (last path component) that hold English recordings. The folder
# named "Hindi" holds Hindi recordings; every other folder is treated as code-mixed.
ENGLISH_FOLDERS = {'education_0001', 'education_0002', 'education_0003', 'traffic_0000'}
HINDI_FOLDER = 'Hindi'

# Root directory under which the second-level ("L2") segments are written.
OUTPUT_ROOT = "/kaggle/working/L2segments"

# One dict per processed audio file: {'segment_list': [...], 'label': ...}.
data = []

for input_folder in folders:
    # Take the last path component whichever separator the path uses, so the
    # labelling works for both "a\\b" and "a/b" style paths.
    folder_name = input_folder.replace("\\", "/").rstrip("/").split("/")[-1]

    if folder_name in ENGLISH_FOLDERS:
        label = "English"
    elif folder_name == HINDI_FOLDER:
        label = "Hindi"
    else:
        label = "CodeMixed"

    print(folder_name, label)

    # Each input folder gets its own output folder, prefixed with "L2_".
    output_folder = os.path.join(OUTPUT_ROOT, f"L2_{folder_name}")

    process_folder_with_feature_based_segmentation(input_folder, output_folder, data, label)

df = pd.DataFrame(data)

0a8GyoWn0KRgXxTx CodeMixed
segmented_files_output_latest\0a8GyoWn0KRgXxTx
Peaks for segment_1.wav: [ 36  41  46  50  57  63  72  91  98 101 107 118 127 135 137 143 162 171
 177 190 198 202 209 217 222 230 253 268 276 281 305 311 319 331 336 342
 360 368 375 382 401 405 417 420 425]
File: segment_1.wav - Segments: 45
Segment Durations for segment_1.wav: [np.float64(0.16), np.float64(0.16), np.float64(0.128), np.float64(0.224), np.float64(0.192), np.float64(0.288), np.float64(0.608), np.float64(0.224), np.float64(0.096), np.float64(0.192), np.float64(0.352), np.float64(0.288), np.float64(0.256), np.float64(0.064), np.float64(0.192), np.float64(0.608), np.float64(0.288), np.float64(0.192), np.float64(0.416), np.float64(0.256), np.float64(0.128), np.float64(0.224), np.float64(0.256), np.float64(0.16), np.float64(0.256), np.float64(0.736), np.float64(0.48), np.float64(0.256), np.float64(0.16), np.float64(0.768), np.float64(0.192), np.float64(0.256), np.float64(0.384), np.float64(0.16), np.f

In [10]:
# Preview the first rows of the assembled DataFrame (columns: segment_list, label).
df.head()

Unnamed: 0,segment_list,label
0,"[[0.009155273, 0.0074768066, 0.008056641, 0.00...",CodeMixed
1,"[[0.056518555, 0.054992676, 0.05029297, 0.0455...",CodeMixed
2,"[[0.013763428, 0.0121154785, 0.010284424, 0.03...",CodeMixed
3,"[[0.0031738281, 0.010986328, -0.00045776367, -...",CodeMixed
4,"[[-0.0064086914, -0.009033203, -0.020233154, 0...",CodeMixed


In [11]:
# Print the list of segment arrays stored in the row with index label 0.
print(df['segment_list'][0])

[array([0.00915527, 0.00747681, 0.00805664, ..., 0.06759644, 0.07061768,
       0.04684448], dtype=float32), array([0.05505371, 0.04638672, 0.05987549, ..., 0.01217651, 0.01174927,
       0.02322388], dtype=float32), array([ 0.03720093,  0.03857422,  0.02456665, ..., -0.01205444,
       -0.00640869, -0.00537109], dtype=float32), array([-0.01974487, -0.01879883, -0.01971436, ...,  0.00210571,
        0.00430298,  0.00527954], dtype=float32), array([0.00952148, 0.01773071, 0.02359009, ..., 0.01266479, 0.0123291 ,
       0.01602173], dtype=float32), array([0.02056885, 0.04086304, 0.05123901, ..., 0.03149414, 0.02789307,
       0.02017212], dtype=float32), array([0.01766968, 0.02429199, 0.02871704, ..., 0.02313232, 0.02404785,
       0.04354858], dtype=float32), array([ 0.06265259,  0.0713501 ,  0.02890015, ..., -0.01242065,
       -0.01898193, -0.02230835], dtype=float32), array([-0.02252197, -0.02182007, -0.02185059, ..., -0.05651855,
       -0.05584717, -0.05703735], dtype=float32), arr

In [12]:
# Persist the DataFrame to "segmented_dataset.pkl" in the current working directory.
# NOTE(review): pickle is presumably chosen because each segment_list cell holds a list
# of numpy arrays; only load this file from a trusted source.
df.to_pickle("segmented_dataset.pkl")