In [2]:
%pip install numpy pydub librosa soundfile

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\aksha\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [4]:
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
import numpy as np
import librosa

In [5]:
def denoise_audio(y, sr):
    stft = np.abs(librosa.stft(y))
    noise_thresh = np.mean(stft, axis=1)
    stft_denoised = np.where(stft < noise_thresh[:, None], 0, stft)
    y_denoised = librosa.istft(stft_denoised)
    return y_denoised

In [6]:
def resample_audio(y, sr, target_sr=16000):
    return librosa.resample(y, orig_sr=sr, target_sr=target_sr), target_sr

In [7]:
def segment_audio_with_parameters(input_file, min_silence_len, silence_thresh, output_dir='test_segments'):
    y, sr = librosa.load(input_file, sr=None)
    y_denoised = denoise_audio(y, sr)
    y_resampled, target_sr = resample_audio(y_denoised, sr)
    audio = AudioSegment(
        y_resampled.tobytes(), 
        frame_rate=target_sr, 
        sample_width=y_resampled.dtype.itemsize, 
        channels=1
    )

    # Split audio based on specified silence parameters
    segments = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh
    )

    # Create a folder based on the input file name within the output directory
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_folder = os.path.join(output_dir, base_name)
    os.makedirs(output_folder, exist_ok=True)

    # Save each segment to the folder
    segment_durations = []
    for i, segment in enumerate(segments):
        segment_duration = segment.duration_seconds
        segment_durations.append(segment_duration)

        # Define the output path for each segment
        segment_path = os.path.join(output_folder, f"{base_name}_segment_{i+1}.wav")
        segment.export(segment_path, format="wav")

    # Collect segment info
    results = {
        'file': input_file,
        'min_silence_len': min_silence_len,
        'silence_thresh': silence_thresh,
        'num_segments': len(segments),
        'segment_durations': segment_durations,
        'output_folder': output_folder
    }

    print(f"Processed {input_file} with min_silence_len={min_silence_len}, "
          f"silence_thresh={silence_thresh}: Total Segments = {len(segments)}")
    print(f"Segments saved in {output_folder}")

    return results

In [9]:
input_file = "Dataset/English/traffic_0000.wav"
min_silence_len = 100   
silence_thresh = -7    
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio_with_parameters(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/English/traffic_0000.wav with min_silence_len=100, silence_thresh=-7: Total Segments = 24
Segments saved in segmented_files_output_latest\traffic_0000
{'file': 'Dataset/English/traffic_0000.wav', 'min_silence_len': 100, 'silence_thresh': -7, 'num_segments': 24, 'segment_durations': [17.484, 104.485, 43.925, 91.214, 20.894, 6.962, 29.707, 0.426, 0.273, 18.12, 18.935, 14.363, 1.445, 1.702, 8.903, 31.222, 33.246, 4.802, 8.988, 63.137, 5.137, 63.468, 13.091, 0.777], 'output_folder': 'segmented_files_output_latest\\traffic_0000'}


In [10]:
input_file = "Dataset/English/education_0003.wav"
min_silence_len = 500   
silence_thresh = -10   
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio_with_parameters(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/English/education_0003.wav with min_silence_len=500, silence_thresh=-10: Total Segments = 105
Segments saved in segmented_files_output_latest\education_0003
{'file': 'Dataset/English/education_0003.wav', 'min_silence_len': 500, 'silence_thresh': -10, 'num_segments': 105, 'segment_durations': [6.683, 14.524, 35.673, 3.358, 6.461, 3.26, 7.998, 5.598, 2.303, 2.715, 5.372, 9.596, 169.531, 2.746, 4.991, 5.212, 2.878, 3.327, 4.926, 90.844, 8.926, 13.34, 5.117, 2.942, 3.452, 4.766, 9.243, 5.216, 3.55, 2.429, 3.101, 5.054, 5.499, 6.014, 7.996, 1.95, 7.102, 10.044, 9.692, 18.367, 8.349, 7.517, 5.532, 9.499, 2.974, 1.407, 5.821, 3.998, 4.349, 6.909, 9.148, 8.923, 8.411, 11.58, 3.358, 12.859, 11.838, 7.133, 3.676, 3.647, 4.699, 6.141, 10.653, 9.118, 9.82, 1.722, 3.136, 0.605, 1.309, 1.725, 0.923, 2.271, 4.159, 4.605, 8.029, 9.949, 0.733, 7.195, 5.34, 3.805, 9.406, 7.357, 8.321, 10.332, 8.061, 7.004, 12.541, 7.998, 8.126, 2.878, 4.605, 1.66, 5.79, 6.141, 5.215, 7.776, 7.198, 12.3

In [11]:
input_file = "Dataset/English/education_0002.wav"
min_silence_len = 500   
silence_thresh = -10   
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio_with_parameters(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/English/education_0002.wav with min_silence_len=500, silence_thresh=-10: Total Segments = 117
Segments saved in segmented_files_output_latest\education_0002
{'file': 'Dataset/English/education_0002.wav', 'min_silence_len': 500, 'silence_thresh': -10, 'num_segments': 117, 'segment_durations': [1.177, 3.29, 0.22, 6.61, 4.817, 2.916, 3.365, 1.047, 29.562, 1.342, 17.638, 9.346, 5.11, 3.486, 6.429, 6.561, 16.402, 6.071, 9.373, 2.545, 6.327, 1.088, 4.019, 3.206, 1.738, 8.991, 4.927, 15.823, 2.213, 0.652, 0.221, 1.911, 0.669, 1.854, 9.656, 18.912, 9.476, 1.59, 0.664, 5.982, 12.475, 4.987, 2.311, 9.511, 13.465, 13.029, 9.806, 0.536, 20.893, 5.294, 13.477, 4.512, 2.407, 3.193, 3.079, 1.706, 5.565, 4.902, 12.171, 18.979, 15.032, 9.503, 0.274, 7.686, 24.402, 14.57, 5.852, 2.635, 2.393, 3.645, 1.5, 6.759, 5.532, 0.279, 2.365, 10.406, 3.333, 6.552, 7.116, 6.583, 2.749, 5.462, 0.545, 4.182, 3.896, 2.095, 10.297, 16.128, 0.769, 5.224, 6.365, 2.547, 5.555, 0.861, 33.674, 1.985, 7.746

In [13]:
input_file = "Dataset/English/education_0001.wav"
min_silence_len = 100   
silence_thresh = -10   
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio_with_parameters(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/English/education_0001.wav with min_silence_len=100, silence_thresh=-10: Total Segments = 106
Segments saved in segmented_files_output_latest\education_0001
{'file': 'Dataset/English/education_0001.wav', 'min_silence_len': 100, 'silence_thresh': -10, 'num_segments': 106, 'segment_durations': [6.941, 0.25, 0.861, 0.285, 0.314, 0.973, 0.208, 2.717, 1.338, 4.893, 4.797, 3.453, 2.975, 0.414, 0.347, 6.811, 2.906, 2.81, 0.301, 0.557, 11.037, 2.238, 0.252, 2.113, 0.541, 0.282, 0.253, 0.24, 21.12, 49.295, 95.457, 27.775, 3.169, 29.629, 0.621, 29.648, 24.637, 28.205, 15.949, 17.644, 18.766, 17.982, 3.293, 17.946, 65.755, 0.333, 13.968, 8.64, 16.144, 1.52, 26.08, 19.968, 34.112, 38.48, 0.685, 8.637, 1.466, 23.725, 4.653, 42.874, 0.282, 28.986, 13.692, 34.253, 4.877, 5.15, 0.336, 8.107, 1.53, 4.026, 11.194, 16.142, 19.501, 10.093, 0.72, 1.6, 1.184, 54.88, 32.637, 11.483, 1.786, 12.509, 13.071, 13.007, 43.18, 3.79, 1.837, 7.917, 2.412, 4.145, 29.408, 0.256, 4.957, 19.994, 18.669,

In [10]:
#!zip -r L1Segmentation.zip /kaggle/working/segmented_files_output_latest

# Code Mixed

In [14]:
import os
import numpy as np
import librosa
from pydub import AudioSegment
from pydub.silence import split_on_silence

def segment_audio(input_file, min_silence_len=2000, silence_thresh=-30, keep_silence=200, output_dir='segments'):
    # Load audio file
    audio = AudioSegment.from_file(input_file)
    
    # Split audio based on silence
    segments = split_on_silence(
        audio,
        min_silence_len=min_silence_len,  # Minimum silence length in ms
        silence_thresh=silence_thresh       # Silence threshold in dBFS (decibels relative to full scale)
    )
    
    # Create output directory for the file if not exists
    file_name = os.path.splitext(os.path.basename(input_file))[0]
    file_output_dir = os.path.join(output_dir, file_name)
    if not os.path.exists(file_output_dir):
        os.makedirs(file_output_dir)
    
    # Export each segment as a separate file after denoising
    for i, segment in enumerate(segments):
        # Convert the segment to a numpy array for denoising
        y = np.array(segment.get_array_of_samples()).astype(np.float32)  # Ensure it's float32
        y /= np.max(np.abs(y))  # Normalize to range [-1.0, 1.0]
        sr = segment.frame_rate
        
        # Denoise the audio segment
        y_denoised = denoise_audio(y, sr)
        
        # Convert back to AudioSegment
        denoised_segment = AudioSegment(
            (y_denoised * 32767).astype(np.int16).tobytes(),  # Convert back to int16 for AudioSegment
            frame_rate=sr,
            sample_width=2,  # 2 bytes for int16
            channels=segment.channels
        )
        
        # Export the denoised segment
        denoised_segment.export(os.path.join(file_output_dir, f'segment_{i+1}.wav'), format="wav")
    
    print(f"Processed {input_file}: Total Segments = {len(segments)}")
    return len(segments), [seg.duration_seconds for seg in segments]

def denoise_audio(y, sr):
    stft = np.abs(librosa.stft(y))
    noise_thresh = np.mean(stft, axis=1)
    stft_denoised = np.where(stft < noise_thresh[:, None], 0, stft)
    y_denoised = librosa.istft(stft_denoised)
    
    return y_denoised.astype(np.float32)  # Ensure the output is in float32 format


In [15]:
input_file = "Dataset/CodeMixed/0RDwoEVIUAhVPibN.wav"
min_silence_len = 2000 
silence_thresh = -30
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/CodeMixed/0RDwoEVIUAhVPibN.wav: Total Segments = 60
(60, [52.96, 5.58, 2.491, 7.22, 4.609, 2.064, 7.665, 0.942, 1.141, 4.506, 7.376, 3.653, 2.061, 27.403, 4.737, 0.75, 3.492, 10.9, 2.811, 0.535, 7.461, 4.997, 2.253, 12.266, 3.016, 11.427, 4.562, 16.404, 8.099, 5.537, 1.377, 0.66, 8.965, 21.613, 3.687, 1.021, 3.74, 0.749, 4.416, 7.709, 7.798, 0.883, 1.289, 2.811, 0.203, 2.673, 0.795, 2.925, 0.255, 11.687, 2.233, 19.511, 1.437, 9.064, 9.299, 16.273, 1.696, 2.035, 2.748, 28.601])


In [16]:
input_file = "Dataset/CodeMixed/0RmUBH81UE6b1jFc.wav"
min_silence_len = 2000 
silence_thresh = -30
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/CodeMixed/0RmUBH81UE6b1jFc.wav: Total Segments = 49
(49, [4.073, 6.885, 9.249, 9.764, 12.744, 5.498, 12.776, 4.065, 3.539, 8.313, 7.09, 33.422, 11.402, 7.788, 9.61, 5.672, 5.738, 5.976, 4.804, 5.071, 2.051, 0.95, 1.971, 13.313, 6.769, 6.775, 3.171, 10.562, 8.394, 2.514, 2.434, 15.449, 1.662, 5.144, 4.143, 16.679, 4.738, 4.13, 18.463, 15.299, 7.32, 7.383, 15.64, 2.142, 17.401, 30.831, 3.543, 3.997, 2.234])


In [17]:
input_file = "Dataset/CodeMixed/0V7yPA2pAqrJrW7Y.wav"
min_silence_len = 2000 
silence_thresh = -30
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/CodeMixed/0V7yPA2pAqrJrW7Y.wav: Total Segments = 13
(13, [32.859, 14.16, 2.504, 69.91, 66.942, 3.894, 4.965, 9.012, 8.905, 16.764, 3.911, 34.262, 51.858])


In [18]:
input_file = "Dataset/CodeMixed/0ZCBRzDx7vigtAuy.wav"
min_silence_len = 2000 
silence_thresh = -30 
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/CodeMixed/0ZCBRzDx7vigtAuy.wav: Total Segments = 18
(18, [159.107, 125.04, 18.264, 7.544, 18.501, 12.391, 14.419, 15.513, 1.895, 41.507, 3.617, 9.672, 4.36, 3.116, 12.944, 42.881, 76.549, 41.909])


In [20]:
input_file = "Dataset/CodeMixed/0a8GyoWn0KRgXxTx.wav"
min_silence_len = 2000 
silence_thresh = -30 
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/CodeMixed/0a8GyoWn0KRgXxTx.wav: Total Segments = 16
(16, [13.782, 10.985, 24.686, 13.851, 35.902, 4.553, 29.027, 4.431, 5.671, 22.588, 47.573, 29.787, 44.246, 10.181, 45.465, 45.762])


In [21]:
input_file = "Dataset/CodeMixed/0jdhCdy6wPFMjRXl.wav"
min_silence_len = 2000 
silence_thresh = -30 
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/CodeMixed/0jdhCdy6wPFMjRXl.wav: Total Segments = 25
(25, [41.569, 26.475, 81.572, 55.052, 26.5, 30.233, 10.076, 9.041, 2.432, 45.366, 8.671, 26.455, 9.021, 11.081, 17.195, 27.428, 11.065, 2.197, 43.649, 25.362, 1.459, 5.927, 21.6, 2.151, 5.994])


In [22]:
input_file = "Dataset/CodeMixed/0ypaM0qtYiXXTroM.wav"
min_silence_len = 2000 
silence_thresh = -30 
output_dir = 'segmented_files_output_latest'


single_file_results = segment_audio(input_file, min_silence_len, silence_thresh, output_dir=output_dir)


print(single_file_results)

Processed Dataset/CodeMixed/0ypaM0qtYiXXTroM.wav: Total Segments = 10
(10, [3.812, 0.338, 4.621, 0.411, 0.881, 1.263, 0.212, 2.93, 2.806, 0.209])


In [19]:
!zip -r L1Segmentation-Nov.zip /kaggle/working/segmented_files_output_latest

  adding: kaggle/working/segmented_files_output_latest/ (stored 0%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/ (stored 0%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/segment_18.wav (deflated 31%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/segment_31.wav (deflated 15%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/segment_54.wav (deflated 28%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/segment_47.wav (deflated 19%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/segment_15.wav (deflated 24%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/segment_60.wav (deflated 24%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/segment_58.wav (deflated 21%)
  adding: kaggle/working/segmented_files_output_latest/0RDwoEVIUAhVPibN/segment_5.wav (deflated 32%)
  adding: kaggle/working/segment