In [1]:
pip install numpy pydub librosa soundfile 

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
import numpy as np
import librosa

In [2]:
def read_dataset_folder(dataset_folder):
    audio_files = {'English': [], 'Hindi': [], 'CodeMixed': []}

    for language in ['English', 'Hindi', 'CodeMixed']:
        language_folder = os.path.join(dataset_folder, language)
        for file_name in os.listdir(language_folder):
            if file_name.endswith('.wav'):
                file_path = os.path.join(language_folder, file_name)
                if language == 'English':
                    audio_files['English'].append(file_path)
                elif language == 'Hindi':
                    audio_files['Hindi'].append(file_path)
                elif language == 'CodeMixed':
                    audio_files['CodeMixed'].append(file_path)
    return audio_files

In [3]:
def denoise_audio(y, sr):
    stft = np.abs(librosa.stft(y))
    noise_thresh = np.mean(stft, axis=1)
    stft_denoised = np.where(stft < noise_thresh[:, None], 0, stft)
    y_denoised = librosa.istft(stft_denoised)
    return y_denoised

In [4]:
def resample_audio(y, sr, target_sr=16000):

    return librosa.resample(y, orig_sr=sr, target_sr=target_sr), target_sr


In [5]:
def segment_audio(input_file, min_silence_len=2000, silence_thresh=-30,keep_silence=200, output_dir='segments'):
    
    
    y, sr = librosa.load(input_file, sr=None)
    
    
    y_denoised = denoise_audio(y, sr)
    
   
    y_resampled, target_sr = resample_audio(y_denoised, sr)
    
    
    audio = AudioSegment(
        y_resampled.tobytes(), 
        frame_rate=target_sr, 
        sample_width=y_resampled.dtype.itemsize, 
        channels=1
    )

    
    
    # Split audio based on silence
    segments = split_on_silence(
        audio,
        min_silence_len=min_silence_len,  # Minimum silence length in ms
        silence_thresh=silence_thresh     # Silence threshold in dBFS (decibels relative to full scale)
    )
    
    # Create output directory for the file if not exists
    file_name = os.path.splitext(os.path.basename(input_file))[0]
    file_output_dir = os.path.join(output_dir, file_name)
    if not os.path.exists(file_output_dir):
        os.makedirs(file_output_dir)
    
    # Export each segment as a separate file
    for i, segment in enumerate(segments):
        segment.export(os.path.join(file_output_dir, f'segment_{i+1}.wav'), format="wav")
    
    print(f"Processed {input_file}: Total Segments = {len(segments)}")
    
    return len(segments), [seg.duration_seconds for seg in segments]




In [6]:
## # Function to segment all audio files in a folder with different parameters for English and CodeMixed
def segment_all_files_in_folder(folder_path, output_dir='segments'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Get audio files
    audio_files = read_dataset_folder(folder_path)
    
    # Process each category with custom parameters
    for category in ['English', 'CodeMixed']:
        print(f"\nProcessing {category} Audio Files...")
        if category == 'English':
            min_silence_len = 500
            silence_thresh = -10
        elif category == 'CodeMixed':
            min_silence_len = 650
            silence_thresh = -10
            
        for file_path in audio_files[category]:
            ans = segment_audio(file_path, min_silence_len, silence_thresh, output_dir)
            print(ans)
    
    print(f"All files in {folder_path} processed and saved to {output_dir}")

# Define output directory and call the function to process files
output_dir = 'segmented_files_output'
folder_path = "/kaggle/input/dataset-nlp/Dataset"  
segment_all_files_in_folder(folder_path, output_dir=output_dir)



Processing English Audio Files...
Processed /kaggle/input/dataset-nlp/Dataset/English/education_0003.wav: Total Segments = 105
(105, [6.683, 14.524, 35.673, 3.358, 6.461, 3.26, 7.998, 5.598, 2.303, 2.715, 5.372, 9.596, 169.531, 2.746, 4.991, 5.212, 2.878, 3.327, 4.926, 90.844, 8.926, 13.34, 5.117, 2.942, 3.452, 4.766, 9.243, 5.216, 3.55, 2.429, 3.101, 5.054, 5.499, 6.014, 7.996, 1.95, 7.102, 10.044, 9.692, 18.367, 8.349, 7.517, 5.532, 9.499, 2.974, 1.407, 5.821, 3.998, 4.349, 6.909, 9.148, 8.923, 8.411, 11.58, 3.358, 12.859, 11.838, 7.133, 3.676, 3.647, 4.699, 6.141, 10.653, 9.118, 9.82, 1.722, 3.136, 0.605, 1.309, 1.725, 0.923, 2.271, 4.159, 4.605, 8.029, 9.949, 0.733, 7.195, 5.34, 3.805, 9.406, 7.357, 8.321, 10.332, 8.061, 7.004, 12.541, 7.998, 8.126, 2.878, 4.605, 1.66, 5.79, 6.141, 5.215, 7.776, 7.198, 12.316, 5.82, 8.158, 10.588, 11.133, 4.028, 7.357, 3.518])
Processed /kaggle/input/dataset-nlp/Dataset/English/traffic_0000.wav: Total Segments = 2
(2, [17.249, 585.344])
Processed 

In [7]:
!zip -r L1Segmentation-Final-8Oct.zip /kaggle/working/segments

  adding: kaggle/working/segments/ (stored 0%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/ (stored 0%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_16.wav (deflated 20%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_18.wav (deflated 43%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_10.wav (deflated 28%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_8.wav (deflated 15%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_68.wav (deflated 30%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_66.wav (deflated 15%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_13.wav (deflated 16%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_9.wav (deflated 12%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_69.wav (deflated 27%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_50.wav (deflated 16%)
  adding: kaggle/working/segments/0RDwoEVIUAhVPibN/segment_17.wav (deflated 15%)
