In [4]:
import os
import glob
import soundfile as sf
import numpy as np
from scipy.signal import butter, lfilter

def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

def pre_emphasis(signal, alpha=0.1):
    pre_emphasized_signal = lfilter([1, -alpha], [1], signal)
    return pre_emphasized_signal

# Parameters for digitization and endpoint detection
lowcut = 300.0
highcut = 3400.0
fs = 8000
n_bits = 16
high_threshold = 300  # Adjust based on your audio characteristics
low_threshold = 3400  # Adjust based on your audio characteristics

# Path to the directory containing folders with .wav files
root_path = "d:\Sistema\Escritorio\Escritorio\Tesis\DAIC-WOZ\data"

# Process each folder and .wav file
for folder_name in os.listdir(root_path):
    folder_path = os.path.join(root_path, folder_name)
    if os.path.isdir(folder_path):
        print(folder_path)
        # Find all .wav files in the current folder
        for wav_file_path in glob.glob(os.path.join(folder_path, '*.wav')):
            # Read the audio signal using soundfile (for efficiency)
            audio_signal, original_fs = sf.read(wav_file_path)

            # Pre-filtering
            filtered_signal = butter_bandpass_filter(audio_signal, lowcut, highcut, original_fs)

            # Sampling (downsampling if necessary)
            if original_fs > fs:
                sampled_signal = filtered_signal[::int(original_fs / fs)]
            else:
                sampled_signal = filtered_signal

            # Pre-emphasis
            pre_emphasized_signal = pre_emphasis(sampled_signal)  # Apply pre-emphasis

            # Initialize variables for endpoint detection
            in_speech_segment = False
            speech_segments = []
            discarded_segments = []

            # Window framing with endpoint detection (without Hamming window)
            window_length = 25  # Example window length (adjust as needed)
            framed_signal = np.zeros((len(pre_emphasized_signal) // window_length, window_length))  # Pre-allocate array

            for i in range(0, len(pre_emphasized_signal) - window_length + 1, window_length):
                start = i
                end = i + window_length

                # Calculate short-time energy (using raw frame, no windowing)
                windowed_frame = pre_emphasized_signal[start:end]
                energy = np.sum(windowed_frame**2)
                print(energy)
                if energy > high_threshold and not in_speech_segment:
                    in_speech_segment = True
                    speech_segment_start = start
                elif in_speech_segment and energy < low_threshold:
                    in_speech_segment = False
                    speech_segments.append([speech_segment_start, end])
                else:  # Handle frames that don't meet speech or silence criteria
                    discarded_segments.append([start, end])

                # Framing (only for speech segments)
                if in_speech_segment:
                    framed_signal[i // window_length] = windowed_frame

            # Quantization and coding
            quantized_signal = np.zeros_like(framed_signal, dtype=np.int16)
            for i, (segment_start, segment_end) in enumerate(speech_segments):
                quantized_signal[i] = np.int16(pre_emphasized_signal[segment_start:segment_end] * (2**(n_bits - 1) - 1))

            # Join the processed speech segments
            joined_speech = np.zeros(0, dtype=np.int16)  # Initialize an empty array for the joined speech
            for quantized_segment in quantized_signal:
                if np.any(quantized_segment):  # Check if the segment contains non-zero values
                    joined_speech = np.concatenate((joined_speech, quantized_segment))

            # Extract the audio file name without extension
            audio_file_name = os.path.splitext(os.path.basename(wav_file_path))[0]

            # Save the joined speech
            final_path = os.path.join(folder_path, f'preprocessed_{audio_file_name}.wav')
            sf.write(final_path, joined_speech, fs)

            # Save discarded segments
            discarded_segments_audio = []
            for discarded_segment in discarded_segments:
                start, end = discarded_segment
                discarded_audio = pre_emphasized_signal[start:end]  # Extract discarded segment
                discarded_segments_audio.append(discarded_audio)

            # Concatenate all discarded segments
            concatenated_discarded_audio = np.concatenate(discarded_segments_audio)

            # Construct discarded file name
            discarded_file_name = os.path.join(folder_path, f'preprocessed_{audio_file_name}_discarded.wav')

            # Save concatenated discarded audio
            sf.write(discarded_file_name, concatenated_discarded_audio, samplerate=fs)


d:\Sistema\Escritorio\Escritorio\Tesis\DAIC-WOZ\data\300_P
1.4548087942156614e-05
1.0581798979656059e-05
1.672741171517962e-05
1.556031969642921e-05
3.8125829060941404e-05
7.288590222735738e-06
4.918430996547883e-05
6.878726392208649e-05
0.00014096995469624628
3.777105836958023e-05
1.159034217483706e-05
0.00024447683407640016
4.245642262484166e-05
5.247465735391928e-05
0.0003409952103999311
0.00024108716621791632
8.789807319973859e-05
1.5878153769653246e-05
2.8949348520014877e-05
3.727605776161101e-05
4.130356351263134e-05
0.00010098860811382234
4.515276165060984e-05
0.0004119137788453539
5.0345184596475224e-05
5.0376440307555715e-05
0.00033118271763491977
4.6238922379043384e-05
0.00023270237086203802
3.972428368785123e-05
0.00013800608540689117
0.0001046851720281946
7.964115904760567e-05
2.2994898133496875e-05
2.5602521870389554e-05
2.375950276568719e-05
3.731292116127852e-05
1.4300114059163925e-05
1.7682391297102523e-05
2.1703736253578847e-05
3.353943137629419e-05
1.6645256070589687e


KeyboardInterrupt

