In [1]:
import os
import numpy as np
import tensorflow as tf
from typing import List, Generator

In [2]:
# Feature parameters class
class FeatureParams:
    def __init__(self, sample_rate, window_size_ms, window_stride_ms, num_mel_bins, lower_frequency, upper_frequency, clip_duration_ms):
        self.sample_rate = sample_rate
        self.window_size_ms = window_size_ms
        self.window_stride_ms = window_stride_ms
        self.num_mel_bins = num_mel_bins
        self.lower_frequency = lower_frequency
        self.upper_frequency = upper_frequency
        self.clip_duration_ms = clip_duration_ms
        self.desired_samples = int(sample_rate * (clip_duration_ms / 1000.0))
        self.window_length_samples = int(sample_rate * (window_size_ms / 1000.0))
        self.window_step_samples = int(sample_rate * (window_stride_ms / 1000.0))
        self.fft_length = 2 ** int(np.ceil(np.log2(self.window_length_samples)))

In [3]:
# Audio processor class
class AudioProcessor:
    def __init__(self, params: FeatureParams):
        self.params = params

    def load_wav_file(self, filename: str) -> tf.Tensor:
        audio_binary = tf.io.read_file(filename)
        waveform, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)
        waveform = tf.squeeze(waveform, axis=-1)
        waveform = waveform[:self.params.desired_samples]
        zero_padding = tf.zeros(
            [self.params.desired_samples - tf.shape(waveform)[0]], dtype=tf.float32
        )
        waveform = tf.concat([waveform, zero_padding], 0)
        return waveform

    def process_waveform(self, waveform: tf.Tensor) -> tf.Tensor:
        frames = tf.signal.frame(
            waveform,
            self.params.window_length_samples,
            self.params.window_step_samples,
            pad_end=True,
        )
        window = tf.signal.hann_window(self.params.window_length_samples)
        windowed_frames = frames * window
        fft = tf.signal.rfft(windowed_frames, [self.params.fft_length])
        power_spectrum = tf.abs(fft) ** 2
        num_spectrogram_bins = self.params.fft_length // 2 + 1
        linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
            self.params.num_mel_bins,
            num_spectrogram_bins,
            self.params.sample_rate,
            self.params.lower_frequency,
            self.params.upper_frequency,
        )
        mel_spectrogram = tf.tensordot(power_spectrum, linear_to_mel_weight_matrix, 1)
        mel_spectrogram.set_shape(
            power_spectrum.shape[:-1].concatenate([self.params.num_mel_bins])
        )
        log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)

        expected_num_frames = (
            1
            + (self.params.desired_samples - self.params.window_length_samples)
            // self.params.window_step_samples
        )
        num_frames = tf.shape(log_mel_spectrogram)[0]
        num_padding_frames = expected_num_frames - num_frames
        num_padding_frames = tf.maximum(num_padding_frames, 0)
        log_mel_spectrogram = tf.pad(
            log_mel_spectrogram, [[0, num_padding_frames], [0, 0]], "CONSTANT"
        )
        log_mel_spectrogram = log_mel_spectrogram[:expected_num_frames, :]
        return log_mel_spectrogram

In [4]:
def split_background_noises(background_dir: str, params: FeatureParams) -> List[tf.Tensor]:
    processor = AudioProcessor(params)
    noise_chunks = []
    for filename in os.listdir(background_dir):
        if filename.endswith(".wav"):
            background_file = os.path.join(background_dir, filename)
            waveform = processor.load_wav_file(background_file)
            chunk_size = params.desired_samples
            total_samples = tf.shape(waveform)[0]
            num_chunks = int(total_samples // chunk_size)
            for i in range(num_chunks):
                start_idx = i * chunk_size
                end_idx = start_idx + chunk_size
                chunk = waveform[start_idx:end_idx]
                noise_chunks.append(chunk)
    return noise_chunks


In [5]:
def add_background_noise(waveform: tf.Tensor, noise: tf.Tensor, desired_snr_db: float) -> tf.Tensor:
    noise_power = tf.reduce_mean(noise ** 2)
    signal_power = tf.reduce_mean(waveform ** 2)
    snr_ratio = tf.pow(10.0, desired_snr_db / 10.0)
    scaling_factor = tf.sqrt(signal_power / (snr_ratio * noise_power))
    noisy_waveform = waveform + scaling_factor * noise
    return tf.clip_by_value(noisy_waveform, -1.0, 1.0)

In [6]:
def process_and_save_label(label, label_dir, processor, background_noises, params, output_dir, batch_size, batch_count):
    label_to_index = {"yes": 0, "no": 1, "background_noises": 2}
    if label not in label_to_index:
        raise ValueError(f"Label '{label}' not found in label_to_index mapping.")
    label_index = label_to_index[label]
    print(f"Processing label: {label}, label_index: {label_index}")
    batch_data = []
    batch_labels = []

    if label == "background_noises":
        for noise in background_noises:
            features = processor.process_waveform(noise)
            batch_data.append(features.numpy())
            batch_labels.append(label_index)
            if len(batch_data) >= batch_size:
                save_batch_to_disk(batch_data, batch_labels, output_dir, batch_count, label)
                batch_data, batch_labels = [], []
                batch_count += 1
        if batch_data:
            save_batch_to_disk(batch_data, batch_labels, output_dir, batch_count, label)
            batch_data, batch_labels = [], []
            batch_count += 1     
            
    else:
        for filename in os.listdir(label_dir):
            if filename.endswith(".wav"):
                filepath = os.path.join(label_dir, filename)
                waveform = processor.load_wav_file(filepath)
                features = processor.process_waveform(waveform)
                batch_data.append(features.numpy())
                batch_labels.append(label_index)

                for noise in background_noises:
                    augmented_waveform = add_background_noise(waveform, noise, desired_snr_db=20)
                    augmented_features = processor.process_waveform(augmented_waveform)
                    batch_data.append(augmented_features.numpy())
                    batch_labels.append(label_index)

                if len(batch_data) >= batch_size:
                    save_batch_to_disk(batch_data, batch_labels, output_dir, batch_count, label)
                    batch_data, batch_labels = [], []
                    batch_count += 1
        pass

    if batch_data:
        save_batch_to_disk(batch_data, batch_labels, output_dir, batch_count, label)
        batch_count += 1  # Increment after saving

    return batch_count  # Return the updated batch_count


In [7]:
def save_batch_to_disk(batch_data, batch_labels, output_dir, batch_count, label):
    os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, f"{label}_batch_{batch_count}_features.npy"), np.array(batch_data))
    np.save(os.path.join(output_dir, f"{label}_batch_{batch_count}_labels.npy"), np.array(batch_labels))


In [8]:
def combine_batches(output_dir, final_features_file, final_labels_file):
    all_features = []
    all_labels = []

    # Get lists of feature and label files
    feature_files = sorted([f for f in os.listdir(output_dir) if 'features' in f])
    label_files = sorted([f for f in os.listdir(output_dir) if 'labels' in f])

    # Ensure the lists are sorted and matched
    if len(feature_files) != len(label_files):
        raise ValueError("Number of feature files and label files do not match.")

    for feature_file, label_file in zip(feature_files, label_files):
        features = np.load(os.path.join(output_dir, feature_file))
        labels = np.load(os.path.join(output_dir, label_file))
        all_features.append(features)
        all_labels.append(labels)

    all_features = np.concatenate(all_features, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    np.save(os.path.join(output_dir, final_features_file), all_features)
    np.save(os.path.join(output_dir, final_labels_file), all_labels)
    print(f"Combined features shape: {all_features.shape}")
    print(f"Combined labels shape: {all_labels.shape}")


In [9]:
# Main processing
data_dir = "dataset_small"
background_file = os.path.join(data_dir, "background_noises", "white_noise.wav")
output_dir = "processed_dataset_small"
batch_size = 50

params = FeatureParams(
    sample_rate=16000,
    window_size_ms=30.0,
    window_stride_ms=20.0,
    num_mel_bins=40,
    lower_frequency=125.0,
    upper_frequency=7500.0,
    clip_duration_ms=1000.0
)
background_noises = split_background_noises(os.path.join(data_dir, "background_noises"), params)
processor = AudioProcessor(params)

batch_count = 0  # Initialize batch_count
for label in os.listdir(data_dir):
    if label.startswith('.'):
        continue
    print(f"Found label directory: {label}")
    label_dir = os.path.join(data_dir, label)
    if not os.path.isdir(label_dir):
        continue
    batch_count = process_and_save_label(label, label_dir, processor, background_noises, params, output_dir, batch_size, batch_count)


Found label directory: no
Processing label: no, label_index: 1
Found label directory: background_noises
Processing label: background_noises, label_index: 2
Found label directory: yes
Processing label: yes, label_index: 0


In [10]:
combine_batches(output_dir, "features.npy", "labels.npy")

Combined features shape: (79961, 49, 40)
Combined labels shape: (79961,)


In [11]:
output_dir = "processed_dataset_small"
labels_file = os.path.join(output_dir, "labels.npy")

labels = np.load(labels_file)

unique_labels, counts = np.unique(labels, return_counts=True)
print("Unique Labels and Counts:")
for label, count in zip(unique_labels, counts):
    print(f"Label: {label}, Count: {count}")

label_to_name = {0: "yes", 1: "no", 2: "background_noises"}
named_labels = [label_to_name[label] for label in unique_labels]
print("\nLabels Mapped to Names:")
for name, count in zip(named_labels, counts):
    print(f"Label: {name}, Count: {count}")

Unique Labels and Counts:
Label: 0, Count: 44484
Label: 1, Count: 35469
Label: 2, Count: 8

Labels Mapped to Names:
Label: yes, Count: 44484
Label: no, Count: 35469
Label: background_noises, Count: 8
