In [1]:
pip install tensorflow-io

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import random
import wave
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import numpy as np
import tensorflow as tf
import tensorflow_io as tfio


In [3]:
# Turn on eager execution and print the resulting state. Later cells call
# `.numpy()` on tensors, which only works when TensorFlow executes eagerly.
tf.compat.v1.enable_eager_execution()
print("Eager execution enabled:", tf.executing_eagerly())

Eager execution enabled: True


In [4]:
def check_sample_rate(directory):
    """Print the sample rate of every ``.wav`` file found under ``directory``.

    The directory tree is walked recursively with ``os.walk``. Each file whose
    name ends in ``.wav`` is opened with the standard-library ``wave`` module
    and one line is printed per file: either its frame rate or an error line.

    Args:
        directory: Root folder to scan.

    Returns:
        None. Results are reported via ``print`` only.
    """
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.wav'):
                continue
            filepath = os.path.join(root, file)
            try:
                with wave.open(filepath, 'rb') as wav_file:
                    sample_rate = wav_file.getframerate()
            except (wave.Error, EOFError) as e:
                # wave.open raises EOFError (with an empty message) for empty
                # or truncated files; report it instead of aborting the scan.
                reason = str(e) or "unexpected end of file"
                print(f"Error reading {filepath}: {reason}")
            else:
                print(f"File: {filepath}, Sample Rate: {sample_rate} Hz")

data_dir = 'dataset_small'  # Root folder of the dataset; replace with your data directory
# Uncomment to print the sample rate of every .wav file under data_dir:
#check_sample_rate(data_dir)

In [5]:
@dataclass
class FeatureParams:
    """Settings that control how an audio clip is framed and converted to mel features.

    Besides the constructor fields below, ``__post_init__`` sets three derived
    attributes (they are plain attributes, not dataclass fields):

    * ``window_length_samples`` - window size converted from ms to samples.
    * ``window_step_samples`` - window stride converted from ms to samples.
    * ``desired_samples`` - clip duration converted from ms to samples.

    Raises:
        ValueError: if the window size or stride works out to less than one sample.
    """
    sample_rate: int = 16000  # Audio sample rate in Hz
    window_size_ms: float = 30.0  # Window size in milliseconds
    window_stride_ms: float = 20.0  # Window stride in milliseconds
    num_mel_bins: int = 40  # Number of Mel bins
    lower_frequency: float = 125.0  # Lower frequency limit in Hz
    upper_frequency: float = 7500.0  # Upper frequency limit in Hz
    fft_length: Optional[int] = None  # FFT length; None means "derive from the window length"
    clip_duration_ms: float = 1000.0  # Duration to clip/pad audio in milliseconds

    def __post_init__(self):
        self.window_length_samples = int(self.sample_rate * self.window_size_ms / 1000)
        self.window_step_samples = int(self.sample_rate * self.window_stride_ms / 1000)
        if self.window_length_samples < 1 or self.window_step_samples < 1:
            # Fail here with a clear message rather than later with an
            # OverflowError / ZeroDivisionError deep inside feature extraction.
            raise ValueError(
                "sample_rate, window_size_ms and window_stride_ms must yield at least "
                f"one sample per window and per step (got window_length_samples="
                f"{self.window_length_samples}, window_step_samples={self.window_step_samples})"
            )
        if self.fft_length is None:
            # Smallest power of two >= window length, computed with exact
            # integer arithmetic instead of a floating-point log2.
            self.fft_length = 1 << (self.window_length_samples - 1).bit_length()
        self.desired_samples = int(self.sample_rate * self.clip_duration_ms / 1000)


In [6]:
def split_background_noise(background_file, params, chunk_duration_ms=1000):
    """
    Split the background noise file into consecutive, equally sized chunks.

    The file is decoded in full here. It is deliberately NOT loaded through
    ``AudioProcessor.load_wav_file``, because that method trims every clip to
    ``params.desired_samples`` and would discard all but the start of a long
    noise recording.

    Args:
        background_file: Path of the WAV file holding the background noise.
        params: Object providing ``sample_rate`` (used to convert ms to samples).
        chunk_duration_ms: Length of each chunk in milliseconds.

    Returns:
        A list of 1-D tensors, each ``chunk_samples`` long. A trailing partial
        chunk is dropped. If the file is shorter than one chunk it is
        zero-padded and returned as a single chunk, so the list is never empty.

    Raises:
        ValueError: if ``chunk_duration_ms`` works out to less than one sample.
    """
    # NOTE(review): the file's own sample rate is not checked; chunk sizes are
    # computed from params.sample_rate - confirm the noise file matches.
    audio_binary = tf.io.read_file(background_file)
    waveform, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)
    waveform = tf.squeeze(waveform, axis=-1)

    chunk_samples = int(params.sample_rate * chunk_duration_ms / 1000)
    if chunk_samples < 1:
        raise ValueError(f"chunk_duration_ms={chunk_duration_ms} yields no samples")

    total_samples = int(tf.shape(waveform)[0])
    num_chunks = total_samples // chunk_samples
    if num_chunks == 0:
        # Keep the old "at least one padded clip" behaviour for short files.
        waveform = tf.pad(waveform, [[0, chunk_samples - total_samples]])
        num_chunks = 1
    return [waveform[i * chunk_samples:(i + 1) * chunk_samples] for i in range(num_chunks)]

In [7]:
def load_background_noises(background_file: str, params: FeatureParams) -> list[tf.Tensor]:
    """
    Load a single long background noise file and split it into clip-sized pieces.

    The file is decoded in full here rather than through
    ``AudioProcessor.load_wav_file``: that method trims its result to
    ``params.desired_samples``, which previously limited this function to a
    single clip no matter how long the recording was.

    Args:
        background_file: Path of the WAV file holding the background noise.
        params: Feature parameters; ``desired_samples`` is used as the clip length.

    Returns:
        A list of 1-D tensors of ``params.desired_samples`` samples each. A
        trailing partial clip is dropped. A file shorter than one clip is
        zero-padded and returned as one clip, so the list is never empty.
    """
    print(f"Processing background noise file: {background_file}")
    # NOTE(review): the file's own sample rate is not checked here - confirm
    # that the noise file is recorded at params.sample_rate.
    audio_binary = tf.io.read_file(background_file)
    noise_waveform, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)
    noise_waveform = tf.squeeze(noise_waveform, axis=-1)

    # Each clip matches the duration used for the regular audio clips
    clip_length = params.desired_samples
    total_samples = int(tf.shape(noise_waveform)[0])
    num_clips = total_samples // clip_length
    if num_clips == 0:
        # Keep the old "at least one padded clip" behaviour for short files.
        noise_waveform = tf.pad(noise_waveform, [[0, clip_length - total_samples]])
        num_clips = 1

    # Split the long noise waveform into consecutive, non-overlapping clips
    background_clips = [
        noise_waveform[i * clip_length:(i + 1) * clip_length]
        for i in range(num_clips)
    ]

    print(f"Generated {len(background_clips)} background noise clips.")
    return background_clips

In [8]:
def add_background_noise(waveform: tf.Tensor, background_noises: list[tf.Tensor], desired_snr_db: float) -> tf.Tensor:
    """
    Mix one randomly chosen noise clip into a waveform at the desired Signal-to-Noise Ratio (SNR).

    Args:
        waveform: The clean signal. Assumed to be a 1-D tensor - TODO confirm.
        background_noises: A *list* of noise clips; one is picked with
            ``random.choice``. To mix in a specific clip, pass ``[clip]``.
        desired_snr_db: Target SNR in decibels (signal power / noise power).

    Returns:
        ``waveform`` plus the scaled noise clip. If the chosen clip is silent
        (zero power) the waveform is returned unchanged in value instead of
        being turned into NaNs.

    Raises:
        IndexError: if ``background_noises`` is empty (from ``random.choice``).
    """
    # Randomly select a background noise clip and cut it to the signal length.
    # A clip shorter than the waveform is not extended here.
    noise_clip = random.choice(background_noises)
    noise_clip = noise_clip[:len(waveform)]

    # Mean power of signal and noise
    signal_power = tf.reduce_mean(waveform ** 2)
    noise_power = tf.reduce_mean(noise_clip ** 2)

    # Scale the noise so that signal_power / scaled_noise_power == 10^(snr/10).
    # divide_no_nan yields 0 for a zero denominator, so silent noise gives a
    # scale of 0 rather than inf (which would make 0 * inf = NaN below).
    snr_linear = 10 ** (desired_snr_db / 10)
    scaling_factor = tf.sqrt(tf.math.divide_no_nan(signal_power, noise_power * snr_linear))
    scaled_noise = noise_clip * scaling_factor

    # Add scaled noise to the waveform
    return waveform + scaled_noise

In [9]:
class AudioProcessor:
    """Loads WAV clips and converts them into fixed-size log-mel spectrograms.

    All framing and filter-bank settings are read from the ``FeatureParams``
    instance passed to the constructor.
    """

    def __init__(self, params: FeatureParams):
        self.params = params

    def load_wav_file(self, filename: str) -> tf.Tensor:
        """Decode a WAV file to a mono waveform of exactly ``desired_samples`` samples.

        Longer recordings are truncated and shorter ones are zero-padded at
        the end. When running eagerly, a file whose sample rate differs from
        ``params.sample_rate`` is reported and resampled first; previously it
        was silently treated as if it were already at the target rate.

        Args:
            filename: Path of the WAV file.

        Returns:
            A 1-D float32 tensor of length ``params.desired_samples``.
        """
        audio_binary = tf.io.read_file(filename)
        waveform, sample_rate = tf.audio.decode_wav(audio_binary, desired_channels=1)
        waveform = tf.squeeze(waveform, axis=-1)
        sample_rate = tf.cast(sample_rate, tf.int32)

        # The comparison needs a concrete Python value, so it is only done in
        # eager mode; in graph mode the waveform is passed through unchanged,
        # exactly as before.
        if tf.executing_eagerly():
            file_rate = int(sample_rate)
            if file_rate != self.params.sample_rate:
                print(f"Warning: Sample rate mismatch for {filename}. "
                      f"Resampling from {file_rate} Hz to {self.params.sample_rate} Hz.")
                waveform = tfio.audio.resample(
                    waveform,
                    rate_in=tf.cast(file_rate, tf.int64),
                    rate_out=tf.cast(self.params.sample_rate, tf.int64),
                )

        # Trim or pad waveform to desired_samples
        waveform = waveform[:self.params.desired_samples]
        zero_padding = tf.zeros(
            [self.params.desired_samples - tf.shape(waveform)[0]], dtype=tf.float32)
        waveform = tf.concat([waveform, zero_padding], 0)
        return waveform

    def process_waveform(self, waveform: tf.Tensor) -> tf.Tensor:
        """Convert a waveform into a log-mel spectrogram with a fixed number of frames.

        Steps: frame the signal, apply a Hann window, take the power spectrum
        of the real FFT, project onto the mel filter bank, take the log, then
        pad/truncate along the frame axis.

        Args:
            waveform: Audio samples. Assumed to be a 1-D float tensor such as
                the output of ``load_wav_file`` - TODO confirm for other callers.

        Returns:
            A tensor with ``expected_num_frames`` rows and ``num_mel_bins``
            columns, where ``expected_num_frames`` is
            ``1 + (desired_samples - window_length_samples) // window_step_samples``.
        """
        # Generate frames (the tail is zero-padded so no samples are dropped)
        frames = tf.signal.frame(
            waveform,
            self.params.window_length_samples,
            self.params.window_step_samples,
            pad_end=True
        )
        # Apply Hann window
        window = tf.signal.hann_window(self.params.window_length_samples)
        windowed_frames = frames * window
        # Compute FFT and power spectrum
        fft = tf.signal.rfft(windowed_frames, [self.params.fft_length])
        power_spectrum = tf.abs(fft) ** 2
        # Apply Mel filter bank
        num_spectrogram_bins = self.params.fft_length // 2 + 1
        linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
            self.params.num_mel_bins,
            num_spectrogram_bins,
            self.params.sample_rate,
            self.params.lower_frequency,
            self.params.upper_frequency
        )
        mel_spectrogram = tf.tensordot(power_spectrum, linear_to_mel_weight_matrix, 1)
        # tensordot loses the static shape; restore it with the mel axis last
        mel_spectrogram.set_shape(power_spectrum.shape[:-1].concatenate(
            [self.params.num_mel_bins]))
        # Compute log-mel spectrogram (small offset avoids log(0))
        log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)

        # Ensure fixed number of frames
        expected_num_frames = 1 + (self.params.desired_samples - self.params.window_length_samples) // self.params.window_step_samples
        num_frames = tf.shape(log_mel_spectrogram)[0]
        num_padding_frames = expected_num_frames - num_frames

        # Pad or truncate to expected_num_frames without using Python conditionals
        num_padding_frames = tf.maximum(num_padding_frames, 0)
        log_mel_spectrogram = tf.pad(log_mel_spectrogram, [[0, num_padding_frames], [0, 0]], "CONSTANT")
        log_mel_spectrogram = log_mel_spectrogram[:expected_num_frames, :]

        return log_mel_spectrogram


In [10]:
def process_dataset(data_dir: str, params: FeatureParams):
    """Extract features for every ``.wav`` file in a one-folder-per-class dataset.

    Each sub-directory of ``data_dir`` is one class. Class indices are assigned
    in sorted order of the sub-directory names. Only directories take part in
    the numbering, so stray files in ``data_dir`` no longer consume an index
    and leave gaps in the label range.

    Args:
        data_dir: Root folder containing one sub-directory per label.
        params: Feature extraction parameters handed to ``AudioProcessor``.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays: the stacked per-file
        feature arrays and the matching integer class indices.
    """
    processor = AudioProcessor(params)
    labels = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )
    label_to_index = {label: index for index, label in enumerate(labels)}
    data = []
    data_labels = []
    for label in labels:
        label_dir = os.path.join(data_dir, label)
        print(f"Processing label: {label}")
        # Sorted so that the output order is reproducible across runs/machines
        for filename in sorted(os.listdir(label_dir)):
            if not filename.endswith('.wav'):
                continue
            filepath = os.path.join(label_dir, filename)
            waveform = processor.load_wav_file(filepath)
            if waveform is None:
                continue
            features = processor.process_waveform(waveform)
            data.append(features.numpy())
            data_labels.append(label_to_index[label])
    return np.array(data), np.array(data_labels)

In [11]:
def process_and_save_label(label, label_dir, processor, background_noises, params, output_dir, batch_size):
    """
    Extract features for every ``.wav`` file of one label and save them in batches.

    For each file the clean features are stored, followed by one noisy variant
    per entry of ``background_noises`` (mixed at 20 dB SNR). Batches are written
    to ``<output_dir>/<label>/batch_<n>_features.npy`` / ``..._labels.npy``.

    Args:
        label: Label name; stored as-is with every sample of this label.
        label_dir: Folder containing the label's ``.wav`` files.
        processor: Object providing ``load_wav_file`` and ``process_waveform``.
        background_noises: Iterable of noise clips used for augmentation.
        params: Not used by this function; kept so existing callers keep working.
        output_dir: Root output folder; a sub-folder per label is used.
        batch_size: A batch is flushed once it holds at least this many
            samples. The check runs after a file and all its augmentations
            were added, so a batch can exceed this size.

    Returns:
        None.
    """
    print(f"Processing label: {label}")
    batch_data = []
    batch_labels = []
    batch_count = 0

    # The save_batch_to_disk that is in effect when the notebook is run top to
    # bottom takes (batch_data, batch_labels, output_dir, batch_count). The
    # per-label folder is therefore passed as its output_dir; the former
    # 5-argument call raised a TypeError against that definition.
    label_output_dir = os.path.join(output_dir, label)

    # Sorted so that batch contents are reproducible across runs/machines
    for filename in sorted(os.listdir(label_dir)):
        if not filename.endswith('.wav'):
            continue
        filepath = os.path.join(label_dir, filename)
        waveform = processor.load_wav_file(filepath)
        if waveform is None:
            continue

        # Original waveform
        features = processor.process_waveform(waveform)
        batch_data.append(features.numpy())
        batch_labels.append(label)

        # Augment with background noise. add_background_noise expects a LIST
        # of clips and picks one at random, so wrap the single clip; passing
        # the bare tensor made it pick one scalar sample and then fail.
        for noise in background_noises:
            augmented_waveform = add_background_noise(waveform, [noise], desired_snr_db=20)
            augmented_features = processor.process_waveform(augmented_waveform)
            batch_data.append(augmented_features.numpy())
            batch_labels.append(label)

        # Save batch to disk
        if len(batch_data) >= batch_size:
            save_batch_to_disk(batch_data, batch_labels, label_output_dir, batch_count)
            batch_data = []
            batch_labels = []
            batch_count += 1

    # Save any remaining data
    if batch_data:
        save_batch_to_disk(batch_data, batch_labels, label_output_dir, batch_count)



In [12]:
def save_batch_to_disk(batch_data, batch_labels, output_dir, label, batch_count):
    """
    Write one batch of features and labels into the sub-folder of a label.

    Files created: ``<output_dir>/<label>/batch_<batch_count>_features.npy``
    and ``<output_dir>/<label>/batch_<batch_count>_labels.npy``.

    NOTE: a later cell defines another function with this same name; when the
    notebook is run top to bottom that later definition replaces this one.
    """
    target_dir = os.path.join(output_dir, label)
    os.makedirs(target_dir, exist_ok=True)

    # Common path prefix shared by the two files of this batch
    prefix = os.path.join(target_dir, f'batch_{batch_count}')
    features_path = f'{prefix}_features.npy'
    labels_path = f'{prefix}_labels.npy'

    np.save(features_path, batch_data)
    np.save(labels_path, batch_labels)
    print(f"Saved batch {batch_count} for label {label} to disk.")

In [13]:
def process_dataset_with_noise_in_batches(data_dir: str, background_file: str, params: FeatureParams, batch_size: int, output_dir: str):
    """
    Process the dataset with background noise augmentation and save it in batches.

    Every ``.wav`` file contributes two samples: its clean features and one
    variant mixed with a randomly chosen background clip at 20 dB SNR.
    Afterwards each background clip itself is added as a sample of the
    ``"_background_noise_"`` class.

    Class indices follow the sorted names of the sub-directories of
    ``data_dir``. Only directories are numbered, so stray files no longer
    leave gaps. ``"_background_noise_"`` gets the next free index, or keeps its
    own index if a folder of that name exists.

    Args:
        data_dir: Root folder containing one sub-directory per label.
        background_file: WAV file with background noise.
        params: Feature extraction parameters.
        batch_size: A batch is written once it holds at least this many samples.
        output_dir: Folder passed to ``save_batch_to_disk``.

    Returns:
        None. Results are only written to disk.
    """
    processor = AudioProcessor(params)
    background_noises = load_background_noises(background_file, params)

    labels = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )
    label_to_index = {label: index for index, label in enumerate(labels)}
    # setdefault: do not overwrite the index of an existing folder of this name
    noise_index = label_to_index.setdefault("_background_noise_", len(labels))

    batch_data = []
    batch_labels = []
    batch_count = 0

    def _flush_batch():
        """Write the pending samples as one batch and start a new, empty batch."""
        nonlocal batch_data, batch_labels, batch_count
        save_batch_to_disk(batch_data, batch_labels, output_dir, batch_count)
        batch_data = []
        batch_labels = []
        batch_count += 1

    for label in labels:
        label_dir = os.path.join(data_dir, label)
        print(f"Processing label: {label}")
        # Sorted so that batch contents are reproducible across runs/machines
        for filename in sorted(os.listdir(label_dir)):
            if not filename.endswith('.wav'):
                continue
            filepath = os.path.join(label_dir, filename)
            waveform = processor.load_wav_file(filepath)
            if waveform is None:
                continue

            # Original waveform
            features = processor.process_waveform(waveform)
            batch_data.append(features.numpy())
            batch_labels.append(label_to_index[label])

            # Augment with background noise
            augmented_waveform = add_background_noise(waveform, background_noises, desired_snr_db=20)
            augmented_features = processor.process_waveform(augmented_waveform)
            batch_data.append(augmented_features.numpy())
            batch_labels.append(label_to_index[label])

            # Save batch to disk if it reaches the batch size
            if len(batch_data) >= batch_size:
                _flush_batch()

    # Add background noise as a separate class
    for noise_waveform in background_noises:
        features = processor.process_waveform(noise_waveform)
        batch_data.append(features.numpy())
        batch_labels.append(noise_index)

        if len(batch_data) >= batch_size:
            _flush_batch()

    # Save any remaining data
    if batch_data:
        _flush_batch()


In [14]:
def save_batch_to_disk(batch_data, batch_labels, output_dir, batch_count):
    """
    Write one batch of features and labels directly into ``output_dir``.

    Files created: ``<output_dir>/batch_<batch_count>_features.npy`` and
    ``<output_dir>/batch_<batch_count>_labels.npy``. The folder is created
    if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Common path prefix shared by the two files of this batch
    prefix = os.path.join(output_dir, f'batch_{batch_count}')
    features_path = f'{prefix}_features.npy'
    labels_path = f'{prefix}_labels.npy'

    np.save(features_path, batch_data)
    np.save(labels_path, batch_labels)
    print(f"Saved batch {batch_count} to disk.")

In [15]:
def save_processed_data(features: np.ndarray, labels: np.ndarray, output_dir: str):
    """Store the full feature and label arrays as two ``.npy`` files.

    Creates ``output_dir`` if needed, writes ``features.npy`` and
    ``labels.npy`` into it, and prints the path of each file.
    """
    os.makedirs(output_dir, exist_ok=True)

    features_path = os.path.join(output_dir, 'features.npy')
    labels_path = os.path.join(output_dir, 'labels.npy')

    np.save(features_path, features)
    np.save(labels_path, labels)

    print(f"Saved features to {features_path}")
    print(f"Saved labels to {labels_path}")

In [None]:
# --- Configuration -----------------------------------------------------------
data_dir = 'dataset_small'  # Root folder with one sub-folder per label; replace with your data directory
# NOTE(review): this file lives inside data_dir, so the `background_noises`
# folder is also visited as a label by the loop at the end of this cell -
# confirm that is intended.
background_file = 'dataset_small/background_noises/white_noise.wav'
output_dir = 'processed_dataset_small'  # Where the batch files are written; replace with your desired output directory
batch_size = 50  # Minimum number of samples collected before a batch is flushed

# Initialize parameters (these values equal the FeatureParams defaults)
params = FeatureParams(
    sample_rate=16000,
    window_size_ms=30.0,
    window_stride_ms=20.0,
    num_mel_bins=40,
    lower_frequency=125.0,
    upper_frequency=7500.0,
    clip_duration_ms=1000.0
)

# Alternative (disabled): process the whole dataset in one call.
# NOTE(review): process_dataset_with_noise_in_batches has no return statement,
# so unpacking its result as written below would fail if re-enabled.
#features, labels = process_dataset_with_noise_in_batches(
#    data_dir, background_file, params, batch_size, output_dir
#)

# Save processed data (disabled together with the call above)
#save_processed_data(features, labels, output_dir)

# Split background noise into smaller chunks (default chunk length: 1000 ms)
background_noises = split_background_noise(background_file, params)

# Process each label individually: every sub-folder of data_dir is one label
labels = sorted(os.listdir(data_dir))
processor = AudioProcessor(params)

for label in labels:
    label_dir = os.path.join(data_dir, label)
    # Skip plain files that sit next to the label folders
    if not os.path.isdir(label_dir):
        continue
    process_and_save_label(label, label_dir, processor, background_noises, params, output_dir, batch_size)

print("Processing complete.")

In [None]:
# Sanity check: report whether TensorFlow is (still) executing eagerly.
print("Eager execution enabled:", tf.executing_eagerly())