In [None]:
import os
import torch
from scipy.io import wavfile
from torchaudio.transforms import MelSpectrogram
import librosa
from tqdm import tqdm

# Root folder of the raw training audio that gets walked for .wav files.
input_base_dir = "../data/raw/train/audio"
# Root folder that receives the saved spectrogram tensors (.pt files).
output_base_dir = "../data/processed/train/audio"
# Target sampling rate; files read at a different rate are resampled to this value.
sample_rate = 16000
# Mel-spectrogram transform; only the sample rate is set, every other
# MelSpectrogram parameter is left at the library default.
mel_transform = MelSpectrogram(sample_rate=sample_rate)

# Make sure the output root exists before any per-class folder is created.
os.makedirs(output_base_dir, exist_ok=True)


def process_audio_to_mel(input_dir, output_dir, transform):
    """Convert every ``.wav`` file under *input_dir* into a saved spectrogram.

    The tree rooted at ``input_dir`` is traversed with ``os.walk``. Each
    directory that holds at least one ``.wav`` file is treated as a class whose
    label is the directory's base name; the transformed waveform of every file
    is written with ``torch.save`` to ``output_dir/<label>/<stem>.pt``.

    Directories named ``_background_noise_`` are skipped and not descended
    into. Directories without any ``.wav`` file (for example a dataset root
    that only contains a README) get no output folder, so no empty
    pseudo-class shows up under ``output_dir``.

    The function reads the module-level ``sample_rate``: a file whose rate
    differs from it is resampled with ``librosa.resample`` first.

    Args:
        input_dir: Root directory that is searched recursively for audio.
        output_dir: Root directory that receives one sub-folder per label.
        transform: Callable applied to the 1-D ``float32`` waveform tensor;
            whatever it returns is what gets saved.

    Returns:
        A list of strings with one ``"Saved: ..."`` or ``"Failed: ..."``
        message per ``.wav`` file that was attempted. Per-file errors are
        recorded here instead of being raised.
    """
    print(f"Processing audio files from {input_dir} to {output_dir}")
    log = []

    for root, dirs, files in os.walk(input_dir):
        # Prune in place so os.walk never descends into the noise folder;
        # a bare `continue` would still visit its sub-directories.
        dirs[:] = [d for d in dirs if d != "_background_noise_"]

        label = os.path.basename(root)
        if label == "_background_noise_":
            # Only reachable when input_dir itself is the noise folder.
            continue

        # Only audio files count: this keeps stray files (README, LICENSE, ...)
        # from creating an empty class folder and from inflating the progress bar.
        wav_files = [name for name in files if name.lower().endswith(".wav")]
        if not wav_files:
            continue

        print(f"Processing directory: {root}")
        output_label_dir = os.path.join(output_dir, label)
        os.makedirs(output_label_dir, exist_ok=True)

        for file in tqdm(wav_files, desc=f"Processing class '{label}'"):
            input_file_path = os.path.join(root, file)
            try:
                sr, waveform = wavfile.read(input_file_path)
                if waveform.ndim > 1:
                    waveform = waveform[:, 0]  # Take first channel
                if sr != sample_rate:
                    waveform = librosa.resample(
                        waveform.astype(float),
                        orig_sr=sr,
                        target_sr=sample_rate,
                    )
                # NOTE: samples are cast to float32 as-is; no amplitude
                # rescaling is applied before the transform.
                waveform = torch.tensor(waveform, dtype=torch.float32)
                mel_spec = transform(waveform)
                output_file_path = os.path.join(
                    output_label_dir, f"{os.path.splitext(file)[0]}.pt"
                )
                torch.save(mel_spec, output_file_path)
                log.append(f"Saved: {output_file_path}")
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole run.
                log.append(f"Failed: {input_file_path} with error {str(e)}")
    return log


# Convert the whole training set and keep the per-file status messages.
log_messages = process_audio_to_mel(
    input_dir=input_base_dir,
    output_dir=output_base_dir,
    transform=mel_transform,
)



Processing audio files from ../data/raw/train/audio to ../data/processed/train/audio
Processing directory: ../data/raw/train/audio
Processing directory: ../data/raw/train/audio\bed
Processed 0 files so far...


Processing class 'bed': 100%|██████████| 1713/1713 [00:28<00:00, 59.24it/s]


Processing directory: ../data/raw/train/audio\bird
Processed 0 files so far...


Processing class 'bird': 100%|██████████| 1731/1731 [00:25<00:00, 68.79it/s]


Processing directory: ../data/raw/train/audio\cat
Processed 0 files so far...


Processing class 'cat': 100%|██████████| 1733/1733 [00:27<00:00, 62.65it/s]


Processing directory: ../data/raw/train/audio\dog
Processed 0 files so far...


Processing class 'dog': 100%|██████████| 1746/1746 [00:51<00:00, 34.04it/s]


Processing directory: ../data/raw/train/audio\down
Processed 0 files so far...


Processing class 'down': 100%|██████████| 2359/2359 [01:04<00:00, 36.81it/s]


Processing directory: ../data/raw/train/audio\eight
Processed 0 files so far...


Processing class 'eight': 100%|██████████| 2352/2352 [01:03<00:00, 37.08it/s]


Processing directory: ../data/raw/train/audio\five
Processed 0 files so far...


Processing class 'five': 100%|██████████| 2357/2357 [01:17<00:00, 30.25it/s]


Processing directory: ../data/raw/train/audio\four
Processed 0 files so far...


Processing class 'four': 100%|██████████| 2372/2372 [01:11<00:00, 33.08it/s]


Processing directory: ../data/raw/train/audio\go
Processed 0 files so far...


Processing class 'go': 100%|██████████| 2372/2372 [01:40<00:00, 23.60it/s]


Processing directory: ../data/raw/train/audio\happy
Processed 0 files so far...


Processing class 'happy': 100%|██████████| 1742/1742 [00:34<00:00, 49.80it/s]


Processing directory: ../data/raw/train/audio\house
Processed 0 files so far...


Processing class 'house': 100%|██████████| 1750/1750 [00:29<00:00, 60.12it/s]


Processing directory: ../data/raw/train/audio\left
Processed 0 files so far...


Processing class 'left': 100%|██████████| 2353/2353 [00:35<00:00, 67.11it/s]


Processing directory: ../data/raw/train/audio\marvin
Processed 0 files so far...


Processing class 'marvin': 100%|██████████| 1746/1746 [00:26<00:00, 66.58it/s]


Processing directory: ../data/raw/train/audio\nine
Processed 0 files so far...


Processing class 'nine': 100%|██████████| 2364/2364 [01:08<00:00, 34.39it/s]


Processing directory: ../data/raw/train/audio\no
Processed 0 files so far...


Processing class 'no': 100%|██████████| 2375/2375 [01:12<00:00, 32.64it/s]


Processing directory: ../data/raw/train/audio\off
Processed 0 files so far...


Processing class 'off': 100%|██████████| 2357/2357 [01:50<00:00, 21.33it/s]


Processing directory: ../data/raw/train/audio\on
Processed 0 files so far...


Processing class 'on': 100%|██████████| 2367/2367 [01:03<00:00, 37.25it/s]


Processing directory: ../data/raw/train/audio\one
Processed 0 files so far...


Processing class 'one': 100%|██████████| 2370/2370 [01:14<00:00, 31.93it/s]


Processing directory: ../data/raw/train/audio\right
Processed 0 files so far...


Processing class 'right': 100%|██████████| 2367/2367 [01:49<00:00, 21.63it/s]


Processing directory: ../data/raw/train/audio\seven
Processed 0 files so far...


Processing class 'seven': 100%|██████████| 2377/2377 [01:06<00:00, 35.93it/s]


Processing directory: ../data/raw/train/audio\sheila
Processed 0 files so far...


Processing class 'sheila': 100%|██████████| 1734/1734 [00:32<00:00, 52.68it/s]


Processing directory: ../data/raw/train/audio\six
Processed 0 files so far...


Processing class 'six': 100%|██████████| 2369/2369 [00:43<00:00, 54.94it/s]


Processing directory: ../data/raw/train/audio\stop
Processed 0 files so far...


Processing class 'stop': 100%|██████████| 2380/2380 [00:44<00:00, 53.62it/s]


Processing directory: ../data/raw/train/audio\three
Processed 0 files so far...


Processing class 'three': 100%|██████████| 2356/2356 [00:45<00:00, 52.23it/s]


Processing directory: ../data/raw/train/audio\tree
Processed 0 files so far...


Processing class 'tree': 100%|██████████| 1733/1733 [00:39<00:00, 44.36it/s]


Processing directory: ../data/raw/train/audio\two
Processed 0 files so far...


Processing class 'two': 100%|██████████| 2373/2373 [00:44<00:00, 53.83it/s]


Processing directory: ../data/raw/train/audio\up
Processed 0 files so far...


Processing class 'up': 100%|██████████| 2375/2375 [00:42<00:00, 56.14it/s]


Processing directory: ../data/raw/train/audio\wow
Processed 0 files so far...


Processing class 'wow': 100%|██████████| 1745/1745 [00:31<00:00, 55.85it/s]


Processing directory: ../data/raw/train/audio\yes
Processed 0 files so far...


Processing class 'yes': 100%|██████████| 2377/2377 [00:42<00:00, 55.41it/s]


Processing directory: ../data/raw/train/audio\zero
Processed 0 files so far...


Processing class 'zero': 100%|██████████| 2376/2376 [00:42<00:00, 55.54it/s]


Processing directory: ../data/raw/train/audio\_background_noise_
Processed 0 files so far...


  sr, waveform = wavfile.read(input_file_path)
Processing class '_background_noise_': 100%|██████████| 7/7 [00:00<00:00, 22.63it/s]


In [3]:
import os 

input_base_dir = "../data/raw/train/audio"

# Show the entries found directly under the raw audio root, one name per
# line, in the order os.listdir reports them.
for entry_name in os.listdir(input_base_dir):
    print(entry_name)

bed
bird
cat
dog
down
eight
five
four
go
happy
house
left
marvin
nine
no
off
on
one
right
seven
sheila
six
stop
three
tree
two
up
wow
yes
zero
_background_noise_


In [2]:
import os
import torch
from scipy.io import wavfile
from torchaudio.transforms import MelSpectrogram
import librosa
from tqdm import tqdm

# Root folder of the raw training audio; the background-noise recordings are
# looked up in a sub-folder of it.
input_base_dir = "../data/raw/train/audio"
# Root folder that receives the saved spectrogram tensors (.pt files).
output_base_dir = "../data/processed/train/audio"
# Sampling rate used to build the mel transform below.
sample_rate = 16000
# Mel-spectrogram transform; only the sample rate is set, every other
# MelSpectrogram parameter is left at the library default.
mel_transform = MelSpectrogram(sample_rate=sample_rate)

# Make sure the output root exists before anything is written below it.
os.makedirs(output_base_dir, exist_ok=True)


def process_background_noise(
    input_dir, output_dir, transform, sample_rate=16000, chunk_duration=1
):
    """Cut the background-noise recordings into fixed-length "silence" samples.

    Every ``.wav`` file in ``input_dir/_background_noise_`` is read, reduced to
    its first channel, resampled to ``sample_rate`` when necessary and split
    into consecutive, non-overlapping chunks of ``chunk_duration`` seconds. A
    trailing remainder shorter than one chunk is dropped. Each chunk is passed
    through ``transform`` and the result is stored with ``torch.save`` as
    ``output_dir/silence/silence_<NNNN>.pt``, numbered continuously across all
    input files.

    Args:
        input_dir: Directory that contains the ``_background_noise_`` folder.
        output_dir: Directory in which the ``silence`` folder is created.
        transform: Callable applied to each 1-D ``float32`` chunk tensor;
            whatever it returns is what gets saved.
        sample_rate: Rate the audio is brought to before chunking.
        chunk_duration: Chunk length in seconds; may be fractional.

    Returns:
        A list of strings with one ``"Saved silence: ..."`` message per chunk
        written and one ``"Failed: ..."`` message per file that could not be
        processed.

    Raises:
        ValueError: If ``sample_rate * chunk_duration`` rounds to less than
            one sample.
        OSError: Propagated from ``os.listdir`` when the noise folder cannot
            be listed (for example because it does not exist).
    """
    # Round to whole samples: a fractional duration would otherwise make the
    # chunk size a float and break both range() and tensor slicing.
    chunk_size = int(round(sample_rate * chunk_duration))
    if chunk_size <= 0:
        raise ValueError(
            "sample_rate * chunk_duration must be at least one sample, "
            f"got {sample_rate} * {chunk_duration}"
        )

    background_dir = os.path.join(input_dir, "_background_noise_")
    # Save into a new "silence" folder
    silence_output_dir = os.path.join(output_dir, "silence")
    os.makedirs(silence_output_dir, exist_ok=True)

    counter = 0
    log = []

    # Sorted so the silence_<NNNN> numbering is reproducible between runs.
    for file in sorted(os.listdir(background_dir)):
        if not file.lower().endswith(".wav"):
            continue

        input_file_path = os.path.join(background_dir, file)
        try:
            sr, waveform = wavfile.read(input_file_path)
            if waveform.ndim > 1:
                waveform = waveform[:, 0]  # keep the first channel only
            if sr != sample_rate:
                waveform = librosa.resample(
                    waveform.astype(float), orig_sr=sr, target_sr=sample_rate
                )
            # NOTE: samples are cast to float32 as-is; no amplitude rescaling
            # is applied before the transform.
            waveform = torch.tensor(waveform, dtype=torch.float32)

            # Floor division drops the incomplete chunk at the end of the file.
            total_chunks = len(waveform) // chunk_size

            for start in range(0, total_chunks * chunk_size, chunk_size):
                chunk = waveform[start : start + chunk_size]
                mel_spec = transform(chunk)
                output_file_path = os.path.join(
                    silence_output_dir, f"silence_{counter:04d}.pt"
                )
                torch.save(mel_spec, output_file_path)
                counter += 1
                log.append(f"Saved silence: {output_file_path}")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            log.append(f"Failed: {input_file_path} with error {str(e)}")

    print(f"Generated {counter} silence samples.")
    return log

# Build the "silence" class from the background-noise recordings and keep
# the per-chunk status messages.
log_silence = process_background_noise(
    input_dir=input_base_dir,
    output_dir=output_base_dir,
    transform=mel_transform,
)

  sr, waveform = wavfile.read(input_file_path)


Generated 398 silence samples.
