In [None]:
from pathlib import Path
import urllib.request
import tarfile
import torch
import torchaudio
from tqdm import tqdm

In [None]:
# Colab-only: make the user's Google Drive available under /content/drive.
# A later cell writes the extracted features to /content/drive/MyDrive/...,
# so this mount has to succeed before that cell is run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def download_speech_commands(data_path="./speech_commands_data"):
    """
    Download and extract Google Speech Commands dataset v0.02

    The archive is downloaded next to ``data_path`` (into its parent
    directory), extracted into ``data_path`` and then deleted. If
    ``data_path`` already exists and contains anything, nothing is downloaded.

    On failure the function cleans up after itself: the (possibly partial)
    archive is deleted and anything already extracted into ``data_path`` is
    removed, so that a later call retries from scratch instead of mistaking a
    half-extracted directory for a complete dataset.

    Args:
        data_path: Directory (str or Path) that should contain the dataset.

    Returns:
        bool: True if successful, False otherwise
    """
    import shutil  # local import: only needed to discard a partial extraction

    data_path = Path(data_path)

    # Any existing content counts as "already downloaded".
    if data_path.exists() and any(data_path.iterdir()):
        print("✅ Dataset already exists")
        return True

    print("📥 Downloading Google Speech Commands dataset...")
    print("Dataset info: https://arxiv.org/abs/1804.03209")

    url = "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz"
    tar_path = data_path.parent / "speech_commands_v0.02.tar.gz"

    try:
        # Create directory
        data_path.mkdir(parents=True, exist_ok=True)

        # Download with progress
        print(f"Downloading from: {url}")
        print("Size: ~2GB - this may take several minutes...")

        last_shown = None

        def progress_hook(block_num, block_size, total_size):
            # urlretrieve calls this once per received block; only print when
            # the displayed value actually changes to avoid flooding the
            # notebook output with hundreds of thousands of identical lines.
            nonlocal last_shown
            if total_size <= 0:
                return
            downloaded = block_num * block_size
            shown = f"{min(100, downloaded * 100 / total_size):.1f}"
            if shown != last_shown:
                last_shown = shown
                print(f"\rDownload progress: {shown}%", end="", flush=True)

        urllib.request.urlretrieve(url, tar_path, reporthook=progress_hook)
        print("\n✅ Download complete")

        # Extract
        print("📦 Extracting dataset...")
        with tarfile.open(tar_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse absolute paths, "../" escapes, links pointing outside
                # the target and special files in the downloaded archive.
                tar.extractall(data_path, filter="data")
            else:
                # Older Python without extraction filters.
                tar.extractall(data_path)

        print("✅ Dataset extraction complete")

        # Verify
        commands_found = [d.name for d in data_path.iterdir() if d.is_dir()]
        print(f"Found {len(commands_found)} command categories")

        return True

    except Exception as e:
        # data_path was empty or missing when we started, so everything in it
        # now comes from this failed attempt and is safe to discard.
        try:
            if data_path.is_dir():
                for leftover in data_path.iterdir():
                    if leftover.is_dir() and not leftover.is_symlink():
                        shutil.rmtree(leftover, ignore_errors=True)
                    else:
                        leftover.unlink()
        except OSError:
            pass  # best-effort cleanup; the original error is reported below

        print(f"❌ Error downloading dataset: {e}")
        print("\nManual download instructions:")
        print(
            "1. Download: https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz"
        )
        print(f"2. Extract to: {data_path}")
        return False

    finally:
        # Cleanup: the archive is never needed afterwards, whether the run
        # succeeded or failed (a partial archive would only waste ~2GB).
        try:
            tar_path.unlink()
        except OSError:
            pass

# Fetch and unpack the dataset into ./speech_commands_data. The call is left
# as the cell's final expression so its True/False result is displayed.
download_speech_commands("./speech_commands_data")

📥 Downloading Google Speech Commands dataset...
Dataset info: https://arxiv.org/abs/1804.03209
Downloading from: https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz
Size: ~2GB - this may take several minutes...
Download progress: 100.0%
✅ Download complete
📦 Extracting dataset...
✅ Dataset extraction complete
Found 36 command categories


True

In [None]:
# Define the 10 core commands
commands = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

# Define the data path
data_path = Path("./speech_commands_data")

# Read validation and testing lists. Each line is a path relative to the
# dataset root written with forward slashes, e.g. "yes/<file>.wav".
with open(data_path / 'validation_list.txt', 'r', encoding='utf-8') as f:
    val_rel_paths = set(f.read().splitlines())
with open(data_path / 'testing_list.txt', 'r', encoding='utf-8') as f:
    test_rel_paths = set(f.read().splitlines())

# Get all .wav files in the 10 command directories.
# glob() yields files in filesystem order, which differs between machines;
# sorting makes the sample order (and thus the saved tensors) reproducible.
all_files = []
for command in commands:
    command_dir = data_path / command
    wav_files = sorted(command_dir.glob('*.wav'))
    all_files.extend(wav_files)

# Assign files to train, val, test: anything not listed in the validation or
# testing list is a training file.
train_files = []
val_files = []
test_files = []
for file in all_files:
    # as_posix() always uses "/" so the lookup matches the list files on every
    # OS; str() would produce "yes\\x.wav" on Windows and silently put every
    # file into the training split.
    rel_path = file.relative_to(data_path).as_posix()
    if rel_path in val_rel_paths:
        val_files.append(file)
    elif rel_path in test_rel_paths:
        test_files.append(file)
    else:
        train_files.append(file)

print(f"Number of training files: {len(train_files)}")
print(f"Number of validation files: {len(val_files)}")
print(f"Number of testing files: {len(test_files)}")

# Define label map: command name -> integer class index (position in `commands`)
label_map = {command: i for i, command in enumerate(commands)}

Number of training files: 30769
Number of validation files: 3703
Number of testing files: 4074


In [None]:
# MFCC feature extractor shared by all splits: 20 coefficients computed from
# a 40-band mel spectrogram. At 16 kHz, n_fft=400 is a 25 ms analysis window
# and hop_length=160 is a 10 ms step between frames.
mfcc_transform = torchaudio.transforms.MFCC(
    n_mfcc=20,
    sample_rate=16000,
    melkwargs=dict(n_fft=400, hop_length=160, n_mels=40),
)

# Function to compute MFCCs
def compute_mfccs(file_list, label_map, transform):
    """
    Load audio files, extract MFCC features and zero-pad them into one batch.

    Args:
        file_list: Sequence of ``pathlib.Path`` objects pointing at audio
            files. The name of each file's parent directory is used as the
            class name.
        label_map: Mapping from class name (directory name) to integer label.
        transform: Callable applied to the loaded waveform. Assumed to return
            a tensor of shape (1, n_mfcc, seq_len) for mono audio; the leading
            channel dimension is squeezed away.

    Returns:
        tuple: ``(mfccs_tensor, labels_tensor)`` where ``mfccs_tensor`` has
        shape (num_samples, seq_len, n_mfcc) and ``labels_tensor`` has shape
        (num_samples,). ``seq_len`` is the longest sequence seen in *this*
        call, so separate calls may return different sequence lengths.

    Raises:
        KeyError: If a file's parent directory name is not in ``label_map``.
    """
    mfccs = []
    labels = []
    max_len = 0

    # Compute all MFCCs and find the max sequence length
    for file in tqdm(file_list, desc="Computing MFCCs"):
        waveform, sr = torchaudio.load(file)
        if sr != 16000:
            # Only a warning: the file is still processed as-is.
            print(f"Warning: sample rate {sr} for {file}")
        mfcc = transform(waveform).squeeze(0).transpose(0, 1)  # Shape: (seq_len, n_mfcc)
        max_len = max(max_len, mfcc.shape[0])
        mfccs.append(mfcc)
        labels.append(label_map[file.parent.name])

    # Pad all to max_len. The pad spec is (last-dim left, last-dim right,
    # first-dim left, first-dim right): only the end of the time axis grows.
    # torch.nn.functional is reached through the imported `torch` package; the
    # previous `F.pad` relied on an alias that is never imported in this
    # notebook and raised NameError on a fresh kernel.
    padded_mfccs = [
        torch.nn.functional.pad(mfcc, (0, 0, 0, max_len - mfcc.shape[0]))
        for mfcc in mfccs
    ]

    mfccs_tensor = torch.stack(padded_mfccs)  # Shape: (num_samples, seq_len, n_mfcc)
    labels_tensor = torch.tensor(labels)  # Shape: (num_samples,)
    return mfccs_tensor, labels_tensor

# Feature extraction for the three partitions (train / validation / test)
train_mfccs, train_labels = compute_mfccs(train_files, label_map, mfcc_transform)
val_mfccs, val_labels = compute_mfccs(val_files, label_map, mfcc_transform)
test_mfccs, test_labels = compute_mfccs(test_files, label_map, mfcc_transform)

# Persist every partition to Google Drive as one .pt file holding a dict with
# the feature tensor under 'mfccs' and the label tensor under 'labels'.
save_path = Path('/content/drive/MyDrive/speech_commands_mfccs')
save_path.mkdir(parents=True, exist_ok=True)
for split_file, split_mfccs, split_labels in (
    ('train.pt', train_mfccs, train_labels),
    ('val.pt', val_mfccs, val_labels),
    ('test.pt', test_mfccs, test_labels),
):
    torch.save({'mfccs': split_mfccs, 'labels': split_labels}, save_path / split_file)
print("MFCCs saved to Google Drive.")

Computing MFCCs: 100%|██████████| 30769/30769 [03:12<00:00, 159.80it/s]
Computing MFCCs: 100%|██████████| 3703/3703 [00:22<00:00, 167.19it/s]
Computing MFCCs: 100%|██████████| 4074/4074 [00:24<00:00, 168.77it/s]


MFCCs saved to Google Drive.
