# UrbanSound8K → Log-Mel Spectrogram Generator (Colab / Jupyter)

This notebook:
1. Downloads **UrbanSound8K** from Kaggle
2. Unzips it
3. Converts each `.wav` clip into a **log-mel spectrogram**
4. Saves spectrograms as `.png` images (and optionally as `.npy` arrays)

## Prereqs (Kaggle download)
You need a Kaggle API token:
- Kaggle → Account → *Create New API Token* → downloads `kaggle.json`
- Upload `kaggle.json` into this notebook runtime (or, when running locally, place it in the notebook's working directory)

**Do not commit `kaggle.json` to GitHub.**


In [None]:
# ===== Install dependencies (safe to run multiple times) =====
import sys, subprocess, os, pathlib

def pip_install(pkgs):
    """Install the given packages with pip into the running interpreter.

    Args:
        pkgs: List of package specifiers passed straight to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Invoke pip through sys.executable so packages land in the environment
    # this kernel is actually running in.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + pkgs)

# pandas is imported further down to read the metadata CSV, so it is installed
# here together with the other dependencies.
pip_install(["kaggle", "librosa", "soundfile", "numpy", "pandas", "matplotlib", "tqdm"])


## 1) Configure Kaggle API

Place `kaggle.json` in the working directory (same folder as this notebook), or set the path below.


In [None]:
# ===== Kaggle API setup =====
import os, pathlib, shutil

# Change this if your kaggle.json is elsewhere
KAGGLE_JSON_PATH = "kaggle.json"

# Expand "~" so a value such as "~/Downloads/kaggle.json" is accepted.
token_src = pathlib.Path(KAGGLE_JSON_PATH).expanduser()

assert token_src.exists(), (
    "kaggle.json not found. Upload it to the runtime or set KAGGLE_JSON_PATH correctly."
)

# Kaggle expects ~/.kaggle/kaggle.json with strict permissions
kaggle_dir = pathlib.Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)

dest = kaggle_dir / "kaggle.json"

# shutil.copy raises SameFileError when source and destination are the same
# file, which happens when the token already lives in ~/.kaggle; skip the copy
# in that case and only tighten the permissions.
already_in_place = dest.exists() and os.path.samefile(token_src, dest)
if not already_in_place:
    shutil.copy(token_src, dest)

# Strict permissions (owner read/write only)
os.chmod(dest, 0o600)

print(f"Kaggle token installed at: {dest}")


## 2) Download UrbanSound8K from Kaggle
Dataset: `chrisfilo/urbansound8k`


In [None]:
# ===== Download dataset =====
import subprocess, sys, pathlib

# All downloaded artefacts live under ./data
DATA_ROOT = pathlib.Path("data")
DATA_ROOT.mkdir(exist_ok=True)

dataset_slug = "chrisfilo/urbansound8k"
zip_path = DATA_ROOT / "urbansound8k.zip"

# Only hit the network when the archive is not already on disk.
if not zip_path.exists():
    download_cmd = [
        "kaggle", "datasets", "download",
        "-d", dataset_slug,
        "-p", str(DATA_ROOT),
        "--force",
    ]
    subprocess.check_call(download_cmd)

    # The archive is expected to be called urbansound8k.zip. In case the CLI
    # used another name, take the most recently modified zip in DATA_ROOT and
    # move it to the expected location.
    zips = sorted(DATA_ROOT.glob("*.zip"), key=lambda p: p.stat().st_mtime, reverse=True)
    assert zips, "No zip found after Kaggle download."
    newest_zip = zips[0]
    if newest_zip != zip_path:
        newest_zip.rename(zip_path)

size_mb = zip_path.stat().st_size / 1e6
print(f"Zip ready: {zip_path} ({size_mb:.1f} MB)")


In [None]:
# ===== Unzip =====
import zipfile, pathlib

UNZIP_DIR = DATA_ROOT / "UrbanSound8K"
if not UNZIP_DIR.exists():
    with zipfile.ZipFile(zip_path, "r") as zf:
        # An archive that already wraps its content in a top-level
        # "UrbanSound8K/" folder is extracted next to it, as before. A flat
        # archive (files directly at the root) is extracted *into* UNZIP_DIR
        # instead, so that the directory really exists afterwards, the guard
        # above skips re-extraction on the next run, and the message below
        # points at a real location.
        top_level_names = {member.split("/", 1)[0] for member in zf.namelist()}
        extract_to = DATA_ROOT if UNZIP_DIR.name in top_level_names else UNZIP_DIR
        zf.extractall(extract_to)
print(f"Unzipped to: {UNZIP_DIR.resolve()}")


## 3) Generate Log-Mel Spectrograms

We generate a **log-mel spectrogram** per clip, then save as:
- PNG image (default)
- optional `.npy` array (for faster training sweeps)

Notes:
- We **resample** audio to `target_sr` (default 22050 Hz; change to 16000 if you prefer).
- We use typical settings: `n_mels=128`, `n_fft=2048`, `hop_length=512`.
- Images are saved per fold in: `outputs/spectrograms/foldX/<class>/<filename>.png`


In [None]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import pandas as pd
from pathlib import Path

# Candidate (metadata CSV, audio root) locations, tried in order:
#   1. UNZIP_DIR/metadata/UrbanSound8K.csv with clips under UNZIP_DIR/audio/foldN
#   2. flat content inside UNZIP_DIR (CSV and foldN folders side by side)
#   3. flat content extracted directly into the parent data folder
# NOTE(review): layouts 2 and 3 are there for archives that ship without the
# metadata/ and audio/ sub-folders - confirm against the zip you downloaded.
_layout_candidates = [
    (UNZIP_DIR / "metadata" / "UrbanSound8K.csv", UNZIP_DIR / "audio"),
    (UNZIP_DIR / "UrbanSound8K.csv", UNZIP_DIR),
    (UNZIP_DIR.parent / "UrbanSound8K.csv", UNZIP_DIR.parent),
]

# Pick the first layout that is actually present; fall back to the first
# candidate so the assertions below report the canonical expected paths.
META_CSV, AUDIO_ROOT = next(
    (
        (meta, audio)
        for meta, audio in _layout_candidates
        if meta.exists() and (audio / "fold1").is_dir()
    ),
    _layout_candidates[0],
)

assert META_CSV.exists(), f"Metadata CSV not found: {META_CSV}"
assert AUDIO_ROOT.exists(), f"Audio folder not found: {AUDIO_ROOT}"

df = pd.read_csv(META_CSV)
df.head(), df.shape


In [None]:
# ===== Spectrogram parameters =====
# --- Audio front end ---
target_sr = 22050          # consider 16000 for speech-like workloads

# --- STFT / mel filterbank ---
n_fft = 2048
hop_length = 512
n_mels = 128
fmin = 0
fmax = None                # None -> sr/2

# --- Scaling ---
use_log = True             # log-mel (recommended)

# --- Output options ---
OUT_ROOT = Path("outputs", "spectrograms")
SAVE_PNG = True
SAVE_NPY = False           # set True if you also want numpy arrays
DPI = 120                  # lower -> smaller files, faster writing

# Create the output folder up front and show where it is.
OUT_ROOT.mkdir(parents=True, exist_ok=True)
print("Output root:", OUT_ROOT.resolve())


In [None]:
def wav_to_logmel(y: np.ndarray, sr: int) -> np.ndarray:
    """Convert a waveform into a (log-)mel spectrogram.

    Uses the module-level settings ``target_sr``, ``n_fft``, ``hop_length``,
    ``n_mels``, ``fmin``, ``fmax`` and ``use_log``.

    Args:
        y: Audio samples. Either 1-D (mono) or 2-D multi-channel, in
            ``(channels, samples)`` or ``(samples, channels)`` layout.
        sr: Sample rate of ``y`` in Hz.

    Returns:
        float32 array of shape ``(n_mels, T)``. When ``use_log`` is true the
        values are converted with ``librosa.power_to_db`` using the maximum
        as reference; otherwise the power mel spectrogram is returned.
    """
    # If multi-channel, downmix to mono.
    if y.ndim > 1:
        # soundfile.read returns (samples, channels) whereas librosa uses
        # (channels, samples). The channel axis is taken to be the shorter
        # one; on a tie axis 0 is used, matching the previous behaviour.
        channel_axis = 0 if y.shape[0] <= y.shape[1] else 1
        y = np.mean(y, axis=channel_axis)

    # Bring every clip to the common sample rate so spectrograms are comparable.
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    S = librosa.feature.melspectrogram(
        y=y, sr=sr,
        n_fft=n_fft, hop_length=hop_length,
        n_mels=n_mels, fmin=fmin, fmax=fmax,
        power=2.0
    )
    if use_log:
        S = librosa.power_to_db(S, ref=np.max)
    return S.astype(np.float32)

def save_spectrogram_png(S: np.ndarray, out_path: Path):
    """Render a spectrogram array to a PNG file without axes or ticks.

    Args:
        S: 2-D spectrogram array; row 0 is drawn at the bottom of the image.
        out_path: Destination file. Missing parent folders are created.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig = plt.figure(figsize=(3.0, 3.0), dpi=DPI)
    try:
        plt.axis('off')
        # Use imshow for speed; S already in dB if use_log
        plt.imshow(S, aspect='auto', origin='lower')
        plt.tight_layout(pad=0)
        plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when drawing or saving fails;
        # otherwise open figures pile up when the caller keeps going after
        # an error.
        plt.close(fig)

def save_spectrogram_npy(S: np.ndarray, out_path: Path):
    """Write the spectrogram array to ``out_path`` in NumPy ``.npy`` format.

    Missing parent folders of ``out_path`` are created first.
    """
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    np.save(out_path, S)


### Convert the dataset

By default this converts **all 8732 clips**.  
If you want a quick test first, set `MAX_FILES = 50` (or similar).


In [None]:
import soundfile as sf

MAX_FILES = None  # e.g., 50 for quick test; None for full dataset

errors = []      # (file name, reason) for every clip that could not be converted
processed = 0    # number of clips converted successfully

iter_df = df if MAX_FILES is None else df.iloc[:MAX_FILES]

# "class" is a reserved word in Python, so it cannot be read as an attribute
# of an itertuples() row (`row.class` does not even parse). Read the three
# needed columns by name instead.
rows = zip(iter_df["fold"], iter_df["class"], iter_df["slice_file_name"])

for fold, cls, fname in tqdm(rows, total=len(iter_df)):
    fold = int(fold)
    cls = str(cls)
    fname = str(fname)

    wav_path = AUDIO_ROOT / f"fold{fold}" / fname
    if not wav_path.exists():
        errors.append((fname, "missing_wav"))
        continue

    try:
        y, sr = sf.read(wav_path)
        # soundfile returns multi-channel audio as (frames, channels);
        # average across the channel axis to get a mono signal.
        if y.ndim > 1:
            y = y.mean(axis=1)
        S = wav_to_logmel(y, sr)

        base = wav_path.stem  # no .wav

        if SAVE_PNG:
            out_png = OUT_ROOT / f"fold{fold}" / cls / f"{base}.png"
            save_spectrogram_png(S, out_png)

        if SAVE_NPY:
            out_npy = OUT_ROOT / f"fold{fold}" / cls / f"{base}.npy"
            save_spectrogram_npy(S, out_npy)

        processed += 1

    except Exception as e:
        # Best-effort batch job: record the failure and keep converting the
        # remaining clips instead of aborting the whole run.
        errors.append((fname, repr(e)))

print(f"Processed: {processed} files")
print(f"Errors: {len(errors)}")
if errors:
    print("Sample errors:", errors[:5])


## 4) Quick Visual Sanity Check

This displays one spectrogram image (from the generated files).


In [None]:
from glob import glob
import matplotlib.image as mpimg

# Collect every generated PNG (fold folder / class folder / file), sorted by path.
png_pattern = str(OUT_ROOT / "fold*" / "*" / "*.png")
pngs = sorted(glob(png_pattern))
assert pngs, "No PNGs found. Did you run conversion with SAVE_PNG=True?"

# Show the file in the middle of the sorted list.
middle = len(pngs) // 2
sample = pngs[middle]
img = mpimg.imread(sample)

print("Sample:", sample)
plt.figure(figsize=(6, 4))
plt.imshow(img)
plt.axis("off")
plt.show()


## 5) Where the outputs are

- Spectrogram PNGs: `outputs/spectrograms/foldX/<class>/<clip>.png`
- (Optional) NPY arrays: same structure with `.npy`

Next step for your project:
- Build a PyTorch dataset that reads from this directory
- Use the **fold** directory as your split key for 10-fold cross-validation
