# 🎵 Spectral Affinity: Full GPU Optimization (Hearing + Brain)

This version is specialized for **Kaggle GPU T4**. It moves both the Feature Extraction (Hearing) and the Clustering (Brain) to the GPU, and uses high-speed shell commands for Zipping.

## What's Optimized:
- **Hearing (Speed)**: Uses `soundfile` for seekable loading and `torchaudio` for GPU math.
- **Brain (GPU)**: Uses **RAPIDS cuML KMeans** on the T4 GPU.
- **Storage (ZIP)**: Uses `zip -0` (Store mode) to skip slow compression logic (perfect for music files).
- **Organization**: Files are prefixed with their cluster letter (e.g., [A], [B], [C]) for easier navigation.

---

### 1. Verification of GPU

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
!pip install soundfile torchaudio

### 2. Imports

In [None]:
import os
import re
import glob
import shutil
import pathlib
import numpy as np
import soundfile as sf
import torch
import torchaudio
import torchaudio.transforms as T
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler

try:
    import cuml
    from cuml.cluster import KMeans as cuKMeans
    HAS_CUML = True
except ImportError:
    from sklearn.cluster import KMeans as skKMeans
    HAS_CUML = False

### 3. High-Speed Logic

In [None]:
# Export-tool prefixes removed from the start of a name (case-insensitive).
# They are applied one after another, in this order.
_FILENAME_PREFIXES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (r"^Slavic-", r"^Theme_OST-", r"^My_Workspace-", r"^audio-")
)
# A UUID at the very end of the name, with an optional single separator or "("
# in front of it and an optional ")" behind it.
_TRAILING_UUID = re.compile(
    r"[\(\.\-_\s]?[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}[\)]?$"
)
# An empty "()" / "( )" pair sitting at either end of the name.
_EMPTY_EDGE_PARENS = re.compile(r"^\(\s*\)|\(\s*\)$")


def _trim_name_edges(text):
    """Strip separators and dangling or empty parentheses from both ends of *text*.

    A parenthesis is only removed when it is unpaired (or the pair is empty),
    so a name such as "Track (Live)" keeps its closing parenthesis.
    """
    while True:
        trimmed = _EMPTY_EDGE_PARENS.sub("", text.strip(" -_"))
        opens, closes = trimmed.count("("), trimmed.count(")")
        if trimmed.startswith(")") or (trimmed.startswith("(") and opens > closes):
            trimmed = trimmed[1:]
        elif trimmed.endswith("(") or (trimmed.endswith(")") and closes > opens):
            trimmed = trimmed[:-1]
        if trimmed == text:
            return trimmed
        # Something was removed; another pass may expose more edge noise.
        text = trimmed


def clean_filename(filename):
    """Return a tidied version of *filename* with the extension preserved.

    The name part (everything before the last ".") has known tool prefixes and
    a trailing UUID removed, underscores turned into spaces, edge separators
    and dangling parentheses trimmed, and runs of whitespace collapsed.

    Args:
        filename: A bare file name (not a path).

    Returns:
        The cleaned file name. A name without any "." is returned unchanged;
        if nothing is left of the name part, "Unnamed" is used instead.
    """
    if '.' not in filename:
        return filename
    name_body, ext = filename.rsplit('.', 1)
    for prefix in _FILENAME_PREFIXES:
        name_body = prefix.sub("", name_body)
    name_body = _TRAILING_UUID.sub("", name_body)
    name_body = _trim_name_edges(name_body.replace("_", " "))
    name_body = re.sub(r"\s+", " ", name_body).strip()
    return f"{name_body or 'Unnamed'}.{ext}"

def get_fast_features(file_path, duration=30):
    """Extract a fixed-length feature vector from the middle of an audio file.

    Up to ``duration`` seconds are read from the centre of the file with
    ``soundfile``, down-mixed to mono, and turned into MFCCs with torchaudio on
    the module-level ``device``. Tempo is estimated with librosa.

    Args:
        file_path: Path of the audio file to analyse.
        duration: Length of the analysed excerpt in seconds (int or float).

    Returns:
        A 1-D NumPy array made of the per-coefficient MFCC means, the
        per-coefficient MFCC variances and the tempo (13 + 13 + 1 values for
        the configured ``n_mfcc=13``), or ``None`` if the file could not be
        processed. Failures are reported on stdout instead of being dropped
        silently, but never raised, so one bad file cannot abort a batch.
    """
    try:
        info = sf.info(file_path)
        sr, total_frames = info.samplerate, info.frames
        # Frame counts must be integers even when `duration` is a float.
        desired_f = int(duration * sr)
        # Centre the excerpt; files shorter than the excerpt start at frame 0.
        start_f = max(0, (total_frames - desired_f) // 2)

        y, _ = sf.read(file_path, frames=desired_f, start=start_f, dtype='float32')
        if y.ndim > 1:
            # Multi-channel audio: average the channels into one mono signal.
            y = y.mean(axis=1)

        waveform = torch.from_numpy(y).to(device).unsqueeze(0)
        mfcc_transform = T.MFCC(sample_rate=sr, n_mfcc=13,
                                melkwargs={"n_fft": 2048, "hop_length": 512, "n_mels": 64}).to(device)
        mfcc = mfcc_transform(waveform).squeeze(0).cpu().numpy()

        import librosa
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        # Depending on the librosa version, tempo may be a Python/NumPy scalar
        # or an array (0-d or 1-d); flatten it so every form yields one float.
        tempo_bpm = float(np.asarray(tempo).reshape(-1)[0])

        return np.hstack([
            np.mean(mfcc, axis=1), np.var(mfcc, axis=1),
            tempo_bpm
        ])
    except Exception as exc:
        # Best-effort by design: report the reason, then let the caller skip the file.
        print(f"[skip] {file_path}: {type(exc).__name__}: {exc}")
        return None

_CLUSTER_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"


def _cluster_tag(label):
    """Map a cluster label to a letter tag: 0->A ... 25->Z, 26->AA, 27->AB, ..."""
    index = int(label)
    if index < 0:
        # Negative labels keep the historical wrap-around mapping (e.g. -1 -> Z).
        return _CLUSTER_ALPHABET[index % len(_CLUSTER_ALPHABET)]
    tag = ""
    index += 1
    while index > 0:
        index, remainder = divmod(index - 1, len(_CLUSTER_ALPHABET))
        tag = _CLUSTER_ALPHABET[remainder] + tag
    return tag


def organize_files(file_paths, labels, output_dir, mode='copy', rename=False):
    """Place each file into a per-cluster folder under *output_dir*.

    Every file goes to ``<output_dir>/Cluster_<tag>/[<tag>] <name>``, where the
    tag is the letter form of its cluster label. If the destination already
    exists, " (1)", " (2)", ... is appended before the extension.

    Args:
        file_paths: Source file paths.
        labels: Cluster label for each path, in the same order.
        output_dir: Root folder for the organised copy; created if missing.
        mode: ``'copy'`` keeps the originals, ``'move'`` relocates them.
        rename: When true, file names are passed through ``clean_filename``
            before the cluster prefix is added.

    Raises:
        ValueError: If *mode* is neither ``'copy'`` nor ``'move'``.
    """
    if mode == 'copy':
        transfer = shutil.copy2
    elif mode == 'move':
        transfer = shutil.move
    else:
        raise ValueError(f"mode must be 'copy' or 'move', got {mode!r}")

    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    for file_path, label in zip(file_paths, labels):
        c_letter = _cluster_tag(label)

        cluster_dir = os.path.join(output_dir, f"Cluster_{c_letter}")
        pathlib.Path(cluster_dir).mkdir(exist_ok=True)

        filename = os.path.basename(file_path)
        if rename:
            filename = clean_filename(filename)

        # Prefix the file itself so the cluster stays visible outside its folder.
        filename = f"[{c_letter}] {filename}"

        name_pure, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        dest_path = os.path.join(cluster_dir, filename)
        counter = 1
        # Never overwrite: pick the first free "name (n).ext" variant.
        while os.path.exists(dest_path):
            new_filename = f"{name_pure} ({counter}).{ext}" if ext else f"{name_pure} ({counter})"
            dest_path = os.path.join(cluster_dir, new_filename)
            counter += 1
        transfer(file_path, dest_path)

### 4. Configuration and Execution

In [None]:
# --- CONFIGURATION ---
INPUT_DIR = "/kaggle/input/datasets/danieldobles/ost-songs"
OUTPUT_DIR = "/kaggle/working/organized_music"
N_CLUSTERS = 5
CLEAN_NAMES = True
# ---------------------

audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.ogg', '*.m4a']
# With recursive=True, "**" also matches zero directories, so a single
# recursive glob per extension covers files directly inside INPUT_DIR too.
# Sorting gives a reproducible processing order from run to run.
file_paths = sorted({
    path
    for ext in audio_extensions
    for path in glob.glob(os.path.join(INPUT_DIR, "**", ext), recursive=True)
})

if not file_paths:
    print(f"No audio files found in {INPUT_DIR}!")
else:
    print(f"Found {len(file_paths)} files. Starting GPU Extraction...")
    results = Parallel(n_jobs=4)(delayed(get_fast_features)(p) for p in file_paths)

    # Keep paths and features aligned while dropping files that failed extraction.
    valid_features = [r for r in results if r is not None]
    valid_paths = [p for r, p in zip(results, file_paths) if r is not None]
    skipped_count = len(file_paths) - len(valid_paths)
    if skipped_count:
        print(f"Skipped {skipped_count} file(s) that could not be analysed.")

    if not valid_features:
        print("No files could be analysed; nothing to cluster.")
    else:
        # KMeans cannot form more clusters than there are samples.
        n_clusters = min(N_CLUSTERS, len(valid_features))
        print(f"Clustering using {'RAPIDS GPU' if HAS_CUML else 'CPU'}...")
        scaler = StandardScaler()
        X = scaler.fit_transform(np.array(valid_features).astype('float32'))

        if HAS_CUML:
            labels = cuKMeans(n_clusters=n_clusters).fit_predict(X)
        else:
            labels = skKMeans(n_clusters=n_clusters, n_init=10).fit_predict(X)

        organize_files(valid_paths, labels, OUTPUT_DIR, mode='copy', rename=CLEAN_NAMES)
        print(f"Done! Created {n_clusters} clusters and organized results.")

### 5. Download Results (ZIP - High Speed Mode)

In [None]:
import os
if os.path.exists(OUTPUT_DIR):
    print("Zipping results (Using Store mode for maximum speed)...")
    # -0 skips compression (best for music), -r is recursive, -q is quiet
    !zip -0 -rq /kaggle/working/organized_results.zip organized_music
    print("Success! 'organized_results.zip' is ready in /kaggle/working/")