# 🎵 Spectral Affinity: High-Speed Parallel GPU Pipeline

This version uses **Parallel CPU Extraction** (Hearing) and **GPU Clustering** (Brain) for maximum performance on Kaggle T4.

---

### 1. Environment Setup

In [None]:
import torch
# Select the CUDA device when PyTorch reports one is available; otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# IPython shell escape (not plain Python): installs librosa, tqdm and joblib,
# which the next cell imports, plus soundfile.
!pip install librosa soundfile tqdm joblib

### 2. Imports

In [None]:
import os
import re
import glob
import shutil
import pathlib
import warnings
import librosa
import numpy as np
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Optional GPU backend: use cuML's KMeans when the package is importable,
# otherwise fall back to scikit-learn's KMeans. Only one of the two aliases
# (cuKMeans or skKMeans) ends up defined; HAS_CUML records which one.
try:
    import cuml
    from cuml.cluster import KMeans as cuKMeans
    HAS_CUML = True
except ImportError:
    from sklearn.cluster import KMeans as skKMeans
    HAS_CUML = False

### 3. Logic

In [None]:
def _trim_name_edges(name):
    """Strip separators and unbalanced parentheses from both ends of *name*."""
    while True:
        trimmed = name.strip(" -")
        opens, closes = trimmed.count("("), trimmed.count(")")
        # Only drop a parenthesis at an edge when it cannot belong to a
        # balanced pair, so "Song (Remix)" keeps its closing bracket.
        if trimmed.endswith("(") or (trimmed.endswith(")") and closes > opens):
            trimmed = trimmed[:-1]
        elif trimmed.startswith(")") or (trimmed.startswith("(") and opens > closes):
            trimmed = trimmed[1:]
        if trimmed == name:
            return trimmed
        name = trimmed


def clean_filename(filename):
    """Return a tidier version of *filename*, keeping its extension.

    The part before the last dot is cleaned by removing known export
    prefixes (case-insensitively), removing a trailing UUID together with
    one adjacent separator/bracket, turning underscores into spaces,
    trimming separators and dangling parentheses from the ends, and
    collapsing runs of whitespace.

    Args:
        filename: Base name of a file (no directory part expected).

    Returns:
        The cleaned name followed by the original extension. If nothing is
        left of the name, ``"Unnamed.<ext>"`` is returned. A filename that
        contains no dot is returned unchanged.
    """
    if '.' not in filename:
        return filename
    name_body, ext = filename.rsplit('.', 1)

    # Each prefix is anchored at the start and removed in this order, so
    # stacked prefixes are all stripped only when they appear in list order.
    for prefix in (r"^Slavic-", r"^Theme_OST-", r"^My_Workspace-", r"^audio-"):
        name_body = re.sub(prefix, "", name_body, flags=re.IGNORECASE)

    # Trailing UUID, optionally preceded by one separator or "(" and
    # optionally followed by ")".
    uuid_pattern = (
        r"[\(\.\-_\s]?"
        r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
        r"[\)]?$"
    )
    name_body = re.sub(uuid_pattern, "", name_body)

    name_body = _trim_name_edges(name_body.replace("_", " "))
    name_body = re.sub(r"\s+", " ", name_body).strip()
    return f"{name_body or 'Unnamed'}.{ext}"

def get_features(file_path, duration=30):
    """Extract a fixed-length feature vector from the middle of an audio file.

    Up to ``duration`` seconds are loaded at 22,050 Hz, starting at an offset
    chosen so the excerpt is centred in the track. The returned vector
    concatenates the per-coefficient mean and variance of 13 MFCCs, the mean
    spectral centroid, and the tempo estimated by ``librosa.beat.beat_track``.

    Args:
        file_path: Path of the audio file to analyse.
        duration: Maximum excerpt length in seconds.

    Returns:
        A 1-D NumPy array of features, or ``None`` when the loaded signal is
        empty or any step raises. Failures are deliberately swallowed so a
        single unreadable file does not abort a batch run.
    """
    try:
        total_dur = librosa.get_duration(path=file_path)
        # Start half of the surplus length into the track; tracks shorter
        # than `duration` start at 0.
        offset = max(0, (total_dur - duration) // 2)
        y, sr = librosa.load(file_path, sr=22050, duration=duration, offset=offset)
        if len(y) == 0:
            return None

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        tempo = librosa.beat.beat_track(y=y, sr=sr)[0]
        # The tempo may come back as a Python number, a NumPy scalar, a 0-d
        # array or a 1-element array depending on the librosa version;
        # flattening handles all of them uniformly.
        tempo_value = float(np.asarray(tempo).reshape(-1)[0])

        return np.hstack([
            np.mean(mfcc, axis=1),
            np.var(mfcc, axis=1),
            np.mean(centroid),
            tempo_value,
        ])
    except Exception:
        # Best-effort extraction: callers treat None as "skip this file".
        return None

def organize_files(file_paths, labels, output_dir, mode='copy', rename=False):
    """Place each file into a per-cluster folder under ``output_dir``.

    Every file goes to ``<output_dir>/Cluster_<letter>/`` and its name is
    prefixed with ``[<letter>] ``. The letter is picked from A-Z by
    ``label % 26``, so labels that differ by a multiple of 26 share a folder.
    If the destination name already exists, `` (1)``, `` (2)``, ... is
    appended before the extension until a free name is found.

    Args:
        file_paths: Source file paths.
        labels: Integer cluster labels, index-aligned with ``file_paths``.
        output_dir: Root directory for the cluster folders; created if missing.
        mode: ``'move'`` relocates the source files; any other value
            (including the default ``'copy'``) copies them with
            ``shutil.copy2``.
        rename: When true, the base name is passed through ``clean_filename``
            before the cluster prefix is added.
    """
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    cluster_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Honour `mode` instead of always copying.
    transfer = shutil.move if mode == 'move' else shutil.copy2

    for i, file_path in enumerate(tqdm(file_paths, desc="üìÅ Organizing", leave=True)):
        # int() accepts NumPy integers and other scalar label types alike.
        label = int(labels[i])
        c_letter = cluster_letters[label % len(cluster_letters)]
        cluster_dir = os.path.join(output_dir, f"Cluster_{c_letter}")
        pathlib.Path(cluster_dir).mkdir(exist_ok=True)

        filename = os.path.basename(file_path)
        if rename:
            filename = clean_filename(filename)
        filename = f"[{c_letter}] {filename}"

        name_pure, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
        dest_path = os.path.join(cluster_dir, filename)
        counter = 1
        # Never overwrite: probe numbered variants until one is unused.
        while os.path.exists(dest_path):
            new_filename = f"{name_pure} ({counter}).{ext}" if ext else f"{name_pure} ({counter})"
            dest_path = os.path.join(cluster_dir, new_filename)
            counter += 1
        transfer(file_path, dest_path)

### 4. Pipeline Execution

In [None]:
# --- CONFIGURATION ---
INPUT_DIR = "/kaggle/input/datasets/danieldobles/ost-songs"
OUTPUT_DIR = "/kaggle/working/organized_music"
N_CLUSTERS = 5
CLEAN_NAMES = True
NUM_CORES = 4 # Kaggle T4 typically has 4 CPU cores
# ---------------------

print("üéµ --- SPECTRAL AFFINITY PIPELINE ---")

print("üîç Step 1/3: Mapping library...")
audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.ogg', '*.m4a']
file_paths = []
for ext in audio_extensions:
    # With recursive=True, "**" also matches zero directories, so this single
    # pattern covers files directly inside INPUT_DIR as well as nested ones.
    file_paths.extend(glob.glob(os.path.join(INPUT_DIR, "**", ext), recursive=True))
# De-duplicate and sort so the processing order (and therefore the numbering
# of name collisions in the output) is the same on every run.
file_paths = sorted(set(file_paths))
print(f"‚úÖ Done: Found {len(file_paths)} files.")

if not file_paths:
    print(f"‚ùå ERROR: Nothing found!")
else:
    print(f"\nüß† Step 2/3: Feature Extraction (Parallel CPU - {NUM_CORES} cores)...")
    # Parallel execution with visible progress bar
    results = Parallel(n_jobs=NUM_CORES)(
        delayed(get_features)(p) for p in tqdm(file_paths, desc="üéß Hearing", leave=True)
    )

    # Keep paths and feature vectors aligned while dropping failed files.
    processed = [(p, r) for p, r in zip(file_paths, results) if r is not None]
    valid_paths = [p for p, _ in processed]
    valid_features = [r for _, r in processed]
    print(f"‚úÖ Done: Successully processed {len(valid_features)} files.")

    if valid_features:
        print(f"\nü§ñ Step 3/3: Clustering & Organizing {'(GPU)' if HAS_CUML else '(CPU)'}...")
        # KMeans cannot fit more clusters than there are samples, so cap the
        # requested count for very small libraries.
        n_clusters = min(N_CLUSTERS, len(valid_features))
        if n_clusters < N_CLUSTERS:
            print(f"Only {len(valid_features)} usable files; using {n_clusters} clusters instead of {N_CLUSTERS}.")

        scaler = StandardScaler()
        X = scaler.fit_transform(np.array(valid_features).astype('float32'))
        if HAS_CUML:
            labels = cuKMeans(n_clusters=n_clusters).fit_predict(X)
        else:
            labels = skKMeans(n_clusters=n_clusters, n_init=10).fit_predict(X)

        organize_files(valid_paths, labels, OUTPUT_DIR, mode='copy', rename=CLEAN_NAMES)
        print(f"\n‚ú® SUCCESS: Your files are in {OUTPUT_DIR}")
    else:
        print("‚ùå ERROR: Processing failed.")

### 5. Final Download

In [None]:
# Package the organized output into one archive, but only if the output
# folder was actually created by the pipeline cell.
if os.path.exists(OUTPUT_DIR):
    print("üì¶ Packaging results (ZIP)...")
    # IPython shell escape (not plain Python). zip flags: -0 store without
    # compression, -r recurse into directories, -q quiet.
    # NOTE(review): "organized_music" is a relative path, so this presumably
    # relies on the working directory being /kaggle/working — confirm.
    !zip -0 -rq /kaggle/working/organized_results.zip organized_music
    print(f"‚úÖ Done! File ready at /kaggle/working/organized_results.zip")