# 🎵 Spectral Affinity: Automated Audio Grouping

This notebook automates the grouping of audio files based on their mathematical features (tempo, timbre, and brightness) using **Librosa** and **Scikit-learn**.

## How it works:
1. **Extraction**: It loads the center 30 seconds of each song.
2. **Fingerprinting**: It extracts MFCCs, Spectral Centroid, and Tempo.
3. **Clustering**: It uses KMeans to group similar songs.
4. **Organization**: It copies the files into organized folders.

---

### 1. Install Dependencies

In [None]:
!pip install librosa scikit-learn numpy soundfile joblib

### 2. Imports

In [None]:
import os
import re
import glob
import time
import shutil
import pathlib
import librosa
import numpy as np
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

### 3. Core Logic

In [None]:
def clean_filename(filename):
    """
    Return a tidied version of an audio filename.

    Only the part before the last dot is touched: known leading tags and a
    trailing ``-<UUID>`` are stripped, underscores become spaces, and runs of
    whitespace collapse to a single space. The extension is re-attached
    unchanged. A name containing no dot at all is returned as-is.
    """
    stem, dot, extension = filename.rpartition('.')
    if not dot:
        return filename

    # Tags are tried once each, in this order, anchored at the start of the
    # stem; the order matters when a name carries more than one tag.
    leading_tags = (r"^Slavic-", r"^Theme_OST-", r"^My_Workspace-", r"^audio-")
    for tag in leading_tags:
        stem = re.sub(tag, "", stem, flags=re.IGNORECASE)

    # Drop a dash followed by an 8-4-4-4-12 hex UUID at the very end.
    trailing_uuid = r"-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
    stem = re.sub(trailing_uuid, "", stem)

    # Underscores act as word separators; normalise all spacing afterwards.
    stem = re.sub(r"\s+", " ", stem.replace("_", " ")).strip()

    return stem + "." + extension

def extract_features(file_path, duration=30):
    """
    Build a feature vector from the centre of an audio file.

    Loads up to ``duration`` seconds, centred in the track and resampled to
    22050 Hz, then concatenates: the mean and variance of 13 MFCCs, the mean
    and variance of the spectral centroid, and the tempo estimate returned by
    ``librosa.beat.beat_track``.

    Args:
        file_path: Path to the audio file.
        duration: Length in seconds of the centred window to analyse.

    Returns:
        A 1-D numpy array of features, or None when the file yields no
        samples or any step fails. Failures are reported on stdout so that
        skipped files are visible instead of silently disappearing.
    """
    try:
        total_duration = librosa.get_duration(path=file_path)
        # Centre the window; tracks shorter than `duration` start at 0.
        start_time = max(0, (total_duration - duration) // 2)
        y, sr = librosa.load(file_path, sr=22050, duration=duration, offset=start_time)

        if len(y) == 0:
            return None

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        # Compute the centroid once and reuse it for both statistics.
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        tempo = librosa.beat.beat_track(y=y, sr=sr)[0]

        return np.hstack([
            np.mean(mfcc, axis=1), np.var(mfcc, axis=1),
            np.mean(centroid),
            np.var(centroid),
            tempo,
        ])
    except Exception as e:
        # Best-effort per file: one unreadable track must not abort the whole
        # batch, but the reason is reported rather than swallowed.
        print(f"Skipping {file_path}: {type(e).__name__}: {e}")
        return None

def organize_files(file_paths, labels, output_dir, mode='copy', rename=False):
    """
    Place each file into ``<output_dir>/Cluster_<label>/``.

    Args:
        file_paths: Source file paths.
        labels: Cluster label for each entry of ``file_paths`` (same order).
        output_dir: Root directory for the cluster folders; created if missing.
        mode: ``'copy'`` copies with ``shutil.copy2``; any other value moves
            the file with ``shutil.move``.
        rename: When true, the destination name is passed through
            ``clean_filename`` first.

    Two sources that would land on the same destination within one call
    (same basename from different folders, or names that become identical
    after cleaning) no longer overwrite each other: later arrivals get a
    `` (1)``, `` (2)``, ... suffix before the extension. Files left over from
    an earlier run are still overwritten, so re-running stays idempotent.
    """
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    transfer = shutil.copy2 if mode == 'copy' else shutil.move
    claimed = set()  # destinations already assigned during this call

    for i, file_path in enumerate(file_paths):
        label = labels[i]
        cluster_dir = os.path.join(output_dir, f"Cluster_{label}")
        pathlib.Path(cluster_dir).mkdir(exist_ok=True)

        filename = os.path.basename(file_path)
        if rename:
            filename = clean_filename(filename)

        dest_path = os.path.join(cluster_dir, filename)
        if dest_path in claimed:
            # Disambiguate instead of silently clobbering an earlier file.
            stem, ext = os.path.splitext(filename)
            counter = 1
            while dest_path in claimed:
                dest_path = os.path.join(cluster_dir, f"{stem} ({counter}){ext}")
                counter += 1
        claimed.add(dest_path)

        transfer(file_path, dest_path)

### 4. Configuration and Execution

Adjust the paths below. In Kaggle, datasets are usually in `../input/dataset-name`.

In [None]:
# --- CONFIGURATION ---
INPUT_DIR = "/kaggle/input/datasets/danieldobles/ost-songs" # Path updated to your dataset
OUTPUT_DIR = "./organized_music"
N_CLUSTERS = 5
CLEAN_NAMES = True
# ---------------------

audio_extensions = ['*.mp3', '*.wav', '*.flac', '*.ogg', '*.m4a']
file_paths = []
for ext in audio_extensions:
    # With recursive=True, "**" also matches zero directories, so this one
    # pattern finds files directly in INPUT_DIR as well as nested ones.
    file_paths.extend(glob.glob(os.path.join(INPUT_DIR, "**", ext), recursive=True))

# De-duplicate and sort: a stable row order is needed for the fixed
# random_state below to give the same cluster assignment on every run.
file_paths = sorted(set(file_paths))

if not file_paths:
    print(f"No audio files found in {INPUT_DIR}!")
else:
    print(f"Found {len(file_paths)} files. Starting extraction...")

    # Parallel Extraction (one task per file, all available cores)
    results = Parallel(n_jobs=-1)(delayed(extract_features)(p) for p in file_paths)

    # Keep only files whose extraction succeeded, with paths aligned to features.
    valid_features = [r for r in results if r is not None]
    valid_paths = [p for r, p in zip(results, file_paths) if r is not None]

    skipped = len(file_paths) - len(valid_paths)
    if skipped:
        print(f"Skipped {skipped} file(s) whose features could not be extracted.")

    if valid_features:
        # KMeans needs at least as many samples as clusters.
        n_clusters = min(N_CLUSTERS, len(valid_features))
        if n_clusters < N_CLUSTERS:
            print(f"Only {len(valid_features)} usable file(s); using {n_clusters} cluster(s) instead of {N_CLUSTERS}.")

        # Clustering (features are standardized so no single one dominates the distance)
        scaler = StandardScaler()
        scaled = scaler.fit_transform(np.array(valid_features))
        # n_init is pinned because its default differs between scikit-learn versions.
        labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(scaled)

        # Organization
        organize_files(valid_paths, labels, OUTPUT_DIR, mode='copy', rename=CLEAN_NAMES)
        print(f"Done! Organized into {n_clusters} clusters in {OUTPUT_DIR}")
    else:
        print("Failed to extract features from any file.")

### 5. Download Results (ZIP)

Kaggle doesn't allow easy folder downloads. Run this cell to compress your results into a single ZIP file.

In [None]:
# Compress the organized folder into organized_results.zip in the current
# working directory; bail out with a hint when nothing has been organized yet.
if not os.path.exists(OUTPUT_DIR):
    print("Organized directory not found. Please run the previous cell first.")
else:
    shutil.make_archive(base_name='organized_results', format='zip', root_dir=OUTPUT_DIR)
    print("Success! 'organized_results.zip' is ready for download in the output folder.")