In [None]:
import librosa
import numpy as np
import torch
from transformers import ClapModel, ClapProcessor
import warnings
warnings.filterwarnings('ignore')

class EmotionalArcExtractor:
    """Extract static and temporal ("emotional arc") features from audio.

    Static features are a CLAP audio embedding plus a few whole-clip librosa
    descriptors. Temporal features split the clip into equal windows, track
    simple proxies over time (spectral centroid for valence, RMS energy for
    arousal, spectral rolloff) and label the resulting trajectory shape.
    """

    # Checkpoint used for both the model and its processor.
    CLAP_CHECKPOINT = "laion/larger_clap_music_and_speech"
    # Sample rate the audio is resampled to before it is handed to CLAP.
    CLAP_SAMPLE_RATE = 48000

    def __init__(self):
        """Load the CLAP model and processor, moving the model to GPU if available."""
        print("⏳ Loading CLAP model (this takes ~30 seconds)...")
        self.clap_model = ClapModel.from_pretrained(self.CLAP_CHECKPOINT)
        self.clap_processor = ClapProcessor.from_pretrained(self.CLAP_CHECKPOINT)

        # Remember the chosen device so inputs are always sent to the same
        # place as the model, instead of re-querying CUDA on every call.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.clap_model = self.clap_model.to(self.device)
        if self.device.type == 'cuda':
            print("✅ CLAP loaded on GPU")
        else:
            print("✅ CLAP loaded on CPU")

    def extract_static_features(self, audio, sr):
        """Extract the CLAP embedding and whole-clip audio descriptors.

        Args:
            audio: Audio samples as accepted by librosa (presumably a mono
                1-D float array; stereo input is not handled here).
            sr: Sample rate of ``audio`` in Hz.

        Returns:
            dict with keys ``clap_embedding`` (NumPy vector, first row of the
            model output), ``tempo``, ``spectral_centroid_mean``, ``rms_mean``
            and ``zcr_mean`` (all Python floats).
        """
        # 1. CLAP embedding: the audio is resampled to 48 kHz first.
        if sr != self.CLAP_SAMPLE_RATE:
            audio_48k = librosa.resample(
                audio, orig_sr=sr, target_sr=self.CLAP_SAMPLE_RATE
            )
        else:
            audio_48k = audio

        inputs = self.clap_processor(
            audios=audio_48k,
            sampling_rate=self.CLAP_SAMPLE_RATE,
            return_tensors="pt"
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            embedding = self.clap_model.get_audio_features(**inputs)
        # Single clip in the batch -> keep only its row.
        embedding = embedding.cpu().numpy()[0]

        # 2. Basic audio features, computed at the original sample rate.
        tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
        # Depending on the librosa version, tempo is a scalar or a 1-element
        # array; float() on the latter is deprecated in NumPy, so unwrap it.
        tempo = float(np.atleast_1d(tempo).ravel()[0])
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
        rms = np.mean(librosa.feature.rms(y=audio))
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))

        return {
            'clap_embedding': embedding,
            'tempo': tempo,
            'spectral_centroid_mean': float(spectral_centroid),
            'rms_mean': float(rms),
            'zcr_mean': float(zcr)
        }

    @staticmethod
    def _normalize(values):
        """Min-max scale ``values`` to [0, 1]; a flat series maps to all 0.5."""
        arr = np.array(values)
        lo, hi = arr.min(), arr.max()
        if hi == lo:
            # No spread to scale by; use the midpoint instead of dividing by zero.
            return [0.5] * len(arr)
        return ((arr - lo) / (hi - lo)).tolist()

    def extract_temporal_features(self, audio, sr, n_windows=10):
        """Compute per-window trajectories and classify the emotional arc.

        The clip is cut into ``n_windows`` equal, non-overlapping windows of
        ``len(audio) // n_windows`` samples; any trailing remainder is ignored.

        Args:
            audio: Audio samples (sliceable, with ``len``).
            sr: Sample rate of ``audio`` in Hz.
            n_windows: Number of windows to split the clip into.

        Returns:
            dict with the normalized ``valence_trajectory``,
            ``arousal_trajectory`` and ``spectral_rolloff_trajectory`` lists,
            the ``arc_type`` label, and the slope/variance metrics produced by
            ``classify_arc``.

        Raises:
            ValueError: If ``n_windows`` is less than 1 or ``audio`` has fewer
                samples than ``n_windows`` (windows would be empty).
        """
        if n_windows < 1:
            raise ValueError(f"n_windows must be >= 1, got {n_windows}")

        window_length = len(audio) // n_windows
        if window_length == 0:
            raise ValueError(
                f"audio has {len(audio)} samples, too short to split into "
                f"{n_windows} windows"
            )

        valence_trajectory = []
        arousal_trajectory = []
        spectral_rolloff_trajectory = []

        for i in range(n_windows):
            start = i * window_length
            window = audio[start:start + window_length]

            # Valence proxy: spectral centroid (brightness)
            centroid = np.mean(librosa.feature.spectral_centroid(y=window, sr=sr))
            valence_trajectory.append(float(centroid))

            # Arousal proxy: RMS energy
            energy = np.mean(librosa.feature.rms(y=window))
            arousal_trajectory.append(float(energy))

            # Spectral rolloff
            rolloff = np.mean(librosa.feature.spectral_rolloff(y=window, sr=sr))
            spectral_rolloff_trajectory.append(float(rolloff))

        # Normalize per track so the arc thresholds are scale-independent.
        valence_norm = self._normalize(valence_trajectory)
        arousal_norm = self._normalize(arousal_trajectory)

        arc_type, arc_metrics = self.classify_arc(valence_norm, arousal_norm)

        return {
            'valence_trajectory': valence_norm,
            'arousal_trajectory': arousal_norm,
            'spectral_rolloff_trajectory': self._normalize(spectral_rolloff_trajectory),
            'arc_type': arc_type,
            'arc_slope_valence': arc_metrics['valence_slope'],
            'arc_slope_arousal': arc_metrics['arousal_slope'],
            'arc_variance_valence': arc_metrics['valence_var'],
            'arc_variance_arousal': arc_metrics['arousal_var']
        }

    @staticmethod
    def _slope(x, values):
        """Return the least-squares linear slope of ``values`` over ``x``."""
        if len(values) < 2:
            # A line fit through a single point is undefined; treat as flat.
            return 0.0
        return float(np.polyfit(x, values, 1)[0])

    def classify_arc(self, valence, arousal):
        """Label a trajectory pair with an arc type and return its metrics.

        Rules are checked in order and the first match wins:
        "Builder" (rising arousal, low arousal variance), "Fader" (falling
        arousal, low arousal variance), "Rollercoaster" (high valence
        variance), "Steady" (low variance in both), "Explosive" (arousal peak
        in the second half above mean + one standard deviation), otherwise
        "Dynamic".

        Args:
            valence: Sequence of valence values (normalized by the caller in
                ``extract_temporal_features``).
            arousal: Sequence of arousal values, same length as ``valence``.

        Returns:
            ``(arc_type, metrics)`` where ``metrics`` holds ``arousal_slope``,
            ``valence_slope``, ``arousal_var`` and ``valence_var`` as floats.

        Raises:
            ValueError: If either trajectory is empty.
        """
        if len(valence) == 0 or len(arousal) == 0:
            raise ValueError("valence and arousal trajectories must be non-empty")

        # Slopes are per-window-index, fitted with a degree-1 polynomial.
        x = np.arange(len(arousal))
        arousal_slope = self._slope(x, arousal)
        valence_slope = self._slope(x, valence)

        arousal_var = np.var(arousal)
        valence_var = np.var(valence)

        # Spike: the second half peaks more than one std above the overall mean.
        latter_half_arousal = arousal[len(arousal)//2:]
        has_spike = max(latter_half_arousal) > (np.mean(arousal) + np.std(arousal))

        if arousal_slope > 0.05 and arousal_var < 0.1:
            arc_type = "Builder"
        elif arousal_slope < -0.05 and arousal_var < 0.1:
            arc_type = "Fader"
        elif valence_var > 0.15:
            arc_type = "Rollercoaster"
        elif arousal_var < 0.05 and valence_var < 0.05:
            arc_type = "Steady"
        elif has_spike:
            arc_type = "Explosive"
        else:
            arc_type = "Dynamic"

        metrics = {
            'arousal_slope': float(arousal_slope),
            'valence_slope': float(valence_slope),
            'arousal_var': float(arousal_var),
            'valence_var': float(valence_var)
        }

        return arc_type, metrics

    def process_track(self, audio, sr):
        """Run static and temporal extraction and merge both result dicts.

        Args:
            audio: Audio samples for one track.
            sr: Sample rate of ``audio`` in Hz.

        Returns:
            A single dict containing every key from
            ``extract_static_features`` and ``extract_temporal_features``
            (temporal keys win on a name clash).
        """
        static = self.extract_static_features(audio, sr)
        temporal = self.extract_temporal_features(audio, sr)

        return {**static, **temporal}

# Initialize extractor
# Constructing EmotionalArcExtractor loads the CLAP model and processor, so
# this is the slow one-time setup step. The resulting module-level
# `extractor` is what later code calls process_track() on.
print("Initializing EmotionalArcExtractor...")
extractor = EmotionalArcExtractor()
print("Extractor ready to process tracks")

In [None]:
# Smoke test: run the full extraction pipeline on track ID 2 and print a report.
print("Testing feature extraction on Track ID 2...")

# Load at most 30 seconds of the clip, resampled to 22.05 kHz.
# NOTE(review): `loader` and `tracks` are defined outside this section;
# presumably a dataset loader and its metadata table - confirm.
y, sr = librosa.load(loader.get_path(2), sr=22050, duration=30)
print(f"Loaded: {len(y)/sr:.1f} seconds at {sr} Hz")

# Static + temporal features in one merged dict
print("\n⏳ Extracting features...")
features = extractor.process_track(y, sr)

# Report header
print("\n" + 60 * "=")
print("EXTRACTED FEATURES:")
print(60 * "=")
print(f"\n Track: {tracks.loc[2, ('track', 'title')]}")
print(f"Genre: {tracks.loc[2, ('track', 'genre_top')]}")

# Whole-clip descriptors
print("\n STATIC FEATURES:")
print(f"  • CLAP embedding shape: {features['clap_embedding'].shape}")
print(f"  • Tempo: {features['tempo']:.1f} BPM")
print(f"  • Spectral Centroid: {features['spectral_centroid_mean']:.1f} Hz")
print(f"  • RMS Energy: {features['rms_mean']:.4f}")

# Arc label and the slope/variance metrics behind it
print("\n TEMPORAL FEATURES (THE NOVEL PART):")
print(f"  • Arc Type: {features['arc_type']}")
print(f"  • Arousal Slope: {features['arc_slope_arousal']:.4f}")
print(f"  • Valence Slope: {features['arc_slope_valence']:.4f}")
print(f"  • Arousal Variance: {features['arc_variance_arousal']:.4f}")
print(f"  • Valence Variance: {features['arc_variance_valence']:.4f}")

# Per-window normalized trajectories, printed as an arrow-separated chain
print("\n VALENCE TRAJECTORY (10 windows):")
print("  " + " → ".join(f"{v:.2f}" for v in features['valence_trajectory']))

print("\n AROUSAL TRAJECTORY (10 windows):")
print("  " + " → ".join(f"{v:.2f}" for v in features['arousal_trajectory']))

print("\n" + 60 * "=")
print(" FEATURE EXTRACTION WORKING PERFECTLY")
print(60 * "=")