In [None]:
!pip install torch torchaudio transformers librosa matplotlib numpy --quiet

In [None]:
# ============================================================
# 🎵 Music Emotion Extraction Simulation using MERT Embeddings
# ============================================================

# --- 1️⃣ Install dependencies (run once in Colab)
# !pip install torch torchaudio transformers librosa matplotlib numpy --quiet

import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoProcessor, AutoModel

# ------------------------------------------------------------
# STEP 1: Load audio file
# ------------------------------------------------------------
# Path of the track to analyse.
AUDIO_PATH = "/content/Pharrell Williams - Happy (Official Video).mp3"

# Decode the file: yields the waveform tensor and its native sampling rate.
waveform, sr = torchaudio.load(AUDIO_PATH)

# Downmix to mono (stereo → mono) by averaging over the channel axis (dim 0).
waveform = torch.mean(waveform, dim=0)

# After the downmix the tensor is 1-D, so its length divided by the rate is the duration.
print(f"‚úÖ Audio loaded: {AUDIO_PATH}, Duration = {waveform.size(0) / sr:.1f}s @ {sr}Hz")

# ------------------------------------------------------------
# STEP 2: Resample to 24000 Hz (required by MERT)
# ------------------------------------------------------------
target_sr = 24000
if sr != target_sr:
    # Only touch the signal when the file's native rate differs from the target.
    waveform = torchaudio.functional.resample(waveform, sr, target_sr)
    sr = target_sr
# Printed unconditionally: reports the rate the pipeline continues with,
# whether or not an actual resampling took place.
print(f"‚úÖ Audio resampled to {sr}Hz")

# ------------------------------------------------------------
# STEP 3: Load pretrained MERT model and processor
# ------------------------------------------------------------
# Single source of truth for the checkpoint so processor and model cannot
# drift apart.
MERT_CHECKPOINT = "m-a-p/MERT-v1-330M"

# MERT's model/config classes live as custom code in the Hub repository, not
# inside `transformers`, so the Auto* loaders need `trust_remote_code=True`;
# without it loading fails (or blocks on an interactive confirmation).
# NOTE: this executes Python code downloaded from the Hub repository - only
# use it with checkpoints you trust.
processor = AutoProcessor.from_pretrained(MERT_CHECKPOINT, trust_remote_code=True)
model = AutoModel.from_pretrained(MERT_CHECKPOINT, trust_remote_code=True)

# Switch to evaluation mode: the model is used for inference only.
model.eval()
print("‚úÖ MERT model loaded")

# ------------------------------------------------------------
# STEP 4: Prepare model input
# ------------------------------------------------------------
# The processor takes the mono waveform through its `raw_speech` argument,
# together with the sampling rate, and is asked for PyTorch tensors ("pt").
inputs = processor(
    raw_speech=waveform,
    sampling_rate=sr,
    return_tensors="pt",
)

# ------------------------------------------------------------
# STEP 5: Extract high-level audio embedding
# ------------------------------------------------------------
with torch.no_grad():  # inference only, no gradient bookkeeping needed
    outputs = model(output_hidden_states=True, **inputs)
    # Keep only the final entry of the returned hidden states.
    hidden_states = outputs.hidden_states[-1]
    # Mean-pool over dim 1 (time) to get one vector per song, drop the
    # remaining singleton dimensions and hand the result over to NumPy.
    emb = torch.mean(hidden_states, dim=1).squeeze().numpy()

print("üîπ Embedding shape:", emb.shape)

# ------------------------------------------------------------
# STEP 6: Normalize embedding vector
# ------------------------------------------------------------
# Scale to unit L2 norm so that a dot product with another unit vector is
# directly their cosine similarity.
emb = np.divide(emb, np.linalg.norm(emb))

# ------------------------------------------------------------
# STEP 7: Simulate “emotion prototypes”
# ------------------------------------------------------------
# NOTE: the prototypes below are RANDOM VECTORS, not real emotional
# representations. A real system would use learned emotion centroids
# (e.g., trained on DEAM).

np.random.seed(0)  # fixed seed so the simulated prototypes are reproducible
labels = ["Happy", "Sad", "Calm", "Energetic", "Tense", "Romantic"]

# Draw one random vector per emotion (same length as the embedding) and scale
# it to unit length. Normalising consumes no random numbers, so the draws come
# out of the generator in the same order as drawing all vectors first.
protos = {}
for label in labels:
    direction = np.random.randn(emb.size)
    protos[label] = direction / np.linalg.norm(direction)

# Dot product of the embedding with each unit-length prototype
# (cosine similarity, given that `emb` was normalised beforehand).
sims = {label: np.dot(emb, proto) for label, proto in protos.items()}

# Softmax over the similarities turns them into pseudo-probabilities.
probs = np.exp([sims[label] for label in labels])
probs = probs / probs.sum()
dist = {label: probs[i] for i, label in enumerate(labels)}

# ------------------------------------------------------------
# STEP 8: Visualize simulated emotion distribution
# ------------------------------------------------------------
# Materialise names and values as plain lists (in dict order) for plotting.
emotion_names = list(dist)
emotion_probs = [dist[name] for name in emotion_names]

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(emotion_names, emotion_probs, color="orchid")
ax.set_title("Estimated Emotion Distribution (Simulated MERT model)")
ax.set_ylabel("Probability")
ax.grid(axis="y", alpha=0.3)  # faint horizontal guide lines only
plt.show()

# Report the label with the highest pseudo-probability (first one wins ties).
top_emotion, _ = max(dist.items(), key=lambda item: item[1])
print(f"üé∂ Simulated dominant emotion: {top_emotion}")


Rezultatul nu are semnificație psihologică sau emoțională reală.

“Emoțiile” sunt aleatoare, generate doar pentru a ilustra procesul de comparare.

Pentru detecție reală a emoțiilor, este nevoie de:

date etichetate (ex: DEAM),

și antrenarea unui model de regresie sau clasificare pe acele etichete.