<a href="https://colab.research.google.com/github/Aadil404/Music-Emotion-Recognition/blob/main/notebooks/02_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Navigate to your project directory (adjust the path if needed)
%cd /content/drive/MyDrive/MER(final-year-project)/

# 3. Install necessary libraries
!pip install librosa tensorflow pandas scikit-learn matplotlib seaborn

Mounted at /content/drive
/content/drive/MyDrive/MER(final-year-project)


In [2]:
import pandas as pd
import numpy as np
import librosa
from tqdm import tqdm
import os

# --- 1. Load the Cleaned Multi-Label Metadata ---
df = pd.read_csv('data/emotify_dataset/cleaned_metadata_weighted_probabilities.csv')
emotion_columns = df.columns[1:] # All columns except 'songs_path'

# --- 2. Feature Extraction (Audio -> Spectrogram) with Global Normalization ---
def create_mel_spectrogram_segments(audio_path, segment_length=5, hop_length=2.5, target_shape=(128, 216)):
    """
    Create multiple mel spectrogram segments from an audio file.
    Uses GLOBAL scaling (-80dB to 0dB) to preserve volume dynamics.
    """
    try:
        # Load audio (librosa automatically normalizes audio to -1 to 1 float)
        y, sr = librosa.load(audio_path, duration=60)

        segments = []

        # Calculate segment parameters
        segment_samples = int(segment_length * sr)
        hop_samples = int(hop_length * sr)
        total_samples = len(y)

        # Create overlapping segments
        for start in range(0, total_samples - segment_samples + 1, hop_samples):
            end = start + segment_samples
            segment = y[start:end]

            # 1. Create mel spectrogram
            mel_spec = librosa.feature.melspectrogram(y=segment, sr=sr, n_fft=2048, hop_length=512, n_mels=128)

            # 2. Convert to Log-Mel (dB)
            # ref=1.0 ensures we are measuring absolute 'loudness' relative to digital full scale,
            # NOT relative to the peak of this specific segment.
            mel_spec_db = librosa.power_to_db(mel_spec, ref=1.0)

            # 3. GLOBAL Normalization (The Fix)
            # Music typically ranges from -80dB (silence) to 0dB (max volume).
            # We clip values to this range and scale to 0-1.
            min_db = -80.0
            max_db = 0.0

            mel_spec_db = np.clip(mel_spec_db, min_db, max_db)
            mel_spec_db = (mel_spec_db - min_db) / (max_db - min_db)  # Scale to 0-1 range

            # 4. Resize to target shape (padding if needed)
            if mel_spec_db.shape[1] < target_shape[1]:
                pad_width = target_shape[1] - mel_spec_db.shape[1]
                mel_spec_db = np.pad(mel_spec_db, ((0, 0), (0, pad_width)), mode='constant')
            else:
                mel_spec_db = mel_spec_db[:, :target_shape[1]]

            segments.append(mel_spec_db)

        return segments

    except Exception as e:
        print(f"Error loading {audio_path}: {e}")
        return None

In [3]:
X = [] # To store spectrogram segments
y = [] # To store multi-label vectors
segment_song_indices = [] # ðŸ†• NEW: Track which song each segment comes from

EXPECTED_SHAPE = (128, 216)

print("Creating 5-second segments with 50% overlap...")

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    audio_path = 'data/emotify_dataset/' + row['songs_path']
    spectrogram_segments = create_mel_spectrogram_segments(audio_path, target_shape=EXPECTED_SHAPE)

    if spectrogram_segments is not None:
        # For each segment, add to X and repeat the same label
        for segment in spectrogram_segments:
            X.append(segment)
            y.append(row[emotion_columns].values)
            segment_song_indices.append(index)  # ðŸ†• Store which song this segment belongs to

print(f"Total segments created: {len(X)}")
print(f"Original songs: {len(df)}")

# Convert to numpy arrays
X = np.array(X)
y = np.array(y, dtype='float32')
segment_song_indices = np.array(segment_song_indices)  # ðŸ†• Convert to array

# Add channel dimension for the CNN
X = X[..., np.newaxis]

print(f"Final X shape: {X.shape}")
print(f"Final y shape: {y.shape}")

Creating 5-second segments with 50% overlap...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 400/400 [11:02<00:00,  1.66s/it]


Total segments created: 9152
Original songs: 400
Final X shape: (9152, 128, 216, 1)
Final y shape: (9152, 9)


In [4]:
# Split the data while preventing song leakage
from sklearn.model_selection import train_test_split
import numpy as np

# ðŸ†• Use the precomputed segment_song_indices from Cell 1
unique_songs = np.unique(segment_song_indices)
train_songs, test_songs = train_test_split(unique_songs, test_size=0.2, random_state=42)

# Create masks for segments
train_mask = np.isin(segment_song_indices, train_songs)
test_mask = np.isin(segment_song_indices, test_songs)

X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]

print(f"Training segments: {len(X_train)}")
print(f"Testing segments: {len(X_test)}")
print(f"Training songs: {len(train_songs)}")
print(f"Testing songs: {len(test_songs)}")

Training segments: 7346
Testing segments: 1806
Training songs: 320
Testing songs: 80


In [5]:
# --- Save for Training ---
output_dir = 'processed_data'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

np.savez_compressed(
    os.path.join(output_dir, 'emotify_spectrograms_5s_segments.npz'),
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    train_songs=train_songs, test_songs=test_songs,
    segment_song_indices=segment_song_indices  # ðŸ†• Save this for reference
)

print("âœ… Preprocessing complete. 5-second segment data saved.")

âœ… Preprocessing complete. 5-second segment data saved.


In [6]:
X_train[0]

array([[[0.21711521],
        [0.6955112 ],
        [0.86634284],
        ...,
        [1.        ],
        [1.        ],
        [1.        ]],

       [[0.21711521],
        [0.71478605],
        [0.891801  ],
        ...,
        [1.        ],
        [1.        ],
        [1.        ]],

       [[0.21711521],
        [0.74457335],
        [0.903258  ],
        ...,
        [1.        ],
        [1.        ],
        [1.        ]],

       ...,

       [[0.21711521],
        [0.21711521],
        [0.32219988],
        ...,
        [0.21711521],
        [0.37285453],
        [0.5150104 ]],

       [[0.21711521],
        [0.21711521],
        [0.2948412 ],
        ...,
        [0.21711521],
        [0.37253457],
        [0.51486343]],

       [[0.21711521],
        [0.21711521],
        [0.21711521],
        ...,
        [0.21711521],
        [0.37227702],
        [0.5146855 ]]], dtype=float32)

In [7]:
y_train[:10]

array([[0.18181819, 0.        , 0.09090909, 0.        , 0.09090909,
        0.45454547, 0.09090909, 0.72727275, 0.27272728],
       [0.18181819, 0.        , 0.09090909, 0.        , 0.09090909,
        0.45454547, 0.09090909, 0.72727275, 0.27272728],
       [0.18181819, 0.        , 0.09090909, 0.        , 0.09090909,
        0.45454547, 0.09090909, 0.72727275, 0.27272728],
       [0.18181819, 0.        , 0.09090909, 0.        , 0.09090909,
        0.45454547, 0.09090909, 0.72727275, 0.27272728],
       [0.18181819, 0.        , 0.09090909, 0.        , 0.09090909,
        0.45454547, 0.09090909, 0.72727275, 0.27272728],
       [0.18181819, 0.        , 0.09090909, 0.        , 0.09090909,
        0.45454547, 0.09090909, 0.72727275, 0.27272728],
       [0.18181819, 0.        , 0.09090909, 0.        , 0.09090909,
        0.45454547, 0.09090909, 0.72727275, 0.27272728],
       [0.18181819, 0.        , 0.09090909, 0.        , 0.09090909,
        0.45454547, 0.09090909, 0.72727275, 0.27272728],
