In [15]:
# ================================
# Feature Extraction for XGBoost (Flattened MFCCs)
# ================================

#  Imports
import pandas as pd
import numpy as np
import librosa
import os
from sklearn.preprocessing import LabelEncoder
import joblib

#  Config
# Input: CSV of clip metadata. The extraction loop below reads its
# 'file_path' and 'emotion' columns.
METADATA_PATH = "/content/drive/MyDrive/capstone_data/combined_clean_metadata.csv"
# Outputs: feature matrix, integer-encoded labels, and the fitted label encoder.
X_SAVE_PATH = "/content/drive/MyDrive/capstone_data/x_xgb_features.npy"
Y_SAVE_PATH = "/content/drive/MyDrive/capstone_data/y_xgb_labels.npy"
ENCODER_PATH = "/content/drive/MyDrive/capstone_data/label_encoder.pkl"
# Number of MFCC frames every clip is padded or truncated to.
# NOTE(review): the "~4 seconds" figure presumably assumes librosa's default
# sample rate and hop length -- re-check if either is ever overridden.
MAX_PAD_LEN = 174  # ~4 seconds of audio

# Load metadata
# One row per audio clip; the row count is logged as a quick sanity check.
df = pd.read_csv(METADATA_PATH)
print(f"[INFO] Loaded metadata: {df.shape[0]} rows")

# Feature extraction function
def extract_flat_mfcc(file_path, max_pad_len=174, *, n_mfcc=40):
    """Return a fixed-length, flattened MFCC feature vector for one audio file.

    The audio is loaded with librosa's default settings, its MFCC matrix is
    computed, and the time axis is forced to exactly ``max_pad_len`` frames:
    shorter clips are right-padded with zeros, longer clips are truncated.
    The matrix is then flattened row by row into a 1-D vector.

    Args:
        file_path: Path to the audio file to load.
        max_pad_len: Number of MFCC frames (columns) to keep per clip.
        n_mfcc: Number of MFCC coefficients (rows) to compute. Keyword-only;
            defaults to 40, the value this function previously hard-coded.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``n_mfcc * max_pad_len``, or
        ``None`` if the file could not be loaded or processed.

    Notes:
        Extraction is deliberately best-effort: any exception is reported on
        stdout and turned into ``None`` so a single bad file does not abort
        a long batch run. Callers must check for ``None``.
    """
    try:
        audio, sr = librosa.load(file_path)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        pad_width = max_pad_len - mfcc.shape[1]
        if pad_width > 0:
            # Clip is shorter than the target: zero-pad on the right of the
            # time axis only, leaving the coefficient axis untouched.
            mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
        else:
            # Clip is long enough (or exact): keep the first max_pad_len frames.
            mfcc = mfcc[:, :max_pad_len]
        return mfcc.flatten()
    except Exception as e:
        # Broad catch is intentional (see Notes); the file is skipped.
        print(f"[WARN] Skipped file {file_path}: {e}")
        return None

#  Extract features
# Walk the path and label columns in lockstep so that each feature vector
# stays paired with the emotion label of the same metadata row.
feature_rows = []
emotion_labels = []

for audio_path, emotion in zip(df['file_path'], df['emotion']):
    flat_features = extract_flat_mfcc(audio_path, MAX_PAD_LEN)
    if flat_features is None:
        # Extraction failed (already reported by the extractor); drop the
        # label as well so X and y remain aligned.
        continue
    feature_rows.append(flat_features)
    emotion_labels.append(emotion)

# Stack into arrays: one row per successfully processed clip.
X = np.array(feature_rows)
y = np.array(emotion_labels)

print(f"[INFO] Extracted features: X shape = {X.shape}, y shape = {y.shape}")

# Encode labels
# Map each distinct emotion label to an integer id; the mapping is exposed
# afterwards as le.classes_ (printed at the end of this cell).
le = LabelEncoder()
y_encoded = le.fit_transform(y)

#  Save features and encoder
# X and y_encoded are written as .npy arrays; the fitted encoder is dumped
# with joblib -- presumably so integer predictions can be mapped back to
# emotion names later (confirm against the training/inference code).
np.save(X_SAVE_PATH, X)
np.save(Y_SAVE_PATH, y_encoded)
joblib.dump(le, ENCODER_PATH)

# Report where each artifact was written and which classes were found.
print(f"[✅] Features saved to: {X_SAVE_PATH}")
print(f"[✅] Labels saved to: {Y_SAVE_PATH}")
print(f"[✅] Label encoder saved to: {ENCODER_PATH}")
print(f"[INFO] Classes: {le.classes_}")

[INFO] Loaded metadata: 6244 rows
[INFO] Extracted features: X shape = (6244, 6960), y shape = (6244,)
[✅] Features saved to: /content/drive/MyDrive/capstone_data/x_xgb_features.npy
[✅] Labels saved to: /content/drive/MyDrive/capstone_data/y_xgb_labels.npy
[✅] Label encoder saved to: /content/drive/MyDrive/capstone_data/label_encoder.pkl
[INFO] Classes: ['angry' 'happy' 'neutral' 'sad']
