In [None]:
import os
from sklearn.model_selection import train_test_split

# Collect audio file paths and their emotion labels from data_path.
# Layout assumption: one sub-directory per emotion (e.g. "happy", "sad"),
# each holding that emotion's recordings.
data_path = "/data"

# os.listdir() returns entries in arbitrary order and also lists plain files
# (e.g. ".DS_Store", a README). Sorting keeps the seeded split below
# reproducible across machines; the isdir() filter stops a stray file from
# being treated as an emotion folder, which would crash the inner listdir().
emotions = sorted(
    entry
    for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
)
file_paths, labels = [], []

for emotion in emotions:
    emotion_dir = os.path.join(data_path, emotion)
    for file in sorted(os.listdir(emotion_dir)):
        # Case-insensitive match so "clip.WAV" is not silently dropped.
        if file.lower().endswith(".wav"):  # Adjust extension if needed
            file_paths.append(os.path.join(emotion_dir, file))
            labels.append(emotion)

# Fail early with a clear message instead of an opaque error from the split.
if not file_paths:
    raise FileNotFoundError(f"No .wav files found under {data_path!r}")

# Stratified split (80% train, 20% test)
X_train_files, X_test_files, y_train, y_test = train_test_split(
    file_paths, labels, test_size=0.2, stratify=labels, random_state=42
)

In [None]:
import librosa
import numpy as np
import pandas as pd

def extract_features(file_path, fmin=None, fmax=None):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The vector is laid out as follows (order matters to downstream code):

    1. per-coefficient mean of 13 MFCCs
    2. per-coefficient variance of the same MFCCs
    3. per-coefficient mean of the first-order MFCC deltas
    4. per-coefficient mean of the second-order MFCC deltas
    5. pitch mean, pitch variance, RMS mean, RMS variance
    6. per-bin mean of the chromagram
    7. per-bin variance of the chromagram

    Parameters
    ----------
    file_path : str or path-like
        Audio file to load with ``librosa.load`` (library default settings).
    fmin, fmax : float, optional
        Pitch search range in Hz passed to ``librosa.yin``. When omitted they
        default to the range recommended in the librosa documentation
        (note C2 to note C7). The previous hard-coded 20-8000 Hz range let the
        estimator lock onto lags of only a few samples, which produces
        spurious pitch values.

    Returns
    -------
    numpy.ndarray
        1-D array of the concatenated statistics listed above.

    Raises
    ------
    ValueError
        If the file decodes to zero samples.
    """
    signal, sr = librosa.load(file_path)
    if signal.size == 0:
        # Every statistic below would be NaN or raise deep inside librosa.
        raise ValueError(f"Audio file contains no samples: {file_path!r}")

    if fmin is None:
        fmin = librosa.note_to_hz("C2")
    if fmax is None:
        fmax = librosa.note_to_hz("C7")

    # MFCC features (mean, var, delta, delta-delta); time is axis 1.
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
    delta = librosa.feature.delta(mfcc)
    delta_delta = librosa.feature.delta(mfcc, order=2)

    # Prosodic features (pitch track and frame-wise RMS energy).
    f0 = librosa.yin(signal, fmin=fmin, fmax=fmax, sr=sr)
    rms = librosa.feature.rms(y=signal)

    # Chroma features.
    chroma = librosa.feature.chroma_stft(y=signal, sr=sr)

    # Combine all features; keep this order in sync with any column naming.
    return np.concatenate([
        np.mean(mfcc, axis=1),
        np.var(mfcc, axis=1),
        np.mean(delta, axis=1),
        np.mean(delta_delta, axis=1),
        [np.mean(f0), np.var(f0), np.mean(rms), np.var(rms)],
        np.mean(chroma, axis=1),
        np.var(chroma, axis=1),
    ])

# Extract one feature vector per training file, in the same order as
# X_train_files / y_train so each row stays aligned with its label.
# (This loop previously iterated over `training_files`, a name that is never
# defined; the split cell produces `X_train_files`.)
features_list = []
for file in X_train_files:
    features = extract_features(file)
    features_list.append(features)

# Column names mirror the concatenation order inside extract_features().
n_mfcc = 13  # must match n_mfcc in extract_features
feature_names = [
    f"{prefix}_{i}"
    for prefix in ("mfcc_mean", "mfcc_var", "delta_mean", "delta_delta_mean")
    for i in range(n_mfcc)
] + ["pitch_mean", "pitch_var", "intensity_mean", "intensity_var"]
# Whatever remains is split evenly between chroma means and chroma variances.
n_chroma = (len(features_list[0]) - len(feature_names)) // 2
feature_names += [
    f"{prefix}_{i}"
    for prefix in ("chroma_mean", "chroma_var")
    for i in range(n_chroma)
]

# pandas raises if the name count and vector length disagree, so a layout
# change in extract_features() fails loudly here rather than mislabelling.
df = pd.DataFrame(features_list, columns=feature_names)
df.to_csv("extracted_features.csv", index=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation with labels.
# y_train holds emotion names (strings), so it cannot be passed to corrwith()
# directly, and integer-coding a nominal label would impose an arbitrary
# order. Instead: one-hot encode the emotions, correlate each feature with
# every emotion indicator, and keep the strongest absolute correlation.
emotion_indicators = pd.get_dummies(
    pd.Series(list(y_train), index=df.index), dtype=float
)
corr_matrix = pd.Series(
    {
        column: emotion_indicators.corrwith(df[column]).abs().max()
        for column in df.columns
    }
).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
# String labels keep the bars in sorted-by-correlation order even when the
# feature columns are plain integers.
sns.barplot(x=corr_matrix.index.astype(str), y=corr_matrix.values, ax=ax)
ax.tick_params(axis="x", rotation=90)
ax.set_title("Feature Correlation with Emotion Labels")

# Variance analysis per emotion.
# Build the labelled frame this plot needs (it was never defined). The first
# feature column is the mean of MFCC 0 (extract_features() concatenates the
# MFCC means first), so it is exposed under the name the plot asks for.
df_with_labels = df.rename(columns={df.columns[0]: "mfcc_mean_0"}).assign(
    emotion=list(y_train)
)
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x="emotion", y="mfcc_mean_0", data=df_with_labels, ax=ax)
ax.set_title("MFCC Mean Distribution by Emotion")

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Feature matrices (these names were used below but never defined).
# X_train reuses the training features already extracted into `df`; the
# held-out files go through the same extractor, in X_test_files order so the
# rows stay aligned with y_test.
X_train = df.to_numpy(dtype=float)
X_test = np.vstack([extract_features(path) for path in X_test_files])

# PCA is variance-driven, so features measured on large scales (e.g. pitch
# variance) would dominate the components if left unscaled. Standardise
# first, fitting on the training split only so no test statistics leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=0.95)  # Retain 95% variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Plot first two components
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=X_train_pca[:, 0], y=X_train_pca[:, 1], hue=y_train, ax=ax)
ax.set_title("PCA Projection of Features")

In [None]:
from collections import Counter

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# One cluster per distinct emotion present in the training labels
# (`n_emotions` was used below without ever being defined).
n_emotions = len(set(y_train))

# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=n_emotions, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_train_pca)

# Evaluate ARI
ari_kmeans = adjusted_rand_score(y_train, clusters)
print(f"ARI for K-Means: {ari_kmeans:.2f}")

# Majority voting for cluster labels.
# y_train is a plain list, which cannot be indexed with a boolean mask, so
# work on an array view. A separate local name is used so the global `labels`
# list built when the files were loaded is not overwritten.
y_train_arr = np.asarray(y_train)
fallback_label = Counter(y_train_arr.tolist()).most_common(1)[0][0]

cluster_to_label = {}
for cluster in range(n_emotions):
    member_labels = y_train_arr[clusters == cluster]
    if member_labels.size:
        cluster_to_label[cluster] = Counter(member_labels.tolist()).most_common(1)[0][0]
    else:
        # An empty cluster has no votes; fall back to the overall majority so
        # every cluster id still maps to a label at prediction time.
        cluster_to_label[cluster] = fallback_label

In [None]:
# Minimal NumPy DBSCAN (brute-force neighbour search, Euclidean distance).
def dbscan(X, eps, min_samples):
    """Cluster the rows of ``X`` with the DBSCAN algorithm.

    The previous stub returned the notebook-global ``labels`` (the ground
    truth emotion names) instead of clustering anything.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to cluster. A 1-D input is treated as n_samples points with a
        single feature.
    eps : float
        Neighbourhood radius; two points are neighbours when their Euclidean
        distance is ``<= eps``.
    min_samples : int
        Minimum neighbourhood size (the point itself included) for a point to
        count as a core point.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        Cluster id per point, numbered from 0 in order of discovery; ``-1``
        marks noise.

    Raises
    ------
    ValueError
        If ``eps`` is not positive or ``min_samples`` is smaller than 1.
    """
    if eps <= 0:
        raise ValueError("eps must be positive")
    if min_samples < 1:
        raise ValueError("min_samples must be at least 1")

    points = np.asarray(X, dtype=float)
    if points.ndim == 1:
        points = points.reshape(-1, 1)
    n_points = points.shape[0]

    labels = np.full(n_points, -1, dtype=int)
    if n_points == 0:
        return labels

    # Brute-force eps-neighbourhoods: O(n^2) time, fine for notebook-sized data.
    neighborhoods = [
        np.flatnonzero(np.linalg.norm(points - points[i], axis=1) <= eps)
        for i in range(n_points)
    ]
    is_core = np.array([len(nbrs) >= min_samples for nbrs in neighborhoods])

    cluster_id = 0
    for seed in range(n_points):
        # Only an unassigned core point can start a new cluster.
        if labels[seed] != -1 or not is_core[seed]:
            continue
        labels[seed] = cluster_id
        frontier = [seed]
        while frontier:
            current = frontier.pop()
            # Border points join the cluster but do not expand it.
            if not is_core[current]:
                continue
            for neighbor in neighborhoods[current]:
                if labels[neighbor] == -1:
                    labels[neighbor] = cluster_id
                    frontier.append(neighbor)
        cluster_id += 1

    return labels

# Run the dbscan() routine on the PCA-reduced training features, then count
# the distinct ids it returned, leaving out -1 (excluded here as a
# non-cluster id; conventionally DBSCAN's noise marker).
dbscan_labels = dbscan(X_train_pca, min_samples=5, eps=0.5)
n_clusters = len(set(dbscan_labels) - {-1})
print(f"DBSCAN found {n_clusters} clusters")

In [None]:
# Predict test clusters using K-Means, then translate each cluster id into
# the emotion that won the majority vote for that cluster on the training set.
test_clusters = kmeans.predict(X_test_pca)
predicted_labels = [cluster_to_label[int(c)] for c in test_clusters]

# Calculate accuracy.
# Both sides are converted to arrays so the comparison is element-wise:
# `list == list` yields a single bool, which made the reported "accuracy"
# either 0.00 or 1.00 regardless of how many predictions matched.
accuracy = np.mean(np.asarray(predicted_labels) == np.asarray(y_test))
print(f"Test Accuracy: {accuracy:.2f}")