In [None]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA


In [None]:
# Data loading: both feature tables live under the project root, which is
# taken to be the parent of the current working directory.
ROOT = Path("..").resolve()
single, melod = (
    pd.read_csv(ROOT / rel_path)
    for rel_path in ("outputs/features_single.csv", "outputs/features_melodies.csv")
)

In [34]:
def split_groups(df):
    '''
    Split the columns of a feature table into named groups.

    Only ``df.columns`` is inspected; the data itself is not touched.
    Every returned name is guaranteed to be a column of ``df``, so the
    groups can be used directly for column selection without a KeyError
    when an expected feature is absent from the table.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table whose column names are to be grouped.

    Returns
    -------
    temporal : list of str
        Those of "rms", "attack_time_s", "onset_rate_s" present in ``df``.
    spectral : list of str
        Columns whose name contains one of the spectral-descriptor
        prefixes (centroid_, bandwidth_, rolloff_, flatness_, flux_,
        zcr_, contrast_), in ``df`` column order.
    percept : list of str
        "hnr_proxy_db" (if present) followed by every column starting
        with "mfcc" or "chroma", in ``df`` column order.
    meta : list of str
        Those of "instrument", "file" present in ``df``.
    '''
    # Non-string labels (e.g. an integer column) cannot be matched by the
    # substring/prefix rules below, so they are left out of every group.
    names = [c for c in df.columns if isinstance(c, str)]
    present = set(names)

    def _existing(candidates):
        # Keep the candidates' own order, dropping those df does not have.
        return [c for c in candidates if c in present]

    spectral_keys = ("centroid_", "bandwidth_", "rolloff_", "flatness_",
                     "flux_", "zcr_", "contrast_")

    temporal = _existing(["rms", "attack_time_s", "onset_rate_s"])
    spectral = [c for c in names if any(k in c for k in spectral_keys)]
    percept = _existing(["hnr_proxy_db"]) + [
        c for c in names if c.startswith(("mfcc", "chroma"))
    ]
    meta = _existing(["instrument", "file"])
    return temporal, spectral, percept, meta

# Group the melody table's columns, then flatten the three numeric groups
# (meta is deliberately excluded) into a single ordered feature list.
temporal, spectral, percept, meta = split_groups(melod)
features_all = [*temporal, *spectral, *percept]

In [12]:
# Feature matrix: +/-inf are mapped to NaN so the imputer below treats
# them as missing values rather than as extreme observations.
X = (
    melod.loc[:, features_all]
    .replace(to_replace=[np.inf, -np.inf], value=np.nan)
    .copy()
)
# Target labels, copied so later edits cannot write back into `melod`.
y = melod.loc[:, "instrument"].copy()

# 1) Fill NaNs with each column's median (robust to outliers)
imp = SimpleImputer(strategy="median").fit(X)
X_imp = imp.transform(X)

# 2) Standardize the imputed matrix (fitted scaler kept for reuse)
scaler = StandardScaler().fit(X_imp)
Xz = scaler.transform(X_imp)

In [13]:
# 3) PCA
MAX_COMPONENTS = 6  # upper bound on the number of principal components kept

# The loadings table below labels PCA columns with `features_all`, so the
# standardized matrix must still have exactly one column per feature.
# NOTE(review): SimpleImputer is documented to drop columns that are entirely
# missing, which would break this alignment -- confirm against the data.
if Xz.shape[1] != len(features_all):
    raise ValueError(
        f"Xz has {Xz.shape[1]} columns but features_all lists "
        f"{len(features_all)} names; a feature column was dropped upstream."
    )

# guard: PCA cannot extract more components than min(n_samples, n_features)
n_comp = min(MAX_COMPONENTS, *Xz.shape)
pca = PCA(n_components=n_comp, random_state=0).fit(Xz)
print("Explained variance ratio:", np.round(pca.explained_variance_ratio_, 3))

# Loadings table: one row per component, one column per input feature
loadings = pd.DataFrame(
    pca.components_,
    columns=features_all,
    index=[f"PC{i+1}" for i in range(n_comp)]
)
# Show the 15 features with the largest absolute weight on PC1
loadings.T.sort_values("PC1", key=np.abs, ascending=False).head(15)


Explained variance ratio: [0.428 0.139 0.093 0.066 0.046 0.031]


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
contrast_std,0.185199,0.005858,-0.03806,0.039557,-0.089463,-0.02523
bandwidth_mean,0.185147,0.07099,0.070102,0.003996,-0.005453,0.000152
bandwidth_median,0.185012,0.066294,0.077374,0.005163,-0.005837,-0.004525
flux_mean,0.183415,0.045173,0.085658,-0.067469,0.000638,-0.022552
flux_median,0.183203,0.074857,0.015492,-0.06645,-0.051027,-0.031534
mfcc4_mean,-0.182909,0.007068,0.077049,-0.037238,0.045519,0.068566
zcr_mean,0.176888,0.089456,0.106759,0.003948,-0.031228,0.038612
centroid_mean,0.176637,0.094037,0.107413,0.023492,-0.008213,0.03021
zcr_median,0.176265,0.08839,0.113006,0.003376,-0.035585,0.0277
centroid_median,0.176234,0.092075,0.112765,0.021885,-0.008987,0.02764
