In [19]:
# Training baseline model M0 for all 3 datasets

# 0  Common imports & helper functions  (first cell)

In [20]:
# ---------------------------------------------------------------------
# baseline_notebook.py  –  PMF‑IAT feature baseline (Jupyter version)
# ---------------------------------------------------------------------
import pandas as pd
import numpy as np
from pathlib import Path
import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score, f1_score, balanced_accuracy_score, precision_score, recall_score, accuracy_score


In [21]:
def evaluate_auc(clf, x_te, x_tr, y_tr, y_te, labels, weird_probas=False, set_name=None):
    """Compute the mean per-label ROC-AUC on train and test (Jamendo / MagnaTagATune).

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict_proba``.
    x_te, x_tr : array-like
        Feature matrices of the evaluation and training splits.
    y_tr, y_te : mapping-like (e.g. DataFrame)
        Ground-truth labels, indexed by tag name (``y[tag]``).
    labels : sequence of str
        Tag names; position ``i`` selects the i-th output of ``clf``.
    weird_probas : bool, default False
        If True the ``predict_proba`` output is indexed as ``preds[:, i]``
        (one 2-D array); otherwise as ``preds[i][:, 1]`` (one array per label,
        column 1 presumably being the positive-class probability).
    set_name : str, optional
        Value written to the ``feat_set_name`` column. When omitted, the
        notebook-level global ``feat_set_name`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``feat_set_name``, ``auc_tr`` and ``auc_te``.

    Raises
    ------
    NameError
        If ``set_name`` is omitted and no global ``feat_set_name`` exists.
    """
    # Resolve the label up front so a missing configuration fails before the
    # (potentially slow) prediction step instead of after it.
    if set_name is None:
        try:
            set_name = globals()["feat_set_name"]
        except KeyError:
            raise NameError(
                "name 'feat_set_name' is not defined; run the configuration "
                "cell or pass set_name=..."
            ) from None

    preds_te = clf.predict_proba(x_te)
    preds_tr = clf.predict_proba(x_tr)

    auc_tr, auc_te = [], []
    for i, tag in enumerate(labels):
        p_tr = preds_tr[:, i] if weird_probas else preds_tr[i][:, 1]
        p_te = preds_te[:, i] if weird_probas else preds_te[i][:, 1]
        auc_tr.append(roc_auc_score(y_tr[tag], p_tr))
        auc_te.append(roc_auc_score(y_te[tag], p_te))

    # Unweighted (macro) average over the tags; computed once and reused.
    mean_tr, mean_te = np.mean(auc_tr), np.mean(auc_te)
    print("Mean AUC  (train):", mean_tr)
    print("Mean AUC  (test) :", mean_te)

    # One-row summary that the calling cells persist to disk.
    return pd.DataFrame([{
        'feat_set_name': set_name,
        'auc_tr': mean_tr,
        'auc_te': mean_te
    }])


# 1  Feature‑set definitions  (second cell)

In [22]:
# --- Essentia 23 - low/mid-level signal descriptors ------------------
E23 = [
    'Danceability', 'Loudness', 'Chords-Changes-Rate', 'Dynamic-Complexity',
    'Zerocrossingrate', 'Chords-Number-Rate', 'Pitch-Salience',
    'Spectral-Centroid', 'Spectral-Complexity', 'Spectral-Decrease',
    'Spectral-Energyband-High', 'Spectral-Energyband-Low',
    'Spectral-Energyband-Middle-High', 'Spectral-Energyband-Middle-Low',
    'Spectral-Entropy', 'Spectral-Flux', 'Spectral-Rolloff',
    'Spectral-Spread', 'Onset-Rate', 'Length', 'BPM', 'Beats-Loud',
    'Vocal-Instrumental'
]

# --- Mid-level perceptual 7 ------------------------------------------
ML7 = [
    'Melody', 'Articulation', 'Rhythm Complexity', 'Rhythm Stability',
    'Dissonance', 'Atonality', 'Mode'
]

# --- Symbolic / harmony 32 -------------------------------------------
SYM32 = [
    'Dominants', 'Subdominants', 'sub-sub', 'sub-dom', 'dom-sub',
    'dom-tonic', 'glob-sub', 'glob-dom', 'sub-sub-dom', 'sub-dom-sub',
    'dom-sub-dom', 'sub-dom-tonic', 'dom-tonic-sub', 'dom-sub-sub',
    'sub-sub-sub', 'glob-sub-glob', 'glob-dom-tonic', 'glob-sub-sub',
    'dom-dom', 'glob-glob', 'dom-dom-sub', 'glob-glob-dom',
    'glob-dom-glob', 'glob-glob-sub', 'dom-dom-tonic', 'glob-sub-dom',
    'dom-tonic-dom', 'glob-dom-sub', 'sub-dom-dom', 'dom-dom-dom',
    'glob-dom-dom', 'glob-glob-glob'
]

ALL62 = ML7 + SYM32 + E23      # full perceptual set

# Every feature column, used by the dataset cells to separate label columns
# from feature columns (`df.drop(columns=all)`). It used to be a second,
# hand-typed copy of the same 62 names in the same order; deriving it from
# ALL62 keeps the two from drifting apart.
# NOTE: the name shadows Python's builtin `all()` for the rest of the
# notebook. It is kept because the dataset cells reference it.
all = list(ALL62)


# 2  Global hyper‑parameters  (third cell)

In [31]:
# ---------------------------------------------------------------------
# Choose feature subset for baseline:
#   feat_set = E23        # -> "signal-processing only"  (M0 of E1)
#   feat_set = ALL62      # -> full perceptual set       (M0 of E2)
# ---------------------------------------------------------------------
feat_set = E23        # <-- change here for E1 vs E2

# Label written into the result tables and result file names. It is resolved
# by list equality, so re-assigning `feat_set` above is the only edit needed.
_KNOWN_FEATURE_SETS = {"ALL62": ALL62, "E23": E23}
feat_set_name = next(
    (name for name, columns in _KNOWN_FEATURE_SETS.items() if feat_set == columns),
    None,
)
if feat_set_name is None:
    # An unrecognised selection used to leave `feat_set_name` undefined (or
    # stale from a previous run of this cell), which mislabels saved results.
    raise ValueError(
        "feat_set must be one of " + ", ".join(_KNOWN_FEATURE_SETS)
        + "; add new feature sets to _KNOWN_FEATURE_SETS"
    )

# CPU variants, kept for reference only (used by commented-out experiments):
# xgb_params_binary = dict(
#     max_depth=3, learning_rate=0.1, n_estimators=70,
#     gamma=7.56, min_child_weight=6,
#     objective='binary:logistic', eval_metric='auc'
# )
# xgb_params_multi = dict(
#     max_depth=2, learning_rate=0.3, objective='multi:softmax',
#     num_class=10, importance_type='weight'
# )

# Per-tag binary classifier settings (Jamendo / MagnaTagATune).
xgb_gpu_binary = dict(
    tree_method='hist', device='cuda',      # histogram trees on a CUDA device
    n_estimators=70,
    max_depth=3, learning_rate=0.1,
    gamma=7.56, min_child_weight=6,
    objective='binary:logistic', eval_metric='auc'
)

# Single-label multi-class settings (GTZAN).
xgb_gpu_multi = dict(
    tree_method='hist', device='cuda',
    max_depth=2, learning_rate=0.3,
    objective='multi:softmax', num_class=10,     # GTZAN has 10 genres
    importance_type='weight'
)

# LightGBM settings; only referenced by commented-out experiments in the
# dataset cells.
lgb_gpu_params = dict(
    boosting_type='gbdt',
    device='gpu',                # train on the GPU
    gpu_platform_id=-1,          # -1 -> system-wide default OpenCL platform
    gpu_device_id=0,             # or pick a specific card
    n_estimators=300,
    max_depth=-1,
    learning_rate=0.05,
    objective='binary',
    metric='auc'
)

RANDOM_STATE = 4                         # seed shared by every train/test split
DATA_ROOT = Path('D:/ICASSP1_GPMusic')   # adjust if CSVs live elsewhere


# 3  MTG‑Jamendo workflow  (fourth cell)

In [24]:
# -------------------------- Jamendo (multi-label) --------------------
# Baseline M0: fit on train+validation, report on the official test split.
train_csv = DATA_ROOT / 'mtg-jamendo/Ref_Perceptual_features/train.csv'
val_csv   = DATA_ROOT / 'mtg-jamendo/Ref_Perceptual_features/validation.csv'
test_csv  = DATA_ROOT / 'mtg-jamendo/Ref_Perceptual_features/test.csv'

# Read the three splits; the 'Track' identifier column is dropped.
df_train, df_val, df_test = (
    pd.read_csv(csv_path).drop(columns=['Track'])
    for csv_path in (train_csv, val_csv, test_csv)
)

# Features are the selected `feat_set` columns. Labels are whatever remains
# once every known feature column (`all`) has been removed.
x_tr = pd.concat([split[feat_set] for split in (df_train, df_val)])
y_tr = pd.concat([split.drop(columns=all) for split in (df_train, df_val)])
x_te = df_test[feat_set]
y_te = df_test.drop(columns=all)
labels = list(y_tr.columns)

# Standardise with statistics of the training portion (train+validation here).
scaler = StandardScaler()
scaler.fit(x_tr)
x_tr = scaler.transform(x_tr)
x_te = scaler.transform(x_te)

# --- GPU XGBoost: one binary classifier per tag
jamendo_xgb = MultiOutputClassifier(xgb.XGBClassifier(**xgb_gpu_binary)).fit(x_tr, y_tr)
df_jamendo_xgb = evaluate_auc(jamendo_xgb, x_te, x_tr, y_tr, y_te, labels)

# Persist the one-row summary as CSV and pickle.
df_jamendo_xgb.to_csv(DATA_ROOT / f'mtg-jamendo/Ref_Perceptual_features/M0_results_xgb_{feat_set_name}.csv')
df_jamendo_xgb.to_pickle(DATA_ROOT / f'mtg-jamendo/Ref_Perceptual_features/M0_results_xgb_{feat_set_name}.pkl')

# Disabled alternatives (kept as a pointer, not run):
#   * CPU XGBoost with `xgb_params_binary`
#   * GPU LightGBM with `lgb_gpu_params`, evaluated with weird_probas=True


Mean AUC  (train): 0.8387044660573596
Mean AUC  (test) : 0.7191511378775602


In [None]:
# -------------------------- Jamendo (multi-label) --------------------
# Train/validation/test arrays for the GP experiments, plus an XGBoost
# reference trained on the train split and scored on the validation split.
train_csv = DATA_ROOT / 'mtg-jamendo/Ref_Perceptual_features/train.csv'
val_csv   = DATA_ROOT / 'mtg-jamendo/Ref_Perceptual_features/validation.csv'
test_csv  = DATA_ROOT / 'mtg-jamendo/Ref_Perceptual_features/test.csv'

# Load + merge splits
df_train = pd.read_csv(train_csv).drop(columns=['Track'])
df_val   = pd.read_csv(val_csv).drop(columns=['Track'])
df_test  = pd.read_csv(test_csv).drop(columns=['Track'])

# Train rows come first, validation rows second; `split_idx` marks the border.
x_tr = pd.concat([df_train[feat_set], df_val[feat_set]])
y_tr = pd.concat([df_train.drop(columns=all),
                  df_val.drop(columns=all)])
x_te = df_test[feat_set]
y_te = df_test.drop(columns=all)
labels = y_tr.columns.tolist()
split_idx = len(df_train)

# Scaling: fit on the training rows ONLY. Validation is the hold-out used for
# evaluation / GP fitness in this cell, so its statistics must not leak into
# the scaler (previously the scaler was fitted on train+validation).
scaler = StandardScaler().fit(x_tr.iloc[:split_idx])
x_tr_scaled = scaler.transform(x_tr)
x_te_scaled = scaler.transform(x_te)

# Split training data for GP (use validation split for GP fitness evaluation)
x_train_gp = x_tr_scaled[:split_idx]
y_train_gp = y_tr.iloc[:split_idx]
x_val_gp = x_tr_scaled[split_idx:]
y_val_gp = y_tr.iloc[split_idx:]
x_test_gp = x_te_scaled
# NOTE(review): test labels are a NumPy array while the train/validation
# labels stay DataFrames - confirm that downstream code expects this.
y_test_gp = y_te.values

# --- GPU XGBoost
# `evaluate_auc` prints its second figure as "(test)"; in this cell that
# figure is the VALIDATION AUC. The summary frame is not written to disk here.
jamendo_xgb = MultiOutputClassifier(xgb.XGBClassifier(**xgb_gpu_binary))
jamendo_xgb.fit(x_train_gp, y_train_gp)
df_jamendo_xgb = evaluate_auc(jamendo_xgb, x_val_gp, x_train_gp, y_train_gp, y_val_gp, labels)

# 4  MagnaTagATune workflow  (fifth cell)

In [25]:
# -------------------- MagnaTagATune (multi-label 50 tags) ------------
mtt_csv = DATA_ROOT / 'magnatagatune/Ref_Perceptual_features/perceptual_features.csv'

# The 50 tag columns used as binary targets.
music_tags = [
    "guitar", "classical", "slow", "techno", "strings",
    "drums", "electronic", "rock", "fast", "piano",
    "ambient", "beat", "violin", "vocal", "synth",
    "female", "indian", "opera", "male", "singing",
    "vocals", "no vocals", "harpsichord", "loud", "quiet",
    "flute", "woman", "male vocal", "no vocal", "pop",
    "soft", "sitar", "solo", "man", "classic",
    "choir", "voice", "new age", "dance", "male voice",
    "female vocal", "beats", "harp", "cello", "no voice",
    "weird", "country", "metal", "female voice", "choral",
]

# Load, drop the track identifier and align the two harmony column names
# with the names used in the feature-set lists.
df = (
    pd.read_csv(mtt_csv)
    .drop(columns=['Track'])
    .rename(columns={'dom': 'Dominants', 'sub': 'Subdominants'})
)

# 'Length' and 'Vocal-Instrumental' are excluded from the features here.
feat_set_MTAT = [col for col in feat_set if col not in ('Length', 'Vocal-Instrumental')]

x = df[feat_set_MTAT]
y = df[music_tags]
labels = music_tags

# Random 80/20 split, then standardise with training statistics only.
x_tr, x_te, y_tr, y_te = train_test_split(
    x, y, train_size=0.8, random_state=RANDOM_STATE)

scaler = StandardScaler()
scaler.fit(x_tr)
x_tr = scaler.transform(x_tr)
x_te = scaler.transform(x_te)

# GPU XGBoost: one binary classifier per tag
mtt_xgb = MultiOutputClassifier(xgb.XGBClassifier(**xgb_gpu_binary))
mtt_xgb.fit(x_tr, y_tr)
df_mtt_xgb = evaluate_auc(mtt_xgb, x_te, x_tr, y_tr, y_te, labels)

# Persist the one-row summary as CSV and pickle.
df_mtt_xgb.to_csv(DATA_ROOT / f'magnatagatune/Ref_Perceptual_features/M0_results_xgb_{feat_set_name}.csv')
df_mtt_xgb.to_pickle(DATA_ROOT / f'magnatagatune/Ref_Perceptual_features/M0_results_xgb_{feat_set_name}.pkl')

# Disabled alternatives (kept as a pointer, not run):
#   * CPU XGBoost with `xgb_params_binary`
#   * GPU LightGBM with `lgb_gpu_params`, evaluated with weird_probas=True

Mean AUC  (train): 0.8671650571982085
Mean AUC  (test) : 0.8345847319720278


# 5  GTZAN workflow  (sixth cell)


In [26]:
# ---------------------- GTZAN (single-label 10 genres) ----------------
gtzan_csv = DATA_ROOT / 'GTZAN/Ref_Perceptual_features/perceptual_features.csv'

# Load, drop the track identifier and align the two harmony column names
# with the names used in the feature-set lists.
df = (
    pd.read_csv(gtzan_csv)
    .drop(columns=['Track'])
    .rename(columns={'dom': 'Dominants', 'sub': 'Subdominants'})
)

# 'Vocal-Instrumental' is excluded from both the selected and the full
# feature lists for this dataset.
feat_set_GTZAN = [col for col in feat_set if col != 'Vocal-Instrumental']
all_GTZAN = [col for col in all if col != 'Vocal-Instrumental']

x = df[feat_set_GTZAN]
y_1hot = df.drop(columns=all_GTZAN)        # 10 one-hot genre cols
genre_labels = list(y_1hot.columns)

# One-hot row -> integer genre index (position of the row maximum).
y_int = np.argmax(y_1hot.values, axis=1)

# Random 80/20 split, then standardise with training statistics only.
x_tr, x_te, y_tr, y_te = train_test_split(
    x, y_int, train_size=0.8, random_state=RANDOM_STATE)

scaler = StandardScaler()
scaler.fit(x_tr)
x_tr = scaler.transform(x_tr)
x_te = scaler.transform(x_te)

# GPU XGBoost (multi-class)
gtzan_xgb = xgb.XGBClassifier(**xgb_gpu_multi)
gtzan_xgb.fit(x_tr, y_tr)

# Score each split once; the same numbers are printed and stored.
acc_tr = accuracy_score(y_tr, gtzan_xgb.predict(x_tr))
acc_te = accuracy_score(y_te, gtzan_xgb.predict(x_te))
print("XGB train acc:", acc_tr)
print("XGB test  acc:", acc_te)

# One-row summary, persisted as CSV and pickle.
df_gtzan_xgb = pd.DataFrame([{
    'feat_set_name': feat_set_name,
    'acc_tr': acc_tr,
    'acc_te': acc_te
}])
df_gtzan_xgb.to_csv(DATA_ROOT / f'GTZAN/Ref_Perceptual_features/M0_results_xgb_{feat_set_name}.csv')
df_gtzan_xgb.to_pickle(DATA_ROOT / f'GTZAN/Ref_Perceptual_features/M0_results_xgb_{feat_set_name}.pkl')

# Disabled alternative (kept as a pointer, not run): CPU XGBoost with
# `xgb_params_multi`.

XGB train acc: 0.9987484355444305
XGB test  acc: 0.74


In [32]:
# ---------------------- GTZAN (single-label 10 genres) ----------------
# Train/validation/test arrays for the GP experiments, plus an XGBoost
# reference trained on the train split only.
gtzan_csv = DATA_ROOT / 'GTZAN/Ref_Perceptual_features/perceptual_features.csv'
df = pd.read_csv(gtzan_csv).drop(columns=['Track'])
df.rename(columns={'dom': 'Dominants', 'sub': 'Subdominants'}, inplace=True)

feat_set_GTZAN = [item for item in feat_set if item not in ['Vocal-Instrumental']]
all_GTZAN = [item for item in all if item not in ['Vocal-Instrumental']]

x = df[feat_set_GTZAN]
y_1hot = df.drop(columns=all_GTZAN)        # 10 one-hot genre cols
genre_labels = y_1hot.columns.tolist()

# Convert one-hot row -> integer genre index
y_int = y_1hot.values.argmax(axis=1)

# 1) Hold out 20% as the test set. The arguments match the split in the GTZAN
#    baseline cell (same train_size and random_state, no stratification).
X_temp, X_test, y_temp, y_test = train_test_split(
    x, y_int, train_size=0.8, random_state=RANDOM_STATE)

# 2) From the 80% "temp", split out validation (20% of temp -> 16% overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,
    stratify=y_temp,
    random_state=RANDOM_STATE
)

# Scaling: fit on the training rows ONLY. The validation rows are a hold-out
# in this cell, so their statistics must not leak into the scaler (previously
# the scaler was fitted on X_temp, i.e. train+validation).
scaler = StandardScaler().fit(X_train)
x_tr_scaled = scaler.transform(X_train)
x_val_scaled = scaler.transform(X_val)
x_te_scaled = scaler.transform(X_test)

# Aliases consumed by the GP experiments.
x_train_gp = x_tr_scaled
y_train_gp = y_train
x_val_gp = x_val_scaled
y_val_gp = y_val
x_test_gp = x_te_scaled
y_test_gp = y_test

# GPU XGBoost
gtzan_xgb = xgb.XGBClassifier(**xgb_gpu_multi).fit(x_train_gp,y_train_gp)
print("XGB train acc:", accuracy_score(y_train_gp, gtzan_xgb.predict(x_train_gp)))
print("XGB val  acc:", accuracy_score(y_val_gp, gtzan_xgb.predict(x_val_gp)))
print("XGB test  acc:", accuracy_score(y_test_gp, gtzan_xgb.predict(x_test_gp)))
print(feat_set_name)

# Results of this cell are intentionally not written to disk. The former
# commented-out save block referenced x_tr / y_tr, which are not defined by
# this cell, and has been removed.

XGB train acc: 0.9984350547730829
XGB val  acc: 0.69375
XGB test  acc: 0.705
E23
