## Get aligned MusicGen predictions

In [None]:
# NOTE(review): hard-coded absolute path to one user's workspace. The relative paths used
# later in this notebook (e.g. "universal_music/...") are resolved against this directory,
# and the `embeddings.h5_processor` import presumably relies on it too — confirm before moving.
%cd /home/DAVIDSON/dutuller/Workspace/DRI1/MusicGen/

from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import yaml
from embeddings.h5_processor import H5DataProcessor, DatasetConfig, ProcessedDataset
import pandas as pd
import re

Infrastructure for loading and splitting the embedding data from storage.

In [None]:
# Read the list of datasets that make up the full-song corpus.
with open("universal_music/NHS_full.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)

processor = H5DataProcessor()
all_train_data, all_test_data = [], []
class_names = set()

# Load each configured dataset and give it its own 80/20 train/test split (fixed seed).
for dataset_config in config['datasets']:
    embedding_path = processor.get_embedding_path(DatasetConfig(**dataset_config))
    dataset = processor.process_h5_file(embedding_path, DatasetConfig(**dataset_config))

    train_split, test_split = processor.get_train_test_split(dataset, test_ratio=0.2, random_seed=42)

    all_train_data.append(train_split)
    all_test_data.append(test_split)
    class_names.update(dataset.labels)


def _merge_splits(splits):
    """Concatenate per-dataset splits into one ProcessedDataset named "combined"."""
    merged_labels, merged_filenames = [], []
    for split in splits:
        merged_labels.extend(split.labels)
        merged_filenames.extend(split.filenames)
    return ProcessedDataset(
        embeddings=np.vstack([split.embeddings for split in splits]),
        labels=merged_labels,
        filenames=merged_filenames,
        name="combined",
        num_samples=sum(split.num_samples for split in splits),
    )


# Pool the per-dataset splits into a single train set and a single test set.
train_data = _merge_splits(all_train_data)
test_data = _merge_splits(all_test_data)

# Classifier to be fitted on the pooled training embeddings.
model = LogisticRegression(max_iter=1000)

Train the classifier on the full songs from the train set.

In [None]:
# Every class name seen in either split gets an integer id, assigned in sorted order.
unique_labels = sorted(set(train_data.labels) | set(test_data.labels))
label_to_idx = dict(zip(unique_labels, range(len(unique_labels))))

# Training features and their integer-encoded targets.
X_train = train_data.embeddings
y_train = np.array([label_to_idx[class_name] for class_name in train_data.labels])

# Standardise the features; the fitted scaler is reused later for the evaluation data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit the classifier (the trailing ';' hides the estimator repr in the cell output).
model.fit(X_train_scaled, y_train);

Identify which song ids are present in the test set, so that we can find them among the sample audio.

In [None]:
# Test-set filenames look like "Discography-<song id>_<number>.wav" (the trailing number is
# presumably a chunk index — confirm). The dot is escaped so only a literal ".wav" matches.
_chunk_name_pattern = re.compile(r"Discography-(\d+)_\d+\.wav")

test_song_ids = []
for chunk_filename in test_data.filenames:
    chunk_match = _chunk_name_pattern.search(chunk_filename)
    if chunk_match is None:
        # Name the offending file instead of failing with an AttributeError on None.
        raise ValueError(f"No song id found in test filename: {chunk_filename!r}")
    test_song_ids.append(int(chunk_match.group(1)))

# Sorted array of the distinct song ids present in the test split.
test_unique_song_ids = np.unique(test_song_ids)

Now load the audio samples that the human survey participants actually listened to.

In [None]:
# The sample clips (14s) have their own embedding file, described by the first
# dataset entry of the samples config.
with open("universal_music/NHS_samples.yaml", 'r') as samples_yaml:
    config = yaml.safe_load(samples_yaml)

first_entry = config['datasets'][0]
dataset_config = DatasetConfig(**first_entry)
embedding_filename = processor.get_embedding_path(dataset_config)
dataset = processor.process_h5_file(embedding_filename, dataset_config)

# Keep every clip here; nothing is filtered against the test set in this cell.
sample_filenames, sample_embeddings = dataset.filenames, dataset.embeddings

Find the sample clips whose songs were selected for the test set, and prepare their embeddings for model evaluation.

In [None]:
# Sample clip names look like "NAIV-<song id>.wav"; the dot is escaped so only a literal
# ".wav" matches.
_sample_name_pattern = re.compile(r"NAIV-(\d+)\.wav")
# A set gives constant-time membership checks; the NumPy array would be scanned per clip.
_test_song_id_set = set(np.asarray(test_unique_song_ids).tolist())

sample_test_embeddings = []
sample_song_ids = []
sample_test_labels = []  # not populated in this cell; kept so the name stays defined

for index, sample_filename in enumerate(sample_filenames):
    sample_match = _sample_name_pattern.search(sample_filename)
    if sample_match is None:
        # Name the offending file instead of failing with an AttributeError on None.
        raise ValueError(f"No song id found in sample filename: {sample_filename!r}")
    # Named `song_id` (not `id`) so the builtin `id` is not shadowed for later cells.
    song_id = int(sample_match.group(1))
    if song_id in _test_song_id_set:
        sample_test_embeddings.append(sample_embeddings[index])
        sample_song_ids.append(song_id)

if not sample_song_ids:
    # Without this guard the scaler rejects the empty 1-D array with a less helpful message.
    raise ValueError("No sample clip matches any song id in the test set")

# Rows of X_test_sample / X_test_scaled follow the order of sample_song_ids.
X_test_sample = np.array(sample_test_embeddings)
X_test_scaled = scaler.transform(X_test_sample)

Get the ground-truth function labels of the full test audio, then aggregate so that there is one label per song. This part will not be necessary when comparing directly to human ratings.

In [None]:
def find_id(filename):
    """Return the integer song id embedded in a full-song filename.

    Args:
        filename: A name or path containing ``Discography-<song id>_<number>.wav``.

    Returns:
        The ``<song id>`` digits converted to ``int``.

    Raises:
        ValueError: If the pattern does not occur in ``filename``.
    """
    # The dot is escaped so that only a literal ".wav" extension matches.
    match = re.search(r"Discography-(\d+)_\d+\.wav", filename)
    if match is None:
        # Name the offending file instead of failing with an AttributeError on None.
        raise ValueError(f"No song id found in filename: {filename!r}")
    return int(match.group(1))

# One row per test-set file: its class label, its filename, and the song id parsed from the name.
test_data_info = pd.DataFrame(data=zip(test_data.labels, test_data.filenames), columns=['labels','filenames'])
test_data_info['id'] = test_data_info.filenames.apply(find_id)

# Collapse to one row per song (index = song id), keeping the first row's values.
# NOTE(review): assumes every file of a song carries the same label — confirm.
labels = test_data_info.groupby('id').first()

# Look the labels up in `sample_song_ids` order so that y_true_sample[k] describes the same
# song as row k of X_test_scaled. The groupby index alone is sorted by song id and covers
# every test song, which need not match the order or number of the selected sample clips.
y_true_sample = np.array([label_to_idx[label] for label in labels['labels'].loc[sample_song_ids]])

In [None]:
# Predicted class index (a value of label_to_idx) for each selected sample clip;
# entries follow the row order of X_test_scaled, i.e. the order of sample_song_ids.
y_pred_sample = model.predict(X_test_scaled)

Load the human ratings, add in the prediction according to each of the three policies, and filter to song ids present in the test set.

In [None]:
# Full ratings table; only rows from the 'web' study are used below.
df = pd.read_csv('universal_music/FFfull.csv', low_memory=False)

web_df = (
    df.loc[df['study'] == 'web']
    # One saved rating vector per policy, attached positionally as a new column.
    .assign(
        generous=np.load("universal_music/web_survey_ratings_generous.npy"),
        random=np.load("universal_music/web_survey_ratings_random.npy"),
        strict=np.load("universal_music/web_survey_ratings_strict.npy"),
    )
    # Keep only ratings of songs whose sample clip was selected for the test set.
    .loc[lambda ratings: ratings['song'].isin(sample_song_ids)]
)

In [None]:
# Song id -> predicted class index (if a song id repeats, the later clip's prediction wins).
id_to_pred = {song_id: pred for song_id, pred in zip(sample_song_ids, y_pred_sample)}

# Attach the classifier's prediction to every rating row of that song.
web_df['model_pred'] = web_df['song'].map(id_to_pred)

# Slim view holding the song id, the three policy columns and the model prediction.
prediction_columns = ['song', 'generous', 'random', 'strict', 'model_pred']
sample_predictions = web_df[prediction_columns]