In [1]:
!pip install datasets
!sudo apt install ffmpeg

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Download dataset via Kaggle commandline API
!kaggle datasets download -d jorgeruizdev/ludwig-music-dataset-moods-and-subgenres
!unzip ludwig-music-dataset-moods-and-subgenres.zip -d ludwig

# MP3 to NPY (Mel-Spectrogram)
**For this part, you must upload the train.csv and test.csv files from the GitHub Repository**

In [3]:
STR_CLIP_ID = 'clip_id'
STR_AUDIO_SIGNAL = 'audio_signal'
STR_TARGET_VECTOR = 'target_vector'


STR_CH_FIRST = 'channels_first'
STR_CH_LAST = 'channels_last'

import io
import os
import tqdm
import logging
import subprocess
from typing import Tuple
from pathlib import Path

# import librosa
import numpy as np
import soundfile as sf

import itertools
from numpy.fft import irfft

def _resample_load_ffmpeg(path: str, sample_rate: int, downmix_to_mono: bool) -> Tuple[np.ndarray, int]:
    """
    Decoding, downmixing, and downsampling by librosa.
    Returns a channel-first audio signal.

    Args:
        path:
        sample_rate:
        downmix_to_mono:

    Returns:
        (audio signal, sample rate)
    """

    def _decode_resample_by_ffmpeg(filename, sr):
        """decode, downmix, and resample audio file"""
        channel_cmd = '-ac 1 ' if downmix_to_mono else ''  # downmixing option
        resampling_cmd = f'-ar {str(sr)}' if sr else ''  # downsampling option
        cmd = f"ffmpeg -i \"{filename}\" {channel_cmd} {resampling_cmd} -f wav -"
        p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
        return out

    src, sr = sf.read(io.BytesIO(_decode_resample_by_ffmpeg(path, sr=sample_rate)))
    return src.T, sr


def _resample_load_librosa(path: str, sample_rate: int, downmix_to_mono: bool, **kwargs) -> Tuple[np.ndarray, int]:
    """
    Decoding, downmixing, and downsampling by librosa.
    Returns a channel-first audio signal.
    """
    src, sr = librosa.load(path, sr=sample_rate, mono=downmix_to_mono, **kwargs)
    return src, sr


def load_audio(
    path: str or Path,
    ch_format: str,
    sample_rate: int = None,
    downmix_to_mono: bool = False,
    resample_by: str = 'ffmpeg',
    **kwargs,
) -> Tuple[np.ndarray, int]:
    """A wrapper of librosa.load that:
        - forces the returned audio to be 2-dim,
        - defaults to sr=None, and
        - defaults to downmix_to_mono=False.

    The audio decoding is done by `audioread` or `soundfile` package and ultimately, often by ffmpeg.
    The resampling is done by `librosa`'s child package `resampy`.

    Args:
        path: audio file path
        ch_format: one of 'channels_first' or 'channels_last'
        sample_rate: target sampling rate. if None, use the rate of the audio file
        downmix_to_mono:
        resample_by (str): 'librosa' or 'ffmpeg'. it decides backend for audio decoding and resampling.
        **kwargs: keyword args for librosa.load - offset, duration, dtype, res_type.

    Returns:
        (audio, sr) tuple
    """
    if ch_format not in (STR_CH_FIRST, STR_CH_LAST):
        raise ValueError(f'ch_format is wrong here -> {ch_format}')

    if os.stat(path).st_size > 8000:
        if resample_by == 'librosa':
            src, sr = _resample_load_librosa(path, sample_rate, downmix_to_mono, **kwargs)
        elif resample_by == 'ffmpeg':
            src, sr = _resample_load_ffmpeg(path, sample_rate, downmix_to_mono)
        else:
            raise NotImplementedError(f'resample_by: "{resample_by}" is not supposred yet')
    else:
        raise ValueError('Given audio is too short!')
    return src, sr

In [4]:
"""
Code modified and adapted from https://github.com/seungheondoh/lp-music-caps/blob/main/lpmc/music_captioning/preprocessor.py
"""
import os
import multiprocessing
import numpy as np
import csv
import sys
from tqdm import tqdm

# hard coding hparams
DATASET_PATH = "/content/ludwig"
MUSIC_SAMPLE_RATE = 16000 # resampling rate of MERT
DURATION = 30 # whisper expected input length
DATA_LENGTH = int(MUSIC_SAMPLE_RATE * DURATION)

def get_all_audio_paths():
    # Directory where the mp3 files are stored
    root_dir = DATASET_PATH + "/mp3/mp3/"

    # This list will hold all the mp3 file paths
    mp3_paths = []

    # Walk through the directory and subdirectories
    for genre_dir, _, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.mp3'):  # Filter for mp3 files
                mp3_paths.append(os.path.join(genre_dir, file))
    return mp3_paths

def get_audio_paths(audio_csv):
    test_paths = []
    csv.field_size_limit(sys.maxsize)
    with open(audio_csv, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            test_paths.append(row['id'])
    all_paths = get_all_audio_paths()
    print(f"Found {len(all_paths)} audio clips")
    audio_paths = [
      path for path in all_paths
      if path.rpartition('/')[-1].rpartition('.mp3')[0] in test_paths
    ]
    print(f"Loading {len(audio_paths)} audio clips")
    return audio_paths

def msd_resampler(sample):
    path = sample
    save_name = os.path.join(DATASET_PATH,'npy', path.rpartition('/')[-1].replace(".mp3",".npy"))
    try:
        src, _ = load_audio(
            path=path,
            ch_format= STR_CH_FIRST,
            sample_rate= MUSIC_SAMPLE_RATE,
            downmix_to_mono= True)
    except ValueError as err:
        print(f"{err} for {sample}")
        return
    if src.shape[-1] < DATA_LENGTH: # short case
        pad = np.zeros(DATA_LENGTH)
        pad[:src.shape[-1]] = src
        src = pad
    elif src.shape[-1] > DATA_LENGTH: # too long case
        src = src[:DATA_LENGTH]

    if not os.path.exists(os.path.dirname(save_name)):
        os.makedirs(os.path.dirname(save_name), exist_ok=True)
    np.save(save_name, src.astype(np.float32))

def main():
    all_samples = get_audio_paths("train.csv")
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        for _ in tqdm(pool.imap_unordered(msd_resampler, all_samples), total=len(all_samples)):
            pass
    print("finish extract")

if __name__ == '__main__':
    main()

Found 11294 audio clips
Loading 3268 audio clips


100%|██████████| 3268/3268 [03:59<00:00, 13.66it/s]

finish extract





In [5]:
!mv /content/ludwig/npy /content/ludwig/npy_train

In [6]:
def main():
    all_samples = get_audio_paths("test.csv")
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        for _ in tqdm(pool.imap_unordered(msd_resampler, all_samples), total=len(all_samples)):
            pass
    print("finish extract")

if __name__ == '__main__':
    main()

Found 11294 audio clips
Loading 318 audio clips


100%|██████████| 318/318 [00:23<00:00, 13.47it/s]

finish extract





In [7]:
!mv /content/ludwig/npy /content/ludwig/npy_test

# Dataset, Linear Probing Model, and Training Utilities

In [1]:
import torch
from torch import nn, optim
import torchaudio.transforms as T
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, Dataset
import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import Wav2Vec2Processor
import torchaudio.transforms as T
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import hamming_loss

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        #print('\nset_true: {0}'.format(set_true))
        #print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

def compute_metrics(p):
    # Convert predictions and labels to tensors
    logits, labels = p

    # Apply sigmoid to logits to get probabilities
    predictions = torch.sigmoid(torch.tensor(logits)).cpu().numpy()

    # Binarize the predictions to get 0 or 1 for multilabel classification
    predictions = (predictions > 0.3).astype(int)

    hamming_loss_val = hamming_loss(labels, predictions)
    hamming_score_val = hamming_score(labels, predictions)

    return {'hamming_score': hamming_score_val, 'hamming_loss': hamming_loss_val}

class LudwigDataset(Dataset):
    def __init__(self, csv_path, npy_folder, processor, mlb=None, fit_mlb=False):
        # Load the CSV file
        df = pd.read_csv(csv_path)

        # Filter rows based on the presence of corresponding .npy files in the npy folder
        valid_ids = [
            id_ for id_ in df['id'].astype(str)
            if os.path.isfile(os.path.join(npy_folder, f"{id_}.npy"))
        ]
        self.dataset = df[df['id'].astype(str).isin(valid_ids)].reset_index(drop=True)

        self.npy_folder = npy_folder
        self.processor = processor
        self.resample_rate = processor.sampling_rate
        print(f"Dataset size: {len(self.dataset)}")
        print(f"Sample rate: {self.resample_rate}")

        # One-hot encode subgenre labels
        if mlb is None:
            self.mlb = MultiLabelBinarizer()
        else:
            self.mlb = mlb

        if fit_mlb:
            self.labels = self.mlb.fit_transform(self.dataset['subgenres'].apply(eval))
        else:
            self.labels = self.mlb.transform(self.dataset['subgenres'].apply(eval))

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Load the audio file from npy folder
        row = self.dataset.iloc[idx]
        audio_path = os.path.join(self.npy_folder, f"{row['id']}.npy")
        audio = np.load(audio_path, mmap_mode='r')

        # Load one-hot encoded labels
        labels = torch.tensor(self.labels[idx], dtype=torch.float)

        # Process audio
        inputs = self.processor(audio, sampling_rate=self.resample_rate, return_tensors="pt", padding=True)
        inputs["labels"] = labels

        return {key: val.squeeze(0) for key, val in inputs.items()}


In [2]:
# design class weights for imbalanced dataset

df = pd.read_csv("/content/train.csv")
npy_folder = "/content/ludwig/npy_train"

# Filter rows based on the presence of corresponding .npy files in the npy folder
valid_ids = [
    id_ for id_ in df['id'].astype(str)
    if os.path.isfile(os.path.join(npy_folder, f"{id_}.npy"))
]
dataset = df[df['id'].astype(str).isin(valid_ids)].reset_index(drop=True)

# One-hot encode subgenre labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(dataset['subgenres'].apply(eval))

# Calculate class weights based on label frequencies
label_counts = np.sum(labels, axis=0)  # Sum across all samples for each class
print(f"Class Counts:\n{label_counts}")
total_samples = labels.shape[0]
class_frequencies = label_counts / total_samples  # Frequency of each class (subgenre)

# Inverse of the class frequencies to compute class weights
class_weights = total_samples / (len(mlb.classes_) * label_counts)

# Assign computed class weights to self.class_weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print(f"Class Weights:\n{class_weights}")

Class Counts:
[120 113   8  14  20   5  11  90 142 120  88 146 126 119 193 129 114 174
 182 104 117  65 114  25  55  72 101  80  13  34  15   1  21  25 128  91
 113  31 169 113 118 110 139 144 156 105 168  66  57 118 124 127  53]
Class Weights:
tensor([ 0.5138,  0.5457,  7.7075,  4.4043,  3.0830, 12.3321,  5.6055,  0.6851,
         0.4342,  0.5138,  0.7007,  0.4223,  0.4894,  0.5182,  0.3195,  0.4780,
         0.5409,  0.3544,  0.3388,  0.5929,  0.5270,  0.9486,  0.5409,  2.4664,
         1.1211,  0.8564,  0.6105,  0.7708,  4.7431,  1.8135,  4.1107, 61.6604,
         2.9362,  2.4664,  0.4817,  0.6776,  0.5457,  1.9890,  0.3649,  0.5457,
         0.5225,  0.5605,  0.4436,  0.4282,  0.3953,  0.5872,  0.3670,  0.9342,
         1.0818,  0.5225,  0.4973,  0.4855,  1.1634], device='cuda:0')


In [3]:
from transformers import AutoModel, Trainer, TrainingArguments, Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
from torch.nn.functional import binary_cross_entropy_with_logits

# Load DistilHuBERT processor and model
processor = Wav2Vec2FeatureExtractor.from_pretrained("pedromatias97/genre-recognizer-finetuned-gtzan_dset")
model = AutoModelForAudioClassification.from_pretrained(
    "pedromatias97/genre-recognizer-finetuned-gtzan_dset",
    num_labels=53,
    ignore_mismatched_sizes=True,
)
# set problem type for HuggingFace config
model.config.problem_type = "multi_label_classification"

# print model layers
for name, param in model.named_parameters():
    print(name)

"""
Layers:
  - hubert
  - projector (256 out)
  - classifier (256 in, num_classes out)
"""

# Replace the classification head with a multilabel-compatible one
model.classifier = nn.Linear(256, 53) # 256 comes from output dims of projector layer

# Override the forward method to handle logits directly for BCEWithLogitsLoss
def forward_with_loss(input_values, attention_mask=None, labels=None):
    # feature extractor (hubert)
    outputs = model.hubert(input_values, attention_mask=attention_mask)  # Using feature extractor from Hubert

    # embeddings through the projector layer
    projected_features = model.projector(outputs.last_hidden_state[:, 0, :])  # Assuming using CLS token for classification

    #  projected features through the classifier
    logits = model.classifier(projected_features)

    # print(torch.sigmoid(logits))
    if labels is not None:
        loss = binary_cross_entropy_with_logits(logits, labels.float(), weight=class_weights)
        return {"loss": loss, "logits": logits}
    return {"logits": logits}

model.forward = forward_with_loss

print(f"Model Parameters: {sum(p.numel() for p in model.parameters())/1e6:0.01f}M")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at pedromatias97/genre-recognizer-finetuned-gtzan_dset and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([10, 256]) in the checkpoint and torch.Size([53, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([53]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


hubert.masked_spec_embed
hubert.feature_extractor.conv_layers.0.conv.weight
hubert.feature_extractor.conv_layers.0.layer_norm.weight
hubert.feature_extractor.conv_layers.0.layer_norm.bias
hubert.feature_extractor.conv_layers.1.conv.weight
hubert.feature_extractor.conv_layers.2.conv.weight
hubert.feature_extractor.conv_layers.3.conv.weight
hubert.feature_extractor.conv_layers.4.conv.weight
hubert.feature_extractor.conv_layers.5.conv.weight
hubert.feature_extractor.conv_layers.6.conv.weight
hubert.feature_projection.projection.weight
hubert.feature_projection.projection.bias
hubert.encoder.pos_conv_embed.conv.bias
hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0
hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1
hubert.encoder.layer_norm.weight
hubert.encoder.layer_norm.bias
hubert.encoder.layers.0.attention.k_proj.weight
hubert.encoder.layers.0.attention.k_proj.bias
hubert.encoder.layers.0.attention.v_proj.weight
hubert.encoder.layers.0.attention.v_

# Train MERT on Multi-label Audio Clips

In [4]:
import os
os.environ["WANDB_DISABLED"] = "true"

# Prepare dataset and dataloaders
train_dataset = LudwigDataset(
    csv_path='train.csv',
    npy_folder='/content/ludwig/npy_train',
    processor=processor,
    fit_mlb=True  # Fit the MultiLabelBinarizer on the training data
)

mlb = train_dataset.mlb

val_dataset = LudwigDataset(
    csv_path='test.csv',
    npy_folder='/content/ludwig/npy_test',
    processor=processor,
    mlb=mlb,  # Use the fitted MultiLabelBinarizer
    fit_mlb=False  # Do not refit on test data
)

# Instantiate the model
# model = GenreMultilabelModel(model, num_classes=len(train_dataset.mlb.classes_))
print(f"Number of classes: {len(train_dataset.mlb.classes_)}")

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # Evaluate based on steps
    eval_steps=50,  # Evaluate every 500 steps
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.5,
    # lr_scheduler_type="linear",
    # warmup_steps=500,
    logging_dir='./logs',
    fp16=True,
    # bf16=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Dataset size: 3268
Sample rate: 16000
Dataset size: 318
Sample rate: 16000
Number of classes: 53


Step,Training Loss,Validation Loss,Hamming Score,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
50,No log,0.288524,0.058005,0.305031,10.1264,31.403,3.95
100,No log,0.147,0.000393,0.044084,10.0062,31.78,3.998


KeyboardInterrupt: 

In [88]:
# trainer.evaluate()
import torch
torch.save(trainer.model.state_dict(), "model_weights_1227.pth")

# OneDrive Uploading

In [None]:
!curl https://rclone.org/install.sh | sudo bash
!mkdir /root/.config/rclone/
config = """[jmc]
type = onedrive
client_id = 9a4d2af8-46e3-49b7-959d-0d67b733a868
client_secret = S7F8Q~KuxCozDp69BJoWUwOZVDPURsnQpLQRpcRE
token = {"access_token":"eyJ0eXAiOiJKV1QiLCJub25jZSI6ImVOaTVYOEJFV2g4SGJ5cFdMdXFqX2d4eVh4X2RyeGtvTnlzX3gzM2hnTUUiLCJhbGciOiJSUzI1NiIsIng1dCI6Inp4ZWcyV09OcFRrd041R21lWWN1VGR0QzZKMCIsImtpZCI6Inp4ZWcyV09OcFRrd041R21lWWN1VGR0QzZKMCJ9.eyJhdWQiOiIwMDAwMDAwMy0wMDAwLTAwMDAtYzAwMC0wMDAwMDAwMDAwMDAiLCJpc3MiOiJodHRwczovL3N0cy53aW5kb3dzLm5ldC81ODljNzZmNS1jYTE1LTQxZjktODg0Yi01NWVjMTVhMDY3MmEvIiwiaWF0IjoxNzMyODMyNDE4LCJuYmYiOjE3MzI4MzI0MTgsImV4cCI6MTczMjgzNDUxOCwiYWNjdCI6MCwiYWNyIjoiMSIsImFjcnMiOlsiYzEiXSwiYWlvIjoiQVZRQXEvOFlBQUFBQUhsYmoxRW13eWRqWjlWZk9JOERabkszT1hCRXpUamxCUUEvY2dkVkYvY0hWUVQ3K0I5NzRLU0ZXYzR1amZoK0dNRFBKdFRuSFB5UndWZlJlR0dvS2U2QjhHL3dUVGpDYU44d2tXdUE3bVU9IiwiYW1yIjpbInB3ZCIsIm1mYSJdLCJhcHBfZGlzcGxheW5hbWUiOiJyY2xvbmUiLCJhcHBpZCI6IjlhNGQyYWY4LTQ2ZTMtNDliNy05NTlkLTBkNjdiNzMzYTg2OCIsImFwcGlkYWNyIjoiMSIsImZhbWlseV9uYW1lIjoiQ2hhbiIsImdpdmVuX25hbWUiOiJKYXJlZCIsImlkdHlwIjoidXNlciIsImlwYWRkciI6IjEwOC4yNi4xNzkuMjAzIiwibmFtZSI6IkNoYW4sIEphcmVkIiwib2lkIjoiZGNhZWFiZGMtOWJhOS00MDU0LWFiMzEtYjBlMjliMDhjMGQxIiwib25wcmVtX3NpZCI6IlMtMS01LTIxLTEwMjk5ODcxNTQtMTMzMDczMzExMC0zMjY1NjkxNDctMTYwMzU3IiwicGxhdGYiOiIzIiwicHVpZCI6IjEwMDMyMDAwQzE1OTA3OTUiLCJyaCI6IjEuQVVVQTlYYWNXQlhLLVVHSVMxWHNGYUJuS2dNQUFBQUFBQUFBd0FBQUFBQUFBQUJmQWNwRkFBLiIsInNjcCI6IkZpbGVzLlJlYWQgRmlsZXMuUmVhZC5BbGwgRmlsZXMuUmVhZFdyaXRlIEZpbGVzLlJlYWRXcml0ZS5BbGwgU2l0ZXMuUmVhZC5BbGwgcHJvZmlsZSBvcGVuaWQgZW1haWwiLCJzdWIiOiJhUnpnMUp3NmJoWHg5di1VaGtXVHY4OEsycmJrbS1TYWZBaWhua0lDeUxzIiwidGVuYW50X3JlZ2lvbl9zY29wZSI6Ik5BIiwidGlkIjoiNTg5Yzc2ZjUtY2ExNS00MWY5LTg4NGItNTVlYzE1YTA2NzJhIiwidW5pcXVlX25hbWUiOiJqY2hhbjNAd3BpLmVkdSIsInVwbiI6ImpjaGFuM0B3cGkuZWR1IiwidXRpIjoicjUwYXNoSE9aRUNXWjdQM2N6MG1BQSIsInZlciI6IjEuMCIsIndpZHMiOlsiYjc5ZmJmNGQtM2VmOS00Njg5LTgxNDMtNzZiMTk0ZTg1NTA5Il0sInhtc19pZHJlbCI6IjEgMjYiLCJ4bXNfc3QiOnsic3ViIjoiTVd2MDlVNWxIN0E2UHRfM0ZaWXAxMk9LbC1ySHZRbDA2NU5ycjNNamNzNCJ9LCJ4bXNfdGNkdCI6MTQxMTQwMzMzOX0.pRiGvIdsURBnLQ3Rc885B6Dx6mlXQw7pfLGkrzalIKPVRfF-lurSf5es73XeyG81jK2BXvJ5cdUXnsA1Vu2wmsCTM5Mdbc-9p8EGn8Q5PB14CZqY6YqGHAkBa-qY-vDlj1rEptKSS_Xo_Rk83ffDVtsG_K6kdhlEw3H90ebm27Wk64bL_WupBNJxhFwV9fpBi6rQkFmxMMUeolE_usrcDRb3_Y7H0qzkzHZxeEc6voLSHXgOmMBCUBqcFWneKmzSGnO2Qh64sA5faB-dkvWF0fMaoezoMK9k_3gkvjTze80tiqKvW9NYVB_7hjtAXpRja8Pb5-ZQ3K6GehnSlVNpaQ","token_type":"Bearer","refresh_token":"1.AUUA9XacWBXK-UGIS1XsFaBnKvgqTZrjRrdJlZ0NZ7czqGhfAcpFAA.AgABAwEAAADW6jl31mB3T7ugrWTT8pFeAwDs_wUA9P-jXM1KD7yRqfeiTw3RiwIubrIF3Ru6z0rD0t8BwAd9WHuaeAHFaD-OcyXaBbYRJCG-RPuwzPtGv3E61da-0-2b8fMbVQkC4xftVp4OM3UIQD1wzH79bf_UnxcSYbsW05O5FsHJw8c73sNZy9ko42Ke409Mx49EdJg7Ibb6sUJlO3sKAD1k3PtidlArw8Lu-eTlF0pFEoiJMMl7tutXcZSiQSP4UoeVVOyuNAhKI47h1QVJk1NtkaD2FtriQ7vY2iCMpfUpSQLa3obbCXk9G4hIMXxAAXcN0tJTWAPm3nI3R6PTnDhBeT30mG_C2qukK1bzAGll67EzV1jCp2faSHZiPOgtviYOln46_miRFo0vLR7WyaTdBuUi3nRq49sPKS8pJuPMPCjyKTDckyx8UBUNL4nzd4oevuEcBtuC8gHnZ_cYnMCZ5FfSTWc8scV8aTtMb0is0u2I6T3BgfWiFmu2QODPp8LAsvnmvzizcxxP7fpil3_P2IgCr_v5qgXegtKcywRN50WEPebUEiLtPZzxzuBrMP_ocIBLuBN1zo_jiU9LKylkFl7T9_dQOBPUfjuZP4nct1JobZeOYPiQRCfuOFn0TlwaPBcAijL-iULjvpebvhBuYYXXsovmvfWMkhUJPdMQxQNSFwAUYNVGoScMHHRTDnPRDy2RMhco-rbcqsYEMa5eacYSO_mUDdwY51JIEH7p1Q-qwoYa3NZq0IOdr8Amke-mXEFfBsmZx_OIx_NSQbZ-1nwI9Fm3u72FFR_RpfIm5niqa7ieZrqKHeO-wIr08dR6","expiry":"2024-11-28T17:55:17.8636308-05:00"}
drive_id = b!xveGKI1Ss0mbds2TAgtmMp5-fY1UTg5ImOj-tm2d6hHY3mzzWk7pQ6dT8j8Ke4Im
drive_type = business
"""
with open('/root/.config/rclone/rclone.conf', 'w') as file:
  file.write(config)

In [89]:
!rclone copy /content/model_weights_1227.pth jmc:musiccaps/mert_ckpt/pth_1227 -v --stats 10s

2024/12/03 21:45:02 INFO  : 
Transferred:   	       50 MiB / 360.242 MiB, 14%, 5.556 MiB/s, ETA 55s
Transferred:            0 / 1, 0%
Elapsed time:        14.3s
Transferring:
 *                        model_weights_1227.pth: 13% /360.242Mi, 5.556Mi/s, 55s

2024/12/03 21:45:12 INFO  : 
Transferred:   	      140 MiB / 360.242 MiB, 39%, 7.425 MiB/s, ETA 29s
Transferred:            0 / 1, 0%
Elapsed time:        24.3s
Transferring:
 *                        model_weights_1227.pth: 38% /360.242Mi, 7.425Mi/s, 29s

2024/12/03 21:45:22 INFO  : 
Transferred:   	      240 MiB / 360.242 MiB, 67%, 8.684 MiB/s, ETA 13s
Transferred:            0 / 1, 0%
Elapsed time:        34.3s
Transferring:
 *                        model_weights_1227.pth: 66% /360.242Mi, 8.684Mi/s, 13s

2024/12/03 21:45:32 INFO  : 
Transferred:   	      340 MiB / 360.242 MiB, 94%, 9.344 MiB/s, ETA 2s
Transferred:            0 / 1, 0%
Elapsed time:        44.3s
Transferring:
 *                        model_weights_1227.pth: 94% /

# Evaluation

In [96]:
import torch
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import numpy as np
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the evaluation function
def evaluate_model(model, dataset, batch_size=16, threshold=0.5):
    # Create a DataLoader for batching
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    model.eval()  # Set model to evaluation mode
    y_true = []
    y_pred = []

    with torch.no_grad():  # Disable gradient computation for inference
        for batch in dataloader:
            # Get the inputs and labels from the batch
            inputs = batch['input_values'].to(device)  # Assuming 'input_values' contains the features
            labels = batch['labels'].cpu().numpy()  # True labels in multi-hot format

            # Forward pass through the model
            logits = model(inputs)['logits']  # Logits of the model output

            # Apply sigmoid to logits for multi-label classification
            pred_probs = torch.sigmoid(logits).cpu().numpy()  # Get probabilities
            # print(pred_probs)

            # Apply threshold to get predicted labels
            pred_labels = (pred_probs > threshold).astype(int)

            # Store true and predicted labels
            y_true.append(labels)
            y_pred.append(pred_labels)

    # Flatten the lists of true and predicted labels
    y_true = np.concatenate(y_true, axis=0)
    y_pred = np.concatenate(y_pred, axis=0)

    # Compute classification report
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=train_dataset.mlb.classes_))

    # Compute multilabel confusion matrix
    print("\nMultilabel Confusion Matrix:")
    conf_matrix = multilabel_confusion_matrix(y_true, y_pred)
    print(conf_matrix)

    return classification_report(y_true, y_pred, output_dict=True), conf_matrix


In [97]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor, pipeline

# loading our model weights
mert_model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M",
                                  trust_remote_code=True)
# Load the saved model and processor
model = MERTMultilabelModel(mert_model, num_classes=53)

# Load the weights into the model
model.load_state_dict(torch.load("model_weights_1227.pth"), strict=False)
model.to(device)

# loading the corresponding preprocessor config
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-95M",
                                                     trust_remote_code=True)

mlb = train_dataset.mlb

test_dataset = LudwigDataset(
    csv_path='test.csv',
    npy_folder='/content/ludwig/npy_test',
    processor=processor,
    mlb=mlb,  # Use the fitted MultiLabelBinarizer
    fit_mlb=False  # Do not refit on test data
)
print(test_dataset[100]["labels"])

# Evaluate on test set
evaluation_results, conf_matrix = evaluate_model(model, test_dataset, batch_size=2, threshold=0.07)

  model.load_state_dict(torch.load("model_weights_1227.pth"), strict=False)


Dataset size: 318
Sample rate: 24000
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Classification Report:
                              precision    recall  f1-score   support

       blues---country blues       0.00      0.00      0.00         9
      blues---electric blues       0.00      0.00      0.00         6
         classical---baroque       0.00      0.00      0.00         2
       classical---classical       0.00      0.00      0.00         3
          classical---modern       0.00      0.00      0.00         3
           classical---opera       0.00      0.00      0.00         1
        classical---romantic       0.00      0.00      0.00         3
        electronic---ambient       0.00      0.00      0.00        16
          electronic---disco       0.12      0.64      0.20        28
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
