In [None]:
import os
import csv
import json
import time
import math
from collections import Counter

import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from torch.utils.data import random_split

In [None]:
class MusicXMLDataset(Dataset):
    """Genre-classification dataset backed by a preprocessed JSON file.

    Every entry is accessed through three keys: ``'mxl'`` (a path string used
    for filtering), ``'primary_genre'`` (the class label) and ``'tokens'``
    (a JSON-encoded list of token strings).
    """

    def __init__(self, json_path, tokenizer, max_len=512):
        """Read the entries, keep the selected subset and index the genres.

        Args:
            json_path: Path of the UTF-8 JSON file holding the list of entries.
            tokenizer: Callable applied to each token list in ``__getitem__``.
            max_len: Length every sequence is padded / truncated to.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len

        with open(json_path, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)

        # Only entries whose path contains one of these fragments are kept.
        path_markers = ("/mxl/0/", "/mxl/1/", "/mxl/2/", "/mxl/3/", "/mxl/4/")
        self.entries = [
            item for item in loaded
            if any(marker in item['mxl'] for marker in path_markers)
        ]

        # Class indices follow the alphabetical order of the genre names.
        unique_genres = sorted({item['primary_genre'] for item in self.entries})
        print("PRIMARY GENRES:", set(unique_genres))
        self.genre_to_idx = {name: position for position, name in enumerate(unique_genres)}
        self.idx_to_genre = dict(enumerate(unique_genres))

    def __len__(self):
        """Number of entries that survived the path filter."""
        return len(self.entries)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for entry ``idx``."""
        record = self.entries[idx]

        # The token list is stored as a JSON string inside the entry.
        words = json.loads(record['tokens'])

        tokenizer_options = {
            'is_split_into_words': True,   # the list is already split into words
            'add_special_tokens': True,
            'truncation': True,
            'padding': 'max_length',
            'max_length': self.max_len,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(words, **tokenizer_options)

        # Drop the leading batch dimension produced by return_tensors='pt'.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        class_index = self.genre_to_idx[record['primary_genre']]
        label = torch.tensor(class_index, dtype=torch.long)
        return input_ids, attention_mask, label


In [None]:
def train_model(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids`` / ``attention_mask`` keywords
            whose output exposes a ``.logits`` attribute.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable applied to ``(logits, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)


In [None]:
def evaluate_model(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Module called with ``input_ids`` / ``attention_mask`` keywords
            whose output exposes a ``.logits`` attribute.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``.
        criterion: Loss callable applied to ``(logits, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(mean_loss, accuracy, precision, recall, f1)`` where the last
        three are macro-averaged and undefined scores are reported as 0.
    """
    model.eval()
    loss_sum = 0.0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            loss_sum += criterion(logits, labels).item()

            # The predicted class is the index of the largest logit per row.
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='macro', zero_division=0
    )
    hits = sum(guess == truth for guess, truth in zip(predicted, expected))
    accuracy = hits / len(expected)
    mean_loss = loss_sum / len(dataloader)
    return mean_loss, accuracy, precision, recall, f1


In [None]:
def plot_training_curves(train_losses, val_losses, val_accuracies, val_precisions, val_recalls, val_f1s):
    """Draw loss and validation-metric curves side by side, save and show them.

    Each argument is a per-epoch sequence; the epoch axis starts at 1 and its
    length is taken from ``train_losses``. The figure is written to
    ``genre_curves.png`` before being displayed.
    """
    epochs = range(1, len(train_losses) + 1)
    fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: training vs. validation loss.
    loss_ax.plot(epochs, train_losses, label='Train Loss', linestyle='--', marker='o')
    loss_ax.plot(epochs, val_losses, label='Val Loss', linestyle='-', marker='o')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training & Validation Loss')

    # Right panel: one curve per validation metric.
    metric_series = (
        ('Accuracy', val_accuracies),
        ('Precision', val_precisions),
        ('Recall', val_recalls),
        ('F1 Score', val_f1s),
    )
    for curve_label, values in metric_series:
        metric_ax.plot(epochs, values, label=curve_label, marker='o')
    metric_ax.set_xlabel('Epochs')
    metric_ax.set_ylabel('Score')
    metric_ax.set_title('Validation Metrics')

    for axis in (loss_ax, metric_ax):
        axis.legend()
        axis.grid(True)

    fig.tight_layout()
    fig.savefig("genre_curves.png")
    plt.show()

In [None]:
def train_with_early_stopping(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10, patience=3):
    """Train with validation-loss early stopping and restore the best weights.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    validation loss improves, a snapshot of the weights is taken; training
    stops once ``patience`` consecutive epochs fail to improve. The best
    snapshot (if any) is loaded back into ``model`` and the training curves
    are plotted.

    Args:
        model: Model to train (modified in place).
        train_loader: Loader passed to ``train_model``.
        val_loader: Loader passed to ``evaluate_model``.
        optimizer: Optimizer passed to ``train_model``.
        criterion: Loss function used for both training and validation.
        device: Device the batches are moved to.
        num_epochs: Maximum number of epochs to run.
        patience: Number of consecutive non-improving epochs tolerated.

    Returns:
        The same ``model`` object, carrying the best-validation-loss weights.
    """
    # Local import so this cell stays runnable without touching the import cell.
    import copy

    train_losses = []
    val_losses = []
    val_accuracies = []
    val_precisions = []
    val_recalls = []
    val_f1s = []

    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        train_loss = train_model(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc, val_precision, val_recall, val_f1 = evaluate_model(model, val_loader, criterion, device)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)
        val_precisions.append(val_precision)
        val_recalls.append(val_recall)
        val_f1s.append(val_f1)

        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | "
              f"P: {val_precision:.4f} | R: {val_recall:.4f} | F1: {val_f1:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # state_dict() returns references to the live parameter tensors,
            # which later optimizer steps overwrite in place. Deep-copy so the
            # snapshot really holds this epoch's weights.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    # best_model_state stays None only if no epoch ever improved on +inf
    # (e.g. num_epochs == 0 or a NaN validation loss).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    plot_training_curves(train_losses, val_losses, val_accuracies, val_precisions, val_recalls, val_f1s)
    return model

In [None]:
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer handed to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed everything once, before any data splitting or model creation.
set_seed(42)

In [None]:
# Choose pre-trained model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load your dataset
json_path = "/kaggle/input/mlx-dataset/preprocessed_dataset.json"  # <-- put your JSON path here
dataset = MusicXMLDataset(json_path, tokenizer, max_len=512)
num_classes = len(dataset.genre_to_idx)

# Load pre-trained model with correct number of output classes
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# Freeze all layers initially
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last layer (classifier)
# NOTE(review): DistilBERT sequence-classification models appear to also carry a
# freshly initialised `pre_classifier` layer, which stays frozen here — confirm
# whether it should be trained together with `classifier`.
for param in model.classifier.parameters():
    param.requires_grad = True

# Train-validation-test split
train_size = int(0.8 * len(dataset))  # 80% for training
val_size = int(0.1 * len(dataset))    # 10% for validation
test_size = len(dataset) - train_size - val_size  # Remaining 10% for testing

# Split with a dedicated, explicitly seeded generator. Drawing from the global
# RNG would make the train/val/test membership depend on how much randomness
# earlier cells consumed (or on cells being re-run), silently changing which
# examples end up in the test set.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

# DataLoaders for training, validation, and testing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and Loss (only parameters left trainable above are optimised)
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
criterion = nn.CrossEntropyLoss()


In [None]:
# Train for at most 100 epochs; stop once 5 consecutive epochs bring no
# improvement in validation loss.
trained_model = train_with_early_stopping(
    model, train_loader, val_loader, optimizer, criterion, device,
    num_epochs=100, patience=5,
)

In [None]:
# Persist the trained weights (state dict only) next to the notebook.
checkpoint_path = "genre_classifier.pt"
torch.save(trained_model.state_dict(), checkpoint_path)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Function to evaluate the model on held-out data.
# NOTE: this reuses the name of the earlier 4-argument ``evaluate_model`` that
# ``train_with_early_stopping`` calls. After running this cell, re-run the
# earlier definition before training again in the same kernel.
def evaluate_model(model, test_dataloader, device):
    """Compute test metrics and the confusion matrix for ``model``.

    Args:
        model: Module called with ``input_ids`` / ``attention_mask`` keywords
            whose output exposes a ``.logits`` attribute.
        test_dataloader: Iterable of batches. A batch may be a
            ``(input_ids, attention_mask, labels)`` sequence (what
            ``MusicXMLDataset`` yields through a default ``DataLoader``) or a
            mapping with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, cm)`` where precision,
        recall and F1 are support-weighted averages and ``cm`` is the
        confusion matrix.
    """
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_dataloader:
            # The dataset returns tuples, which the default collate function
            # turns into a list/tuple of tensors; indexing such a batch with
            # string keys fails. Mapping-style batches remain supported.
            if isinstance(batch, (list, tuple)):
                input_ids, attention_mask, labels = batch
            else:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Only the logits are needed, so labels are not passed to the model.
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Calculating accuracy, precision, recall, f1 score
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted', zero_division=0
    )

    # Confusion matrix. When the model advertises its number of classes
    # (assumed to be `model.config.num_labels`, as on Hugging Face models),
    # pin the label set so classes missing from this split still get a
    # row/column and the matrix lines up with the full class-name list.
    num_labels = getattr(getattr(model, "config", None), "num_labels", None)
    label_ids = list(range(num_labels)) if num_labels else None
    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)

    return accuracy, precision, recall, f1, cm

# Function to plot the confusion matrix
def plot_confusion_matrix(cm, class_names):
    """Render ``cm`` as an annotated heatmap, save it and show it.

    Drawn with matplotlib only: the notebook never imports seaborn, so the
    previous ``sns.heatmap`` call raised ``NameError``.

    Args:
        cm: Square array-like of integer counts (rows: true labels,
            columns: predicted labels).
        class_names: Tick labels, one per class, in class-index order.
    """
    matrix = np.asarray(cm)
    tick_labels = [str(name) for name in class_names]

    fig, ax = plt.subplots(figsize=(8, 6))
    image = ax.imshow(matrix, interpolation='nearest', cmap='Blues')
    fig.colorbar(image, ax=ax)

    tick_positions = np.arange(len(tick_labels))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    ax.set_yticklabels(tick_labels)

    # Write the count into every cell; use white text on dark cells so the
    # numbers stay readable across the colour map.
    midpoint = matrix.max() / 2.0 if matrix.size else 0.0
    for row in range(matrix.shape[0]):
        for col in range(matrix.shape[1]):
            count = matrix[row, col]
            ax.text(
                col, row, format(int(count), 'd'),
                ha='center', va='center',
                color='white' if count > midpoint else 'black',
            )

    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    fig.tight_layout()
    fig.savefig("confusion_matrix")
    plt.show()

# Evaluate on the held-out test split created earlier (`test_loader`) and plot
# the confusion matrix. Class names come from the dataset's index->genre
# mapping, listed in class-index order.
accuracy, precision, recall, f1, cm = evaluate_model(model, test_loader, device)
print(f"Test Acc: {accuracy:.4f} | P: {precision:.4f} | R: {recall:.4f} | F1: {f1:.4f}")
class_names = [dataset.idx_to_genre[idx] for idx in range(len(dataset.idx_to_genre))]
plot_confusion_matrix(cm, class_names=class_names)
