<a href="https://colab.research.google.com/github/15gk/emotion-text/blob/main/MusicGenerationPretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
%%capture
!pip install torch transformers music21 pandas

In [56]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import MusicgenForConditionalGeneration, AutoProcessor, Trainer, TrainingArguments
from music21 import converter, instrument, note, chord
import numpy as np


In [57]:
# Pick the compute device once: CUDA when PyTorch reports it available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [58]:
class EMOPIAMusicDataset(Dataset):
    """Map-style dataset pairing tokenised scores with emotion labels.

    Each score is reduced to a list of string tokens: a single note becomes
    ``str(element.pitch)`` and a chord becomes its ``normalOrder`` entries
    joined by ``'.'``.  The sorted set of all tokens is the vocabulary
    (``self.notes``); every score is stored as a list of vocabulary indices
    in ``self.processed_sequences``.

    Args:
        midi_files: Sequence of parsed scores. Each must expose ``.parts``,
            and each part must expose ``.recurse()``.
        emotions: Sequence of 1-based integer labels aligned with
            ``midi_files``.
        emotion_mapping: Label-to-name mapping; stored unchanged.
    """

    def __init__(self, midi_files, emotions, emotion_mapping):
        self.midi_files = midi_files
        self.emotions = emotions
        self.emotion_mapping = emotion_mapping

        # Walk every score exactly once and keep its tokens; both the
        # vocabulary and the index sequences are derived from this cache
        # instead of re-traversing each score a second time.
        self._file_notes = [self._extract_notes(score) for score in self.midi_files]

        # Create note-to-index mapping
        self.notes = self._extract_all_notes()
        self.note_to_index = {token: idx for idx, token in enumerate(self.notes)}
        self.index_to_note = {idx: token for token, idx in self.note_to_index.items()}

        # Convert the cached token lists into index sequences
        self.processed_sequences = self._process_midi_files()

    def _extract_all_notes(self):
        """Return the sorted list of distinct tokens seen across all scores."""
        return sorted({token for tokens in self._file_notes for token in tokens})

    def _extract_notes(self, midi_file):
        """Return the token list of one score, in traversal order.

        Elements that are neither ``note.Note`` nor ``chord.Chord`` are
        ignored.
        """
        tokens = []
        for part in midi_file.parts:
            for element in part.recurse():
                if isinstance(element, note.Note):
                    tokens.append(str(element.pitch))
                elif isinstance(element, chord.Chord):
                    tokens.append('.'.join(str(n) for n in element.normalOrder))
        return tokens

    def _process_midi_files(self):
        """Return one list of vocabulary indices per score."""
        return [
            [self.note_to_index[token] for token in tokens]
            for tokens in self._file_notes
        ]

    def __len__(self):
        return len(self.midi_files)

    def __getitem__(self, idx):
        """Return ``{'input_ids': LongTensor, 'labels': 0-d LongTensor}``."""
        return {
            # Explicit dtype: an empty sequence would otherwise become a
            # float32 tensor, which an embedding lookup cannot consume.
            'input_ids': torch.tensor(self.processed_sequences[idx], dtype=torch.long),
            # Convert the 1-based label to a 0-based class index
            'labels': torch.tensor(self.emotions[idx] - 1, dtype=torch.long)
        }


In [59]:
import os

# Export WANDB_DISABLED='true' into this process's environment.
# NOTE(review): presumably this switches off Weights & Biases logging in
# libraries that honour the variable — confirm it is still needed.
os.environ.update({'WANDB_DISABLED': 'true'})

In [70]:
class EmotionMusicGenerationModel(nn.Module):
    """Emotion classifier over note-index sequences plus a text-prompted MusicGen wrapper.

    The module has two independent paths selected by ``forward``'s arguments:

    * classification: note indices -> embedding -> mean over dim 1 -> MLP
      -> emotion logits. Only ``note_embedding`` and ``emotion_classifier``
      take part in this path.
    * generation: a 0-based emotion label is mapped to a fixed text prompt,
      which is passed through the processor to ``musicgen_model.generate``.

    Args:
        num_notes: Size of the note vocabulary (rows of the embedding table).
        num_emotions: Number of emotion classes produced by the classifier.
        model_name: Identifier passed to ``from_pretrained`` for both the
            MusicGen model and its processor.
    """

    def __init__(self, num_notes, num_emotions, model_name="facebook/musicgen-small"):
        super().__init__()

        # Emotion text mapping for generation (keys are 0-based class indices)
        self.emotion_text_mapping = {
            0: "High energy, exciting, and happy music",
            1: "Calm and positive music with gentle progression",
            2: "Tense and dramatic music with intense feelings",
            3: "Peaceful and soft music with subtle emotional depth"
        }

        # Load pre-trained MusicGen model
        self.musicgen_model = MusicgenForConditionalGeneration.from_pretrained(model_name)
        self.processor = AutoProcessor.from_pretrained(model_name)

        # Custom embedding and classification layers
        self.note_embedding = nn.Embedding(num_notes, 512)
        self.emotion_classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),  # Add dropout for regularization
            nn.Linear(256, num_emotions)
        )

        # Loss function
        self.emotion_loss = nn.CrossEntropyLoss()

    def forward(self, input_ids=None, labels=None):
        """Classify a batch of note sequences, or generate audio for an emotion.

        Args:
            input_ids: Integer tensor of note indices; dim 1 is averaged
                away, so it is expected to be (batch, sequence). When given,
                the classification path runs.
            labels: With ``input_ids``: target class indices used for the
                loss and accuracy. Without ``input_ids``: the first element
                selects the generation prompt.

        Returns:
            * ``input_ids`` and ``labels``: ``{'loss': Tensor, 'accuracy': float}``
              where accuracy is a percentage for this batch.
            * ``input_ids`` only: the emotion logits.
            * ``labels`` only: the output of ``musicgen_model.generate``.

        Raises:
            ValueError: If both ``input_ids`` and ``labels`` are ``None``.
        """
        if input_ids is not None:
            # Embed note sequences and pool over the sequence dimension
            embedded_notes = self.note_embedding(input_ids)
            pooled_notes = torch.mean(embedded_notes, dim=1)
            emotion_logits = self.emotion_classifier(pooled_notes)

            if labels is not None:
                loss = self.emotion_loss(emotion_logits, labels)
                _, predicted = torch.max(emotion_logits, 1)
                accuracy = (predicted == labels).sum().item() / labels.size(0) * 100
                return {'loss': loss, 'accuracy': accuracy}

            return emotion_logits

        # Generation logic
        if labels is not None:
            emotion_label = labels[0].item()
            emotion_text = self.emotion_text_mapping[emotion_label]

            inputs = self.processor(
                text=[emotion_text],
                audio=None,
                sampling_rate=self.musicgen_model.config.audio_encoder.sampling_rate,
                return_tensors="pt"
            )

            # The processor builds CPU tensors; move them next to the MusicGen
            # weights so generation also works after the module was sent to GPU.
            model_device = next(self.musicgen_model.parameters()).device
            inputs = {
                key: (value.to(model_device) if hasattr(value, "to") else value)
                for key, value in inputs.items()
            }

            audio_values = self.musicgen_model.generate(
                **inputs,
                max_new_tokens=250,
                temperature=0.7
            )

            return audio_values

        # Previously this fell through and returned None without any signal.
        raise ValueError(
            "forward() needs input_ids (classification) or labels (generation)"
        )

In [71]:
def prepare_emopia_dataset(label_path, midi_filepath):
    """
    Prepare EMOPIA dataset for training

    Reads the label CSV, parses every ``*.mid`` file in ``midi_filepath``
    whose base name appears in the CSV's ``ID`` column, and wraps the parsed
    scores and their ``4Q`` labels in an ``EMOPIAMusicDataset``. Files
    without a label entry are skipped.

    Args:
        label_path (str): Path to labels CSV (must have ``ID`` and ``4Q`` columns)
        midi_filepath (str): Path to the directory holding the MIDI files

    Returns:
        Prepared dataset and emotion mapping
    """
    # Load labels
    labels_df = pd.read_csv(label_path, delimiter=",")

    # Emotion mapping
    emotion_mapping = {
        1: "HVHA",  # High Valence, High Arousal
        2: "HVLA",  # High Valence, Low Arousal
        3: "LVHA",  # Low Valence, High Arousal
        4: "LVLA"   # Low Valence, Low Arousal
    }

    # Create file to emotion mapping
    file_to_emotion = dict(zip(labels_df["ID"], labels_df["4Q"]))

    # Load MIDI files
    midi_files = []
    file_emotions = []

    # os.listdir returns entries in arbitrary order; sorting keeps dataset
    # indices (and therefore any later split) stable from run to run.
    for file in sorted(os.listdir(midi_filepath)):
        # splitext strips only the final extension, so an ID that itself
        # contains ".mid" is not truncated.
        file_id, extension = os.path.splitext(file)
        if extension != ".mid" or file_id not in file_to_emotion:
            continue
        full_path = os.path.join(midi_filepath, file)
        midi_files.append(converter.parse(full_path))
        file_emotions.append(file_to_emotion[file_id])

    # Create dataset
    dataset = EMOPIAMusicDataset(midi_files, file_emotions, emotion_mapping)

    return dataset, emotion_mapping

In [72]:
def custom_collate_fn(features):
    """Collate per-sample dicts into one right-padded batch.

    Args:
        features: Iterable of dicts with an ``'input_ids'`` 1-D tensor and a
            scalar ``'labels'`` entry.

    Returns:
        Dict with ``'input_ids'`` (sequences padded with 0 to the longest one
        in the batch, batch dimension first) and ``'labels'`` (1-D tensor),
        both on the CPU.
    """
    sequences = []
    targets = []
    for sample in features:
        sequences.append(sample['input_ids'])
        targets.append(sample['labels'])

    label_batch = torch.tensor(targets)

    # Right-pad every sequence with zeros up to the longest one in the batch.
    id_batch = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=0
    )

    # .cpu() keeps the collated batch on the host regardless of input device.
    return dict(input_ids=id_batch.cpu(), labels=label_batch.cpu())

In [76]:
def _run_epoch(model, loader, device, optimizer=None):
    """Run one pass over ``loader``; train if ``optimizer`` is given, else evaluate.

    Returns ``(mean_loss, accuracy_percent)``, both weighted by the number of
    samples in each batch so a short final batch is not over-represented.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    accuracy_sum = 0.0
    seen = 0
    with torch.set_grad_enabled(training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(input_ids, labels)
            loss = outputs['loss']

            if training:
                loss.backward()
                optimizer.step()

            # The model reports per-batch means; scale back by the batch size
            # so the epoch figures are true per-sample averages.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            accuracy_sum += outputs['accuracy'] * batch_size
            seen += batch_size

    return loss_sum / seen, accuracy_sum / seen


def train_model(model, dataset, num_epochs=100, learning_rate=1e-4,
                batch_size=4, seed=None,
                save_path='emotion_music_generation_model.pth'):
    """Train ``model`` on an 80/20 random split of ``dataset`` and save its weights.

    Args:
        model: Module whose ``model(input_ids, labels)`` call returns a dict
            with a ``'loss'`` tensor and an ``'accuracy'`` percentage.
        dataset: Map-style dataset whose items ``custom_collate_fn`` can batch.
        num_epochs: Number of passes over the training split.
        learning_rate: Adam learning rate.
        batch_size: Batch size for both loaders.
        seed: Optional integer seeding the train/validation split; ``None``
            keeps the split tied to the global RNG state.
        save_path: Destination of the saved ``state_dict``.

    Returns:
        The trained model (the same object that was passed in).

    Raises:
        ValueError: If ``dataset`` has fewer than two samples, since the
            split would leave the training part empty.
    """
    num_samples = len(dataset)
    if num_samples < 2:
        raise ValueError(
            f"Need at least 2 samples for a train/validation split, got {num_samples}"
        )

    # Split dataset
    train_size = int(0.8 * num_samples)
    val_size = num_samples - train_size
    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], **split_kwargs
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=custom_collate_fn)

    # Move model to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        train_loss, train_accuracy = _run_epoch(model, train_loader, device, optimizer)
        val_loss, val_accuracy = _run_epoch(model, val_loader, device)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

    # Save the model
    torch.save(model.state_dict(), save_path)
    return model

In [67]:
!unzip /content/EMOPIA_1.0.zip

Archive:  /content/EMOPIA_1.0.zip
replace __MACOSX/EMOPIA_1.0/._tagging_lists? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [77]:
def main(label_path="/content/EMOPIA_1.0/label.csv",
         midi_filepath="/content/EMOPIA_1.0/midis"):
    """Build the dataset, construct the model, train it, and return it.

    Args:
        label_path: Path to the labels CSV passed to ``prepare_emopia_dataset``.
        midi_filepath: Directory of MIDI files passed to ``prepare_emopia_dataset``.

    Returns:
        The value returned by ``train_model`` (previously it was bound to an
        unused local and discarded).
    """
    dataset, emotion_mapping = prepare_emopia_dataset(label_path, midi_filepath)

    # Size the model from the data: one embedding row per distinct note token
    # and one output class per entry of the emotion mapping.
    model = EmotionMusicGenerationModel(
        num_notes=len(dataset.note_to_index),
        num_emotions=len(emotion_mapping)
    )

    # Train model
    return train_model(model, dataset)

In [75]:
# Run the pipeline only when this code executes as the top-level program
# (where __name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

Epoch 1/50
Train Loss: 1.3596, Train Accuracy: 33.80%
Validation Loss: 1.3731, Validation Accuracy: 31.94%
Epoch 2/50
Train Loss: 1.3233, Train Accuracy: 41.55%
Validation Loss: 1.3236, Validation Accuracy: 39.35%
Epoch 3/50
Train Loss: 1.2865, Train Accuracy: 43.87%
Validation Loss: 1.3102, Validation Accuracy: 36.11%
Epoch 4/50
Train Loss: 1.2560, Train Accuracy: 44.68%
Validation Loss: 1.2706, Validation Accuracy: 43.06%
Epoch 5/50
Train Loss: 1.2179, Train Accuracy: 47.92%
Validation Loss: 1.2348, Validation Accuracy: 43.98%
Epoch 6/50
Train Loss: 1.1914, Train Accuracy: 48.50%
Validation Loss: 1.2176, Validation Accuracy: 44.91%
Epoch 7/50
Train Loss: 1.1308, Train Accuracy: 53.70%
Validation Loss: 1.2325, Validation Accuracy: 43.98%
Epoch 8/50
Train Loss: 1.1307, Train Accuracy: 51.16%
Validation Loss: 1.1843, Validation Accuracy: 46.30%
Epoch 9/50
Train Loss: 1.0901, Train Accuracy: 54.28%
Validation Loss: 1.1630, Validation Accuracy: 47.69%
Epoch 10/50
Train Loss: 1.0974, Train