In [1]:
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Concatenate, Input
from tensorflow.keras.callbacks import EarlyStopping

from collections import defaultdict
from music21 import converter, note, chord
from sklearn.metrics import classification_report
import keras_tuner as kt

import warnings
warnings.filterwarnings('ignore')

# random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)


## Data Pre-processing

In [2]:
dataset_root = '../Composer_Dataset/NN_midi_files_extended'
splits = ['train', 'dev', 'test']

# Inspect dataset
composer_counts = defaultdict(int)
total_files = 0

for split in splits:
    split_path = os.path.join(dataset_root, split)
    for composer in os.listdir(split_path):
        composer_dir = os.path.join(split_path, composer)
        if os.path.isdir(composer_dir):
            count = len([f for f in os.listdir(composer_dir) if f.endswith('.mid')])
            composer_counts[composer] += count
            total_files += count

print(f"Total MIDI files: {total_files}")
print(f"Composers: {list(composer_counts.keys())}")
print("Files per composer:")
for composer, count in composer_counts.items():
    print(f"   - {composer}: {count}")

Total MIDI files: 439
Composers: ['mozart', 'chopin', 'handel', 'byrd', 'schumann', 'mendelssohn', 'hummel', 'bach', 'bartok']
Files per composer:
   - mozart: 49
   - chopin: 49
   - handel: 49
   - byrd: 50
   - schumann: 44
   - mendelssohn: 49
   - hummel: 50
   - bach: 50
   - bartok: 49


## Feature Extraction

In [3]:
def extract_features(midi_path):
    midi = converter.parse(midi_path)
    sequence = []
    tempos = []
    
    for el in midi.flat.notes:
        if isinstance(el, note.Note):
            sequence.append(str(el.pitch))
        elif isinstance(el, chord.Chord):
            sequence.append('.'.join(str(n) for n in el.normalOrder))
    
    for t in midi.flat.getElementsByClass('MetronomeMark'):
        tempos.append(t.number)
    
    avg_tempo = np.mean(tempos) if tempos else 120  # default tempo if none found
    return sequence, avg_tempo

X = []
tempos_data = []
y = []

for split in splits:
    split_path = os.path.join(dataset_root, split)
    for composer in os.listdir(split_path):
        composer_dir = os.path.join(split_path, composer)
        if os.path.isdir(composer_dir):
            for fname in os.listdir(composer_dir):
                if fname.endswith('.mid'):
                    path = os.path.join(composer_dir, fname)
                    seq, tempo = extract_features(path)
                    X.append(seq)
                    tempos_data.append(tempo)
                    y.append(composer)

print(f"Extracted {len(X)} sequences with labels and tempos.")

Extracted 439 sequences with labels and tempos.


In [4]:
X_str = [' '.join(seq) for seq in X]
label_enc = LabelEncoder()
y_enc = label_enc.fit_transform(y)
y_cat = to_categorical(y_enc)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_str)
X_tok = tokenizer.texts_to_sequences(X_str)

vocab_size = len(tokenizer.word_index) + 1
num_classes = y_cat.shape[1]
maxlen = 200

X_pad = pad_sequences(X_tok, maxlen=maxlen, padding='post', truncating='post')
tempos = np.array(tempos_data).reshape(-1, 1)

# Train/val/test split
X_train, X_test, y_train, y_test, tempo_train, tempo_test = train_test_split(
    X_pad, y_cat, tempos, test_size=0.2, stratify=y_cat, random_state=42
)
X_train, X_val, y_train, y_val, tempo_train, tempo_val = train_test_split(
    X_train, y_train, tempo_train, test_size=0.1, stratify=y_train, random_state=42
)

tempo_train = tempo_train.reshape(-1, 1)
tempo_val = tempo_val.reshape(-1, 1)

print(f"vocab_size={vocab_size}, maxlen={maxlen}, num_classes={num_classes}")

vocab_size=72, maxlen=200, num_classes=9


In [5]:

print(f"X_train shape: {X_train.shape}, tempo_train shape: {tempo_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (315, 200), tempo_train shape: (315, 1), y_train shape: (315, 9)


## Model Building

### LSTM

In [6]:
def build_lstm():
    seq_input = Input(shape=(maxlen,))
    tempo_input = Input(shape=(1,))
    
    x = Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen)(seq_input)
    x = LSTM(64)(x)
    x = Dense(32, activation='relu')(x)
    
    combined = Concatenate()([x, tempo_input])
    output = Dense(num_classes, activation='softmax')(combined)
    
    model = Model(inputs=[seq_input, tempo_input], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

lstm_model = build_lstm()
lstm_model.summary()

In [7]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history_lstm = lstm_model.fit(
    [X_train, tempo_train], y_train,
    validation_data=([X_val, tempo_val], y_val),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop]
)



Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 86ms/step - accuracy: 0.1032 - loss: 38.3287 - val_accuracy: 0.1111 - val_loss: 36.1136
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.1032 - loss: 37.2644 - val_accuracy: 0.1111 - val_loss: 34.9413
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.1193 - loss: 36.0225 - val_accuracy: 0.1667 - val_loss: 33.5618
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.1347 - loss: 34.6787 - val_accuracy: 0.1944 - val_loss: 32.1484
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.1274 - loss: 33.2862 - val_accuracy: 0.1944 - val_loss: 30.9505
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.1185 - loss: 32.1319 - val_accuracy: 0.1667 - val_loss: 29.8105
Epoch 7/50
[1m5/5[0m [32m━━━━━━

### CNN

In [8]:
def build_cnn():
    seq_input = Input(shape=(maxlen,))
    tempo_input = Input(shape=(1,))
    
    x = Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen)(seq_input)
    x = Conv1D(64, 3, activation='relu')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(2)(x)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    
    combined = Concatenate()([x, tempo_input])
    output = Dense(num_classes, activation='softmax')(combined)
    
    model = Model(inputs=[seq_input, tempo_input], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
cnn_model = build_cnn()
cnn_model.summary()

In [9]:
history_cnn = cnn_model.fit(
    [X_train, tempo_train], y_train,
    validation_data=([X_val, tempo_val], y_val),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop]
)

Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.1014 - loss: 31.5605 - val_accuracy: 0.1111 - val_loss: 24.6957
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1174 - loss: 23.2523 - val_accuracy: 0.0278 - val_loss: 18.2682
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.0778 - loss: 17.3693 - val_accuracy: 0.2222 - val_loss: 13.0349
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1790 - loss: 13.9161 - val_accuracy: 0.1667 - val_loss: 10.9088
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1431 - loss: 11.4924 - val_accuracy: 0.0833 - val_loss: 8.5824
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1297 - loss: 9.2609 - val_accuracy: 0.1944 - val_loss: 6.3005
Epoch 7/50
[1m5/5[0m [32m━━━━━━━━━

## Model Evaluation

#### LSTM Evaluation

In [11]:
y_true = y_test.argmax(axis=1)
y_pred_lstm = lstm_model.predict([X_test, tempo_test]).argmax(axis=1)
print("LSTM Classification Report:")
print(classification_report(y_true, y_pred_lstm, target_names=label_enc.classes_))


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
LSTM Classification Report:
              precision    recall  f1-score   support

        bach       0.20      0.40      0.27        10
      bartok       0.15      0.20      0.17        10
        byrd       0.00      0.00      0.00        10
      chopin       0.18      0.20      0.19        10
      handel       0.00      0.00      0.00        10
      hummel       0.25      0.10      0.14        10
 mendelssohn       0.11      0.10      0.11        10
      mozart       0.00      0.00      0.00        10
    schumann       0.12      0.38      0.18         8

    accuracy                           0.15        88
   macro avg       0.11      0.15      0.12        88
weighted avg       0.11      0.15      0.12        88



#### Predicted vs Actual Composer

In [12]:
y_pred_names = label_enc.inverse_transform(y_pred_lstm)
y_true_names = label_enc.inverse_transform(y_true)

# Print a few samples
for i in range(10):  # show first 10
    print(f"Sample {i+1}: True: {y_true_names[i]} | Predicted: {y_pred_names[i]}")

Sample 1: True: mozart | Predicted: byrd
Sample 2: True: handel | Predicted: bartok
Sample 3: True: bartok | Predicted: hummel
Sample 4: True: mendelssohn | Predicted: bach
Sample 5: True: hummel | Predicted: bach
Sample 6: True: bartok | Predicted: byrd
Sample 7: True: chopin | Predicted: hummel
Sample 8: True: schumann | Predicted: bach
Sample 9: True: bach | Predicted: schumann
Sample 10: True: handel | Predicted: chopin


#### CNN Evaluation

In [13]:

y_pred_cnn = cnn_model.predict([X_test, tempo_test]).argmax(axis=1)
print("CNN Classification Report:")
print(classification_report(y_true, y_pred_cnn, target_names=label_enc.classes_))

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
CNN Classification Report:
              precision    recall  f1-score   support

        bach       0.18      0.30      0.22        10
      bartok       0.23      0.30      0.26        10
        byrd       0.33      0.30      0.32        10
      chopin       0.50      0.10      0.17        10
      handel       0.57      0.40      0.47        10
      hummel       0.30      0.60      0.40        10
 mendelssohn       0.00      0.00      0.00        10
      mozart       0.21      0.30      0.25        10
    schumann       0.00      0.00      0.00         8

    accuracy                           0.26        88
   macro avg       0.26      0.26      0.23        88
weighted avg       0.26      0.26      0.24        88



In [14]:
y_pred_names = label_enc.inverse_transform(y_pred_cnn)
y_true_names = label_enc.inverse_transform(y_true)

# Print a few samples
for i in range(10):  # show first 10
    print(f"Sample {i+1}: True: {y_true_names[i]} | Predicted: {y_pred_names[i]}")

Sample 1: True: mozart | Predicted: hummel
Sample 2: True: handel | Predicted: handel
Sample 3: True: bartok | Predicted: mozart
Sample 4: True: mendelssohn | Predicted: byrd
Sample 5: True: hummel | Predicted: hummel
Sample 6: True: bartok | Predicted: hummel
Sample 7: True: chopin | Predicted: hummel
Sample 8: True: schumann | Predicted: mozart
Sample 9: True: bach | Predicted: bach
Sample 10: True: handel | Predicted: handel


## Model Optimization

#### LSTM Optimization

In [15]:
def build_lstm_tuned(hp):
    seq_input = Input(shape=(maxlen,))
    tempo_input = Input(shape=(1,))
    x = Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen)(seq_input)
    x = LSTM(hp.Int('lstm1', 64, 256, step=64), return_sequences=True)(x)
    x = LSTM(hp.Int('lstm2', 32, 128, step=32))(x)
    x = Dense(hp.Int('dense', 32, 128, step=32), activation='relu')(x)
    combined = Concatenate()([x, tempo_input])
    output = Dense(num_classes, activation='softmax')(combined)
    model = Model(inputs=[seq_input, tempo_input], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [16]:
lstm_tuner = kt.Hyperband(
    build_lstm_tuned,
    objective='val_accuracy',
    max_epochs=20,
    factor=3,
    directory='tuning',
    project_name='lstm_tuning'
)

lstm_tuner.search([X_train, tempo_train], y_train, epochs=20, validation_data=([X_val, tempo_val], y_val))
best_lstm = lstm_tuner.get_best_models(1)[0]


Trial 6 Complete [00h 00m 04s]
val_accuracy: 0.1111111119389534

Best val_accuracy So Far: 0.1944444477558136
Total elapsed time: 00h 00m 33s

Search: Running Trial #7

Value             |Best Value So Far |Hyperparameter
256               |128               |lstm1
96                |64                |lstm2
96                |128               |dense
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/3
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 287ms/step - accuracy: 0.1093 - loss: 16.3723 - val_accuracy: 0.0556 - val_loss: 13.1997
Epoch 2/3
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 261ms/step - accuracy: 0.1342 - loss: 11.5138 - val_accuracy: 0.1667 - val_loss: 9.6731
Epoch 3/3
[1m 5/10[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1s[0m 259ms/step - accuracy: 0.1276 - los

KeyboardInterrupt: 

#### CNN Optimization

In [None]:
def build_cnn_tuned(hp):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen))
    model.add(Conv1D(
        filters=hp.Int('conv1_filters', 32, 128, step=32),
        kernel_size=hp.Choice('conv1_kernel', [3, 5]),
        activation='relu'
    ))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(
        filters=hp.Int('conv2_filters', 64, 256, step=64),
        kernel_size=hp.Choice('conv2_kernel', [3, 5]),
        activation='relu'
    ))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(hp.Int('dense_units', 32, 128, step=32), activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model



In [None]:
cnn_tuner = kt.Hyperband(
    build_cnn_tuned,
    objective='val_accuracy',
    max_epochs=20,
    factor=3,
    directory='tuning',
    project_name='cnn_tuning'
)

cnn_tuner.search(X_train, y_train, epochs=20, validation_data=(X_val, y_val))
best_cnn = cnn_tuner.get_best_models(1)[0]
