# In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

# Two-headed CNN over Mel-spectrogram patches: a shared convolutional trunk
# feeds (1) a binary onset detector and (2) a multi-label pitch detector.

# Model input shape for Mel spectrograms (assuming 128x128 patches for now):
# 128x128 single-channel (grayscale) spectrogram patches.
input_shape = (128, 128, 1)

# One output unit per pitch class (49 classes, guitar range from E2 to E6).
num_pitch_classes = 49

# Define model inputs
input_layer = Input(shape=input_shape)

# Convolutional trunk: five Conv -> BatchNorm -> MaxPool stages with an
# increasing number of filters. Each 2x2 pooling halves the spatial size,
# so a 128x128 patch is reduced to 4x4 before flattening.
x = input_layer
for filters in (32, 64, 128, 256, 512):
    x = Conv2D(filters, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2))(x)

# Shared dense representation consumed by both output heads.
x = Flatten()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)

# Output layer for onset detection (binary output for presence/absence of an onset)
onset_output = Dense(1, activation='sigmoid', name='onset_output')(x)

# Output layer for multi-label pitch detection. Sigmoid (not softmax) gives
# each pitch an independent probability, so several notes can be active at
# once; softmax would force the 49 outputs to sum to 1, which contradicts
# the per-class binary cross-entropy loss used below.
pitch_output = Dense(num_pitch_classes, activation='sigmoid', name='pitch_output')(x)

# Define the model with two outputs
model = Model(inputs=input_layer, outputs=[onset_output, pitch_output])

# Compile the model with binary cross-entropy for both onset and multi-label
# pitch detection.
model.compile(
    optimizer='adam',
    loss={
        'onset_output': 'binary_crossentropy',
        'pitch_output': 'binary_crossentropy',
    },
    metrics={
        'onset_output': 'accuracy',
        # The string 'accuracy' would be resolved to categorical (argmax)
        # accuracy for a 49-wide output, which is misleading for multi-hot
        # targets. Use an explicit per-label binary accuracy, keeping the
        # metric name 'accuracy' so logged keys stay the same.
        'pitch_output': tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    },
)

# Model summary to confirm structure
model.summary()
