In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.utils import to_categorical

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

sys.path.append('../')
from vocal_patterns.ml_logic.preprocessor import preprocess_audio
from vocal_patterns.interface import main


In [2]:
## Model functions

def initialize_model(input_shape: tuple) -> Model:
    """
    Initialize the CNN model
    """
    model = Sequential()
    model.add(layers.Conv2D(8, (5,5), input_shape=input_shape, strides=(2, 2), padding='same', activation="relu"))
    model.add(layers.BatchNormalization())

    model.add(layers.Conv2D(16, (3,3), strides=(2, 2), padding='same', activation="relu"))
    model.add(layers.BatchNormalization())

    model.add(layers.Conv2D(32, (3,3), strides=(2, 2), padding='same', activation="relu"))
    model.add(layers.BatchNormalization())

    model.add(layers.Conv2D(64, (3,3), strides=(2, 2), padding='same', activation="relu"))
    model.add(layers.BatchNormalization())

    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))

    print("✅ Model initialized")

    return model

def compile_model(model: Model, learning_rate=0.001) -> Model:
    """
    Compile the Neural Network
    """
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    print("✅ Model compiled")

    return model


In [6]:
## Get the data

download_path = "../vocal_patterns/data/dataset_tags.csv"
data = pd.read_csv(download_path)
data.head()


Unnamed: 0,path,exercise,technique,filename
0,/Users/denis/code/ElsaGregoire/vocal_patterns/...,Other,vibrato,m6_row_vibrato.wav
1,/Users/denis/code/ElsaGregoire/vocal_patterns/...,Other,vibrato,m6_caro_vibrato.wav
2,/Users/denis/code/ElsaGregoire/vocal_patterns/...,Other,vibrato,m6_dona_vibrato.wav
3,/Users/denis/code/ElsaGregoire/vocal_patterns/...,Other,straight,m6_caro_straight.wav
4,/Users/denis/code/ElsaGregoire/vocal_patterns/...,Other,straight,m6_row_straight.wav


In [36]:
## Separate the data

X = data.drop(columns=['exercise', "technique", "filename"])
y = data[['exercise']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


In [26]:
X_train_processed = preprocess_audio(X_train)
X_train_reshaped = X_train_processed.reshape(len(X_train_processed), 128, 259, 1)

X_test_processed = preprocess_audio(X_test)
X_test_reshaped = X_test_processed.reshape(len(X_test_processed), 128, 259, 1)


In [27]:
label_encoder = LabelEncoder()
y_train_labels = label_encoder.fit_transform(y_train)
y_train_cat = to_categorical(y_train_labels, num_classes=3)


  y = column_or_1d(y, warn=True)


In [20]:
np.ravel(y_train, order="c")


array(['arpeggios', 'scales', 'Other', ..., 'scales', 'scales',
       'arpeggios'], dtype=object)

In [28]:
y_test_labels = label_encoder.transform(y_test)
y_test_cat = to_categorical(y_test_labels, num_classes=3)


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [8]:
model = initialize_model((128, 259, 1))
model = compile_model(model)


✅ Model initialized
✅ Model compiled


In [9]:
model.fit(X_train_reshaped, y_train_cat, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10


2023-11-29 09:51:55.718440: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x28d9a0970>

In [61]:
model.evaluate(X_test_reshaped, y_test_cat, batch_size=32)




[0.12265704572200775, 0.9612545967102051]

In [13]:
category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Display the mapping
print("Category Mapping:")
for category, numerical_representation in category_mapping.items():
    print(f"{category}: {numerical_representation}")


Category Mapping:
Other: 0
arpeggios: 1
scales: 2


In [29]:
image_to_predict = np.expand_dims(X_test_reshaped[0], axis=0)
predictions = model.predict(image_to_predict)




In [30]:
predicted_indices = np.argmax(predictions, axis=1)
predicted_indices[0]


2

In [60]:
data.iloc[0, :]


path         /Users/denis/code/ElsaGregoire/vocal_patterns/...
exercise                                                 Other
technique                                              vibrato
filename                                    m6_row_vibrato.wav
Name: 0, dtype: object

In [65]:
X_train_processed.shape[1:]


(128, 259)

In [68]:
X_train_processed.shape[1]


128

In [23]:
model = main.train()


✅ Model initialized
✅ Model compiled
Restoring model weights from the end of the best epoch: 9.
Epoch 11: early stopping
✅ Results saved locally
✅ Model saved locally
0.9575645923614502


In [41]:
y_pred = main.predict(X_test)


[34m
Load latest model from local registry...[0m
[34m
Load latest model from disk...[0m
✅ Model loaded from local disk


In [42]:
y_pred


array([[2.5633091e-09, 7.0192131e-05, 9.9992979e-01],
       [7.2554840e-08, 2.4952164e-05, 9.9997497e-01],
       [6.8502915e-05, 9.9991441e-01, 1.7069622e-05],
       ...,
       [9.9999917e-01, 8.1291819e-07, 5.6886895e-09],
       [4.6224841e-03, 9.9446011e-01, 9.1742870e-04],
       [8.3670147e-06, 9.9999166e-01, 1.2424649e-08]], dtype=float32)

In [32]:
main.evaluate_model(model, X_test_reshaped, y_test_cat)


[34m
Evaluating model on 1084 rows...[0m
✅ Model evaluated, accuracy: 0.99


{'loss': 0.03662455826997757, 'accuracy': 0.9861623644828796}