In [13]:
import json
import math
import os
from collections import Counter

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split

In [2]:
def load_data(dataset_path):
    """Read a JSON dataset and return its features and labels as arrays.

    Args:
        dataset_path: Path to a JSON file containing the keys "mfcc" and
            "labels".

    Returns:
        Tuple ``(X, y)`` of numpy arrays built from the "mfcc" and "labels"
        entries, in that order.
    """
    with open(dataset_path, "r") as handle:
        payload = json.load(handle)

    # Turn the nested JSON lists into numpy arrays, features first.
    features, targets = (np.array(payload[key]) for key in ("mfcc", "labels"))
    return features, targets

In [3]:
def prepare_datasets(test_size, valid_size, dataset_path="processed.json", random_state=None):
    """Load the dataset and split it into train, validation and test sets.

    The test set is carved out first; the validation set is then taken from
    what remains, so ``valid_size`` is relative to the non-test samples.

    Args:
        test_size: Size of the test split, forwarded to ``train_test_split``.
        valid_size: Size of the validation split, forwarded to
            ``train_test_split`` and applied to the samples left after the
            test split.
        dataset_path: JSON file handed to ``load_data``. Defaults to
            "processed.json".
        random_state: Seed forwarded to both ``train_test_split`` calls.
            ``None`` (the default) leaves the splits unseeded, so they differ
            between runs; pass an int for reproducible splits.

    Returns:
        Tuple ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``. Each
        ``X_*`` array has a trailing axis of length 1 appended.
    """
    X, y = load_data(dataset_path)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=valid_size, random_state=random_state
    )

    # Append a trailing axis of length 1 (used as the channel axis by the CNN).
    X_train = X_train[..., np.newaxis]
    X_valid = X_valid[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [4]:
# Hold out 25% of the samples for testing, then 20% of the remainder for validation.
X_train, X_valid, X_test, y_train, y_valid, y_test = prepare_datasets(
    test_size=0.25,
    valid_size=0.2,
)

In [6]:
def _conv_block(kernel_size, pool_size):
    """Return one Conv2D -> MaxPool2D -> BatchNormalization stage as a list of layers."""
    return [
        keras.layers.Conv2D(32, kernel_size, activation="relu"),
        keras.layers.MaxPool2D(pool_size, strides=(2, 2), padding="same"),
        keras.layers.BatchNormalization(),
    ]


# Per-sample input shape: the three axes of X_train that follow the batch axis.
input_shape = tuple(X_train.shape[1:4])

model = keras.Sequential(
    [
        keras.layers.Input(shape=input_shape),
        # Three convolutional stages; the last one uses smaller kernel and pool windows.
        *_conv_block((3, 3), (3, 3)),
        *_conv_block((3, 3), (3, 3)),
        *_conv_block((2, 2), (2, 2)),
        keras.layers.Flatten(),
        # Classifier head: the final layer emits 10 raw scores (logits), no softmax.
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(10, activation="linear"),
    ]
)

optimizer = keras.optimizers.Adam(learning_rate=0.0001)

# from_logits=True matches the linear output layer; labels are integer class ids.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [7]:
# Train for 30 epochs with batches of 32, reporting validation metrics after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=30,
    validation_data=(X_valid, y_valid),
)

Epoch 1/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.1531 - loss: 2.6529 - val_accuracy: 0.3538 - val_loss: 1.8141
Epoch 2/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.3292 - loss: 1.9313 - val_accuracy: 0.4619 - val_loss: 1.5281
Epoch 3/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4091 - loss: 1.6893 - val_accuracy: 0.4927 - val_loss: 1.4124
Epoch 4/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4368 - loss: 1.5839 - val_accuracy: 0.5274 - val_loss: 1.3060
Epoch 5/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4942 - loss: 1.4302 - val_accuracy: 0.5521 - val_loss: 1.2487
Epoch 6/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4999 - loss: 1.3822 - val_accuracy: 0.5774 - val_loss: 1.2059
Epoch 7/30
[1m188/188[0m

<keras.src.callbacks.history.History at 0x1c79cdc8d00>

In [8]:
# evaluate() yields the compiled loss first, then the compiled metric (accuracy).
test_error, test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=1)
print(f"Accuracy: {test_accuracy}, Error: {test_error}")

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7081 - loss: 0.8526
Accuracy: 0.7052462697029114, Error: 0.8486840724945068


In [9]:
# Make sure the output directory exists first, so saving does not depend on
# a manually created folder when the notebook runs on a fresh checkout.
os.makedirs("weights", exist_ok=True)
model.save("weights/cnn_weights.keras")

In [11]:
def extract_mfccs_from_audio(
    file_path,
    segment_duration=3,
    n_mfcc=13,
    n_fft=2048,
    hop_length=512,
    sample_rate=22050,
):
    """Split an audio file into fixed-length segments and compute MFCCs for each.

    The signal is cut into consecutive, non-overlapping segments of
    ``segment_duration`` seconds. Trailing audio that does not fill a whole
    segment is ignored, and a segment is kept only if its MFCC matrix has
    exactly ``ceil(samples_per_segment / hop_length)`` frames.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        segment_duration: Segment length in seconds; may be fractional.
        n_mfcc: Number of MFCC coefficients, passed to ``librosa.feature.mfcc``.
        n_fft: FFT window size, passed to ``librosa.feature.mfcc``.
        hop_length: Hop between frames in samples, passed to
            ``librosa.feature.mfcc``.
        sample_rate: Rate requested from ``librosa.load``. The rate returned
            by the loader is used for all further calculations.

    Returns:
        A float64 numpy array holding one transposed MFCC matrix
        (frames first) per kept segment. If no segment is kept, the array is
        empty with shape ``(0,)``.
    """
    signal, sr = librosa.load(file_path, sr=sample_rate)

    # Use the rate reported by the loader and force an integer sample count,
    # so fractional durations still give valid slice indices.
    samples_per_segment = int(round(sr * segment_duration))
    expected_vector_length = math.ceil(samples_per_segment / hop_length)

    # Floor division guarantees every segment lies fully inside the signal,
    # so no bounds check is needed in the loop.
    num_segments = len(signal) // samples_per_segment

    mfccs = []
    for s in range(num_segments):
        start_sample = samples_per_segment * s
        finish_sample = start_sample + samples_per_segment

        mfcc = librosa.feature.mfcc(
            y=signal[start_sample:finish_sample],
            sr=sr,
            n_fft=n_fft,
            n_mfcc=n_mfcc,
            hop_length=hop_length,
        ).T

        # Keep only segments with the expected number of frames so the
        # stacked result is a regular array.
        if len(mfcc) == expected_vector_length:
            mfccs.append(mfcc)

    # float64 keeps the dtype the earlier list-based conversion produced.
    return np.array(mfccs, dtype=np.float64)

In [None]:
# Load the saved weights from the archive written by the save cell into the
# existing `model` object; `model` must already be defined when this runs.
model.load_weights("weights/cnn_weights.keras")

In [12]:
# Genre names indexed by predicted class id (position i labels class i).
# NOTE(review): assumes this order matches the label encoding used when the
# dataset was built — confirm against the preprocessing step.
mapping = [
    "blues", "classical", "country", "disco", "hiphop",
    "jazz", "metal", "pop", "reggae", "rock",
]

In [14]:
# Classify a whole track: score every segment, then take a majority vote.
mfccs = extract_mfccs_from_audio("sample songs/master.mp3")
if mfccs.size == 0:
    # Without at least one full segment there is nothing to predict on.
    raise ValueError("No complete segments could be extracted from the audio file.")

# Add the trailing channel axis explicitly so the input layout matches the
# arrays the model was trained on.
predictions = model.predict(mfccs[..., np.newaxis])

# The model outputs logits; convert them to per-class probabilities.
probabilities = tf.nn.softmax(predictions, axis=-1)

# Most likely class for each segment.
predicted_classes = np.argmax(probabilities, axis=1)

# The genre predicted for the most segments wins.
class_counts = Counter(predicted_classes)
most_common_class = mapping[class_counts.most_common(1)[0][0]]

print(f"The predicted class for the song is: {most_common_class}")

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
The predicted class for the song is: metal
