In [39]:
import os
import librosa
import math
import json
from pathlib import Path
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
from collections import Counter

In [1]:
# Genre labels. The position of a name in this list is the class index used
# to look it up from the model's predictions further down.
mapping = [
    "blues",      # 0
    "classical",  # 1
    "country",    # 2
    "disco",      # 3
    "hiphop",     # 4
    "jazz",       # 5
    "metal",      # 6
    "pop",        # 7
    "reggae",     # 8
    "rock",       # 9
]

In [32]:
# Recreate the model architecture
# The layer stack below has to line up with the saved weights file, so the
# order, types and sizes of the layers must not be changed independently of
# "weights.keras".
model = keras.Sequential(
    [
        # Declare the input explicitly instead of passing `input_shape` to the
        # first layer (deprecated in recent Keras releases). The shape is
        # (frames per segment, MFCC coefficients) as shown by `mfccs.shape`.
        keras.Input(shape=(130, 13)),
        keras.layers.Flatten(),
        keras.layers.Dense(
            units=512,
            activation="relu",
            kernel_regularizer=keras.regularizers.l2(0.001),
        ),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(
            units=256,
            activation="relu",
            kernel_regularizer=keras.regularizers.l2(0.001),
        ),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(
            units=64,
            activation="relu",
            kernel_regularizer=keras.regularizers.l2(0.001),
        ),
        keras.layers.Dropout(0.3),
        # Linear output: the model emits raw logits, one per genre; softmax is
        # applied separately after prediction.
        keras.layers.Dense(units=10, activation="linear"),
    ]
)

# Load the weights
model.load_weights("weights.keras")

In [36]:
# Function to extract MFCCs from audio file
def extract_mfccs_from_audio(
    file_path,
    segment_duration=3,
    n_mfcc=13,
    n_fft=2048,
    hop_length=512,
    sample_rate=22050,
):
    """Split an audio file into fixed-length segments and compute MFCCs for each.

    The signal is cut into consecutive, non-overlapping segments of
    ``segment_duration`` seconds; any trailing samples that do not fill a
    whole segment are ignored.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        segment_duration: Segment length in seconds (int or float).
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to ``librosa.feature.mfcc``.
        hop_length: Hop length passed to ``librosa.feature.mfcc``.
        sample_rate: Value passed to ``librosa.load`` as ``sr``. The rate
            returned by the loader is what is used to size the segments, so
            ``None`` is accepted as well.

    Returns:
        A float64 array of shape ``(num_kept_segments, expected_frames, n_mfcc)``
        where ``expected_frames = ceil(samples_per_segment / hop_length)``.
        When no segment is kept, the array has zero rows but still three
        dimensions.

    Raises:
        ValueError: If the segment duration corresponds to less than one
            sample.
    """
    signal, sr = librosa.load(file_path, sr=sample_rate)

    # Calculate the number of samples per segment. Use the rate reported by
    # the loader and force an integer so the value can be used as a slice
    # index even when the duration is fractional.
    samples_per_segment = int(round(sr * segment_duration))
    if samples_per_segment <= 0:
        raise ValueError(
            "segment_duration must correspond to at least one audio sample"
        )
    expected_vector_length = math.ceil(samples_per_segment / hop_length)

    # Floor division guarantees that every segment lies fully inside the
    # signal, so no bounds check is needed inside the loop.
    num_segments = len(signal) // samples_per_segment

    mfccs = []
    for s in range(num_segments):
        start_sample = samples_per_segment * s
        finish_sample = start_sample + samples_per_segment

        # Transpose so that each row is one frame: (frames, n_mfcc).
        mfcc = librosa.feature.mfcc(
            y=signal[start_sample:finish_sample],
            sr=sr,
            n_fft=n_fft,
            n_mfcc=n_mfcc,
            hop_length=hop_length,
        ).T

        # Keep only segments with the expected number of frames so that all
        # rows of the result share one shape.
        if len(mfcc) == expected_vector_length:
            mfccs.append(mfcc)

    if not mfccs:
        # Preserve the rank-3 layout even when there is nothing to return.
        return np.empty((0, expected_vector_length, n_mfcc), dtype=np.float64)

    # float64 matches what the previous list-based conversion produced.
    return np.asarray(mfccs, dtype=np.float64)

In [70]:
# Compute per-segment MFCC features for the song to classify.
mfccs = extract_mfccs_from_audio(file_path="sample songs/nirvana.mp3")

In [71]:
# Sanity check: the last two dimensions should be (130, 13) to match the
# input shape the model was built with.
mfccs.shape

(72, 130, 13)

In [72]:
# Run the network on every segment at once. The final layer is linear, so
# these are raw logits rather than probabilities.
predictions = model.predict(mfccs)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [73]:
# Turn each segment's logits into a probability distribution over the classes
# (softmax along the last axis).
probabilities = tf.nn.softmax(logits=predictions, axis=-1)

In [74]:
# Get the predicted class for each segment: the index of the highest
# probability in every row.
predicted_classes = np.argmax(probabilities, axis=1)

In [75]:
# Aggregate the per-segment predictions with a majority vote: the class
# index predicted for the most segments decides the label of the whole song.
class_counts = Counter(predicted_classes)
top_vote = class_counts.most_common(1)[0]  # (class index, number of segments)
most_common_class = mapping[top_vote[0]]

print(f"The predicted class for the song is: {most_common_class}")

The predicted class for the song is: rock
