# Project: GenreGenius

This Project is a model that predicts the genre of the music that the user inputs, can be via a file or a recording

## Libraries

In [1]:
import os 
import librosa
import math
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import keras_tuner as kt
import pickle
from scipy import stats
import random
import sounddevice as sd

## Feature Extraction from Sound Files

In [9]:
# Path for Dataset and path to save JSON file later
DATASET_PATH = "genres"
JSON_PATH = "data.json"

In [10]:
# Set the sample rate to 22050 samples per second
SAMPLE_RATE = 22050 #Sample rate refers to the number of samples, or measurements, taken per second to represent an audio signal.

# Set the desired duration of the audio track to 30 seconds
DURATION = 30

# Calculate the total number of samples in the audio track
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

In [None]:
def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length= 512, num_segments=5):
    # dictionary to store data 
    data= { 
        "mapping": [],
        "mfcc": [],
        "labels": [],
       }
    
    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment/hop_length)

    # loop through all the genres
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        #ensure that we are not at the root level
        if dirpath is not dataset_path:
            #save semantic label 
            dirpath_components = dirpath.split("/7") # genre/blues => ["genre", "blues"]
            semantic_label = dirpath_components[-1]
            data["mapping"].append(semantic_label)
            print("\nProcessing {}".format(semantic_label))

            for f in filenames:
                #load the audio file
                file_path = os.path.join(dirpath, f)
                signal, sr = librosa.load(file_path, sr = SAMPLE_RATE)
                # PROCESS segments extrating mfccs and storing data 
                for s in range(num_segments):
                    start_sample = num_samples_per_segment * s
                    finish_sample = start_sample + num_samples_per_segment

                    #store mfcc for segment if it has the expected lenght
                    mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample], n_mfcc=n_mfcc, n_fft = n_fft, hop_length=hop_length)
                    mfcc = mfcc.T
                    if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        data["labels"].append(i-1)
                        print("{}, segment:{}".format(file_path, s))
    with open (JSON_PATH, "w") as fp:
        json.dump(data, fp, indent=4)


In [None]:
# Save the features 
save_mfcc(DATASET_PATH, JSON_PATH, num_segments=10)

## Create the CNN model

In [11]:
#Load the data
def load_data(dataset_path):
    with open(dataset_path, "r") as fp:
        data = json.load(fp)

    inputs = np.array(data["mfcc"])
    targets = np.array(data["labels"])

    return inputs, targets

In [12]:
#Prepare the Train,test and validation sets
def prepare_datasets(test_size, validation_size):
   inputs, target = load_data(JSON_PATH)
   X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size= test_size)
   X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size= validation_size)

   X_train = X_train[..., np.newaxis]
   X_validation = X_validation[..., np.newaxis]
   X_test = X_test[..., np.newaxis]

   return X_train, X_validation, X_test, y_train, y_validation, y_test

In [None]:
def build_model(input_shape):
    #create model 
    model = keras.Sequential()
    #1st conv layer
    model.add(keras.layers.Conv2D(32,(3,3), activation="relu", input_shape=input_shape))
    model.add(keras.layers.MaxPool2D((3,3), strides=(2,2), padding= "same"))
    model.add(keras.layers.BatchNormalization())
    #2nd conv layer
    model.add(keras.layers.Conv2D(32,(3,3), activation="relu", input_shape=input_shape))
    model.add(keras.layers.MaxPool2D((3,3), strides=(2,2), padding= "same"))
    model.add(keras.layers.BatchNormalization())
    #3rd conv layer

    model.add(keras.layers.Conv2D(32,(2,2), activation="relu", input_shape=input_shape))
    model.add(keras.layers.MaxPool2D((2,2), strides=(2,2), padding= "same"))
    model.add(keras.layers.BatchNormalization())
    #flatten the output and feed into dense layer
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(64, activation="relu"))
    model.add(keras.layers.Dropout(0.3))
    #output layer
    model.add(keras.layers.Dense(10, activation="softmax"))
    return model

In [25]:
def predict(model, X, y):
    X = X[np.newaxis, ...]
    prediction = model.predict(X) # X-> (1, 130,13,1)

    #extract index with max value
    predict_index = np.argmax(prediction, axis=1) #
    print("Expected index: {}, Predicted index:  {}".format(y, predict_index))
    return predict_index

In [13]:
X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25, 0.2)  

In [None]:
input_shape = (X_train.shape[1],X_train.shape[2],X_train.shape[3])
model = build_model(input_shape)

In [None]:
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer= optimizer,
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [None]:
model.fit(X_train, y_train, validation_data=(X_validation,y_validation), batch_size=32, epochs=30)

In [None]:
test_error, test_accuracy= model.evaluate(X_test, y_test, verbose = 1)

In [None]:
print("Accuracy on test set is: {}".format(test_accuracy))

In [None]:
X = X_test[50]
y = y_test[50]

predict(model, X, y)

In [None]:
# save model
pickle.dump(model, open('model.pkl', 'wb'))

## Parameter Tunning

In [5]:
def tune_model(hp):
    #create model 
    model = keras.Sequential()
    
    hp_activation = hp.Choice('activation', values=['relu', 'tanh'])
    hp_filters_1 = hp.Int('filters_1', min_value=20, max_value=40, step=2)
    hp_filters_2 = hp.Int('filters_1', min_value=20, max_value=40, step=2)
    hp_filters_3 = hp.Int('filters_1', min_value=20, max_value=40, step=2)
    hp_layer_1 = hp.Int('layer_1', min_value=50, max_value=100, step=2)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    #1st conv layer
    model.add(keras.layers.Conv2D(hp_filters_1,(3,3), activation="relu", input_shape=(130,13,1)))
    model.add(keras.layers.MaxPool2D((3,3), strides=(2,2), padding= "same"))
    model.add(keras.layers.BatchNormalization())
    #2nd conv layer
    model.add(keras.layers.Conv2D(hp_filters_2,(3,3), activation="relu", input_shape=input_shape))
    model.add(keras.layers.MaxPool2D((3,3), strides=(2,2), padding= "same"))
    model.add(keras.layers.BatchNormalization())
    #3rd conv layer

    model.add(keras.layers.Conv2D(hp_filters_3,(2,2), activation="relu", input_shape=input_shape))
    model.add(keras.layers.MaxPool2D((2,2), strides=(2,2), padding= "same"))
    model.add(keras.layers.BatchNormalization())
    #flatten the output and feed into dense layer
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(hp_layer_1, activation="relu"))
    model.add(keras.layers.Dropout(0.3))
    #output layer
    model.add(keras.layers.Dense(10, activation="softmax"))

    optimizer = keras.optimizers.Adam(learning_rate=hp_learning_rate)
    model.compile(optimizer= optimizer,
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

    return model

In [6]:
tuner = kt.Hyperband(tune_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='dir',
                     project_name='x')

NameError: name 'input_shape' is not defined

In [None]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [None]:
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

In [None]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2,
                    callbacks=[stop_early])

## Predicting New input

In [20]:
import pygame
#play music
def play_sound(file_sample):
    pygame.init()
    pygame.mixer.init()

    pygame.mixer.music.load(file_sample)
    start_position = 45  # Starting position in seconds
    pygame.mixer.music.play(start=start_position)

    duration = 7  # Duration to play in seconds
    pygame.time.delay(int(duration * 1000))  # Delay for the specified duration

    pygame.mixer.music.stop()

In [34]:
def process_new_song(file_sample): 
  # dictionary to store data 
    data= { 
        "mfcc": []
        
       }
    duration = 60
    SAMPLE_RATE = 22050
    SAMPLES_PER_TRACK = SAMPLE_RATE * duration
    n_fft = 2048 
    hop_length = 512
    num_samples_per_segment = int(SAMPLES_PER_TRACK / 20)
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment/hop_length)
    file_path = file_sample
    signal, sr = librosa.load(file_path, sr = SAMPLE_RATE,offset=150 ,duration=duration)
                # PROCESS segments extrating mfccs and storing data 
    for s in range(40):
        start_sample = num_samples_per_segment * s
        finish_sample = start_sample + num_samples_per_segment

        #store mfcc for segment if it has the expected lenght
        mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample], n_mfcc=13, n_fft = 2048, hop_length=512)
        mfcc = mfcc.T
        if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                data["mfcc"].append(mfcc.tolist())
                #print("{}, segment:{}".format(file_path, s))

    inputs = np.array(data["mfcc"])
    inputs = inputs[..., np.newaxis]
    
    return inputs


In [35]:
import pickle
model = pickle.load(open('model.pkl','rb'))

In [2]:
def predict_new(model, X):
    X = X[np.newaxis, ...]
    prediction = model.predict(X) # X-> (1, 130,13,1)

    #extract index with max value
    predict_index = np.argmax(prediction, axis=1) #
    print("Predicted index:  {}".format(predict_index))
    return predict_index

In [3]:
def extract_numbers(data):
    numbers = []
    for arr in data:
        numbers.append(arr.tolist())
    
    mode = stats.mode(numbers, keepdims=True)
    return mode.mode[0]

In [66]:
def final_score(sound_file):
    genres_list = ["Blues","Classical","Country", "Disco", "Hiphop", "Jazz","Metal","Pop","Raggae","Rock"]
    inputs = process_new_song(sound_file)
    prediction = []

    for i in range(6):
        pred = predict_new(model, inputs[i+1])
        prediction.append(pred[0])

        result = extract_numbers(prediction)
    play_sound(sound_file)
    return genres_list[result]

In [67]:
#file_path = input("Enter the file path:")
#result = final_score(r"{}".format(file_path))
result = final_score(r"C:\Users\Bruno Santos\Desktop\Iron Hack - Semanas\Final Project\FINAL\Full Project\test\test\J. Cole - MIDDLE CHILD (Official Music Video).wav")
result

Predicted index:  [4]
Predicted index:  [8]
Predicted index:  [4]
Predicted index:  [7]
Predicted index:  [7]
Predicted index:  [4]


'Hiphop'

## Predict a Recorded Sound

In [4]:
def process_recorded_song():
        data= { 
        "mfcc": []
        
       }
        duration = 30
        SAMPLE_RATE = 22050
        SAMPLES_PER_TRACK = SAMPLE_RATE * duration
        n_fft = 2048 
        hop_length = 512
        num_samples_per_segment = int(SAMPLES_PER_TRACK / 10)
        expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment/hop_length)
        print("Recording started...")

        # Record audio
        audio = sd.rec(int(duration * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, blocking=True)
        audio = audio.flatten()
        print("Recording finished.")
        for s in range(10):
                start_sample = num_samples_per_segment * s
                finish_sample = start_sample + num_samples_per_segment

                #store mfcc for segment if it has the expected lenght
                mfcc = librosa.feature.mfcc(y=audio[start_sample:finish_sample], n_mfcc=13, hop_length=512)
                mfcc = mfcc.T
                if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        #print("{}, segment:{}".format(s))

        inputs = np.array(data["mfcc"])
        inputs = inputs[..., np.newaxis]
    
        return inputs

In [5]:
import pickle
model = pickle.load(open('model.pkl','rb'))

In [6]:
def final_score(model):
    genres_list = ["Blues","Classical","Country", "Disco", "Hiphop", "Jazz","Metal","Pop","Raggae","Rock"]
    inputs = process_recorded_song()
    prediction = []
    for i in range(7):
        pred = predict_new(model, inputs[i])
        prediction.append(pred[0])

        result = extract_numbers(prediction)
    return genres_list[result]

In [8]:
final_score(model)

Recording started...
Recording finished.
Predicted index:  [9]
Predicted index:  [4]
Predicted index:  [4]
Predicted index:  [7]
Predicted index:  [4]
Predicted index:  [8]
Predicted index:  [8]


'Hiphop'

## Evaluate the model

In [51]:
import pickle
model = pickle.load(open('model.pkl','rb'))

In [52]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)  
y_pred
 

report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.95      0.88      0.92       263
           1       0.98      0.93      0.96       276
           2       0.78      0.92      0.84       255
           3       0.92      0.88      0.90       215
           4       0.85      0.91      0.88       246
           5       0.86      0.95      0.90       232
           6       0.95      0.96      0.96       256
           7       0.85      0.96      0.90       251
           8       0.94      0.79      0.86       259
           9       0.88      0.74      0.80       246

    accuracy                           0.89      2499
   macro avg       0.90      0.89      0.89      2499
weighted avg       0.90      0.89      0.89      2499



In [None]:
#bimode