In [1]:
# Importing essential libraries for audio processing, data handling, and machine learning
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D


Helper Functions

In [3]:
# Function to extract features (Mel Spectrogram) from audio files
def extract_features(file_path, n_mfcc=40):
    """
    Extract Mel-frequency cepstral coefficients (MFCC) from an audio file.

    Parameters:
    - file_path (str): Path to the audio file.
    - n_mfcc (int): Number of MFCC features to extract.

    Returns:
    - mfcc (ndarray): Extracted MFCC features.
    """
    audio, sr = librosa.load(file_path, sr=None)  # Load the audio file
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)  # Extract MFCC features
    return mfcc


Data Loading and Preprocessing

In [11]:
# Directory containing audio files organized by class (e.g., guitar notes)
DATA_DIR = 'C:/Users/CJHx6/OneDrive/AI_Class/GIT/Group_3_Project/Resources/guitar-notes/Notes Datasets'

# Function to extract features from audio files (as you did earlier)
def extract_features(file_path):
    # Load audio file
    y, sr = librosa.load(file_path, sr=None)
    
    # Extract features (you can customize this based on your model needs)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # Example feature extraction (MFCCs)
    
    # Return the extracted features
    return mfccs

# Collecting all audio files and their labels
file_paths = []
labels = []
for dirpath, dirnames, filenames in os.walk(DATA_DIR):
    for file in filenames:
        if file.endswith(".wav"):
            file_paths.append(os.path.join(dirpath, file))
            labels.append(os.path.basename(dirpath))  # Assuming folder names are the class labels

# Encoding labels into numerical values
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Extracting features for all audio files
features = []
for file_path in file_paths:
    features.append(extract_features(file_path))

# Convert features to numpy array and pad for uniform input shape
features = [librosa.util.fix_length(feature, size=500, axis=1) for feature in features]
X = np.array(features)
X = X[..., np.newaxis]  # Adding channel dimension for CNN

# Convert labels to numpy array
y = np.array(encoded_labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the splits
print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

Training set shape: (1816, 13, 500, 1), Testing set shape: (454, 13, 500, 1)


Model Architecture

In [15]:
from keras.layers import Input

# Building a Convolutional Neural Network (CNN)
model = Sequential()

# Add Input layer to define input shape
model.add(Input(shape=(X_train.shape[1], X_train.shape[2], 1)))  # Define input shape here

# First convolutional layer
model.add(Conv2D(32, kernel_size=(3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

# Second convolutional layer
model.add(Conv2D(64, kernel_size=(3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

# Flatten and add dense layers
model.add(Flatten())
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(len(label_encoder.classes_)))  # Output layer for classification
model.add(Activation('softmax'))

# Compiling the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


Model Training

In [19]:
# Training the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Saving the trained model for future use
model.save('guitar_note_classifier.keras')


Epoch 1/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.4870 - loss: 1.4301 - val_accuracy: 0.4604 - val_loss: 1.6661
Epoch 2/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5189 - loss: 1.3601 - val_accuracy: 0.5242 - val_loss: 1.5988
Epoch 3/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5312 - loss: 1.2945 - val_accuracy: 0.5308 - val_loss: 1.5777
Epoch 4/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5683 - loss: 1.2733 - val_accuracy: 0.5044 - val_loss: 1.6293
Epoch 5/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5665 - loss: 1.2281 - val_accuracy: 0.5022 - val_loss: 1.5763
Epoch 6/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5480 - loss: 1.2422 - val_accuracy: 0.5198 - val_loss: 1.5616
Epoch 7/100
[1m57/57[0m [

Prediction and Visualization

In [21]:
# Predicting on the test set
predictions = model.predict(X_test)
predicted_labels = np.argmax(predictions, axis=1)

# Confusion matrix to evaluate performance
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, predicted_labels))
print(classification_report(y_test, predicted_labels, target_names=label_encoder.classes_))

# Function to predict the class of a new audio file
def predict_audio(file_path):
    """
    Predicts the label of a given audio file.

    Parameters:
    - file_path (str): Path to the audio file.

    Returns:
    - predicted_label (str): Predicted label for the audio file.
    """
    feature = extract_features(file_path)
    feature = librosa.util.fix_length(feature, size=500, axis=1)[..., np.newaxis]
    feature = np.expand_dims(feature, axis=0)  # Adding batch dimension
    prediction = model.predict(feature)
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
    return predicted_label[0]

# Example usage:
# print(predict_audio('path_to_new_audio_file.wav'))


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[[10  0  0 ...  0  0  0]
 [ 0  5  0 ...  0  1  0]
 [ 0  3  4 ...  0  0  0]
 ...
 [ 0  0  2 ...  8  0  0]
 [ 0  1  0 ...  0  8  1]
 [ 0  0  0 ...  0  0  8]]
               precision    recall  f1-score   support

         1. E       0.77      0.91      0.83        11
  10. C-sharp       0.28      0.38      0.32        13
        11. D       0.57      0.36      0.44        11
  12. D-sharp       0.56      0.29      0.38        17
       13. E1       1.00      0.87      0.93        15
       14. F1       0.62      0.62      0.62        13
15. F-sharp 1       0.71      0.92      0.80        13
       16. G1       0.64      0.39      0.48        18
17. G-sharp 1       0.92      0.85      0.88        13
       18. A1       0.42      0.62      0.50        13
19. A-sharp 1       0.55      0.43      0.48        14
         2. F       0.31      0.64      0.42        14
       20. B1       0.62      0.50      0.56        10


Future Steps

In [None]:
# Adding functionality to detect and separate guitar notes from songs (to be expanded)
# Example: Using audio source separation (librosa's harmonic-percussive source separation)
def extract_guitar_notes(audio_file):
    """
    Extract guitar-like harmonic notes from a mixed audio file.

    Parameters:
    - audio_file (str): Path to the mixed audio file.

    Returns:
    - harmonic (ndarray): Harmonic component of the audio, likely containing guitar sounds.
    """
    y, sr = librosa.load(audio_file, sr=None)
    harmonic, _ = librosa.effects.hpss(y)  # Harmonic-Percussive Source Separation
    return harmonic
