In [None]:
import os
import librosa
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# Root folder of the dataset. Each instrument is read from a sub-folder of
# this directory that is named exactly like its label.
audio_dir = 'path_to_your_audio_files/'

# Instrument classes; the position of a name in this list is the integer
# class id used as the training target.
instrument_labels = [
    'Piano',
    'Clarinet',
    'AcGuitar',
    'ElGuitar',
    'Saxophone',
    'Keyboard',
    'Violin',
    'Trumpet',
    'Voice',
]

# Function to convert audio to Mel spectrogram
def audio_to_mel_spectrogram(audio_file, sr=22050, n_mels=128, duration=5, offset=0):
    """Load an audio clip and return its Mel spectrogram in decibels.

    Args:
        audio_file: Path of the audio file to read.
        sr: Sampling rate the audio is resampled to while loading.
        n_mels: Number of Mel bands in the spectrogram.
        duration: Maximum number of seconds read from the file.
        offset: Number of seconds skipped before reading starts.

    Returns:
        The Mel power spectrogram converted to dB with the clip's own peak
        as reference (``ref=np.max``), so the largest value is 0 and every
        other value is negative.
    """
    signal, sample_rate = librosa.load(audio_file, sr=sr, duration=duration, offset=offset)
    # The signal must be passed as ``y=``: librosa >= 0.10 made the arguments
    # of melspectrogram keyword-only, so the positional form raises TypeError.
    # ``fmax=8000`` caps the Mel filter bank at 8 kHz.
    mel_spec = librosa.feature.melspectrogram(
        y=signal, sr=sample_rate, n_mels=n_mels, fmax=8000
    )
    return librosa.power_to_db(mel_spec, ref=np.max)

# Prepare the dataset
def prepare_dataset(audio_dir, labels, img_size=(128, 128)):
    """Build the spectrogram image array and integer targets for a dataset.

    Every regular file found in ``audio_dir/<label>/`` is converted with
    ``audio_to_mel_spectrogram``, resized to ``img_size`` and collected
    together with the index of its label.

    Args:
        audio_dir: Root directory that contains one sub-directory per label.
        labels: Sequence of label names; a sample's target is the position
            of its label in this sequence.
        img_size: Target size as ``(width, height)``, the order the original
            ``cv2.resize`` call expected.

    Returns:
        A tuple ``(images, labels_list)``. ``images`` is a float array of
        shape ``(n_samples, height, width)`` min-max scaled to ``[0, 1]``
        over the whole dataset; ``labels_list`` is an integer array of shape
        ``(n_samples,)``. Both are empty when no audio file was found.

    Raises:
        FileNotFoundError: If a label sub-directory does not exist.
    """
    width, height = img_size
    images = []
    labels_list = []

    for label_index, label in enumerate(labels):
        label_dir = os.path.join(audio_dir, label)
        # Sorted so the sample order (and therefore any seeded split made
        # later) does not depend on the order the filesystem lists entries.
        for audio_file in sorted(os.listdir(label_dir)):
            audio_path = os.path.join(label_dir, audio_file)
            if not os.path.isfile(audio_path):
                continue  # nested directories are not audio clips

            mel_spec_db = audio_to_mel_spectrogram(audio_path)
            if mel_spec_db.size == 0:
                continue  # nothing to resize for an empty clip

            # tf.image.resize (bilinear by default) replaces the original
            # cv2.resize call: cv2 was never imported in this file, while
            # TensorFlow already is. It needs a channel axis and takes the
            # size as (height, width).
            resized = tf.image.resize(mel_spec_db[..., np.newaxis], (height, width))
            images.append(resized.numpy()[..., 0])
            labels_list.append(label_index)

    labels_list = np.array(labels_list, dtype=int)
    if not images:
        return np.empty((0, height, width), dtype=np.float32), labels_list

    images = np.stack(images)

    # Min-max scaling. The spectrograms are in dB relative to their own peak,
    # so the global maximum is ~0 and dividing by it (as the code used to do)
    # yields inf/NaN instead of normalized values.
    lowest = images.min()
    span = images.max() - lowest
    if span > 0:
        images = (images - lowest) / span
    else:
        images = np.zeros_like(images)

    return images, labels_list

# Prepare dataset
X, y = prepare_dataset(audio_dir, instrument_labels)

# Conv2D works on (height, width, channels) images, so give the single-channel
# spectrograms an explicit trailing channel axis instead of relying on Keras
# to reconcile (N, H, W) data with a (H, W, 1) input declaration.
if X.ndim == 3:
    X = X[..., np.newaxis]

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the CNN model: three conv/pool stages followed by a dense classifier.
# The input shape is taken from the data so it always matches the size the
# dataset was resized to, and it is declared once (the first Conv2D no longer
# repeats `input_shape`).
input_shape = X.shape[1:]
model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(instrument_labels), activation='softmax')  # One probability per instrument
])

# Compile the model; targets are integer class ids, hence the sparse loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE: the test split is also used for per-epoch validation, so the
# validation curve and the final test score are computed on the same data.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

# Save the model
model.save('instrument_classification_model.h5')