# RNN Testing

An RNN (recurrent neural network) is a deep learning model that is trained to process and convert a sequential data input into a specific sequential data output, particularly into a time series.

For music classification, using such databases as the GTZAN dataset, it can sometimes be used as a tool to grab and study useful features - features that are different between different genres.

## 1 - Do the imports

In [5]:
import os
import numpy as np # For numerical operations
import matplotlib.pyplot as plt # For plotting
import gc # For garbage collection
import librosa # For audio processing

import sys

## 2 - Set up the basepath and the genres

In [6]:
BASEPATH = os.path.join("./Data", "genres_original")
GENRES = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]

## 3 - Begin the 30 second clip extraction

In [7]:
def featureExtraction(filepath, sr=22050, n_mfcc=13, duration=30):
	y, sr = librosa.load(filepath, sr=sr, duration=duration)
	mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
	return mfcc.T

features = []
labels = []

for genre_idx, genre in enumerate(GENRES):
	genre_path = os.path.join(BASEPATH, genre)
	for file in os.listdir(genre_path):
		if not file.endswith(".wav"):
			continue
		filePath = os.path.join(genre_path, file)
		print(filePath)
		if filePath.endswith(".wav"):
			mfcc = featureExtraction(filePath)
			print(f"MFCC for {genre} extracted with shape: {mfcc.shape}")
			if mfcc.shape[0] < 1300:
				mfcc = np.pad(mfcc, ((0, 1300 - mfcc.shape[0]), (0, 0)), mode='constant')
			else:
				mfcc = mfcc[:1300]
			features.append(mfcc)
			labels.append(genre_idx)
  
X = np.array(features)
y = np.array(labels)

print(X.shape, y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

./Data/genres_original/blues/blues.00093.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00087.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00050.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00044.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00078.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00079.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00045.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00051.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00086.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00092.wav
MFCC for blues extracted with shape: (1292, 13)
./Data/genres_original/blues/blues.00084.wav
MFCC for blues extracted 

(800, 200, 800, 200)

## 4 - Create the RNN for the audio, and then run and compile it

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking

model = Sequential([
	Masking(mask_value=0.0, input_shape=(1300, 13)),
	LSTM(128, return_sequences=True),
	Dropout(0.2),
	LSTM(64, return_sequences=False),
	Dropout(0.2),
	Dense(64, activation='relu'),
	Dense(len(GENRES), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 1s/step - accuracy: 0.1938 - loss: 2.2283 - val_accuracy: 0.3550 - val_loss: 1.9486
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 1s/step - accuracy: 0.3927 - loss: 1.8300 - val_accuracy: 0.4300 - val_loss: 1.7570
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 1s/step - accuracy: 0.4274 - loss: 1.6475 - val_accuracy: 0.4200 - val_loss: 1.5833
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 1s/step - accuracy: 0.5255 - loss: 1.4227 - val_accuracy: 0.4450 - val_loss: 1.5428
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 1s/step - accuracy: 0.5363 - loss: 1.3482 - val_accuracy: 0.4950 - val_loss: 1.4416
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 1s/step - accuracy: 0.5492 - loss: 1.2339 - val_accuracy: 0.5000 - val_loss: 1.4411
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━━━

## 5 - Grab the accuracy of said 30-second RNN

In [9]:
accuracy = history.history['accuracy']
print(f"Final training accuracy: {accuracy[-1]:.4f}")

Final training accuracy: 0.8400


## 6 - Grab the 3-second clip extraction

In [10]:
features = []
labels = []

for genre_idx, genre in enumerate(GENRES):
	genre_path = os.path.join(BASEPATH, genre)
	for file in os.listdir(genre_path):
		if not file.endswith(".wav"):
			continue

		filePath = os.path.join(genre_path, file)
		print(filePath)
		if filePath.endswith(".wav"):
      
			y, sr = librosa.load(filePath, sr=22050, duration=30)
			y = librosa.util.fix_length(y, size=22050 * 30)  # Ensure fixed length for consistency
			segments = np.array_split(y, 10)
   
			for segment in segments:
				mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13)
				mfccT = mfcc.T
				if mfccT.shape[0] < 1300:
					mfccT = np.pad(mfccT, ((0, 1300 - mfccT.shape[0]), (0, 0)), mode='constant')
				else:
					mfccT = mfccT[:1300]
				features.append(mfccT)
				labels.append(genre_idx)
  
X = np.array(features)
y = np.array(labels)

print(X.shape, y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

./Data/genres_original/blues/blues.00093.wav
./Data/genres_original/blues/blues.00087.wav
./Data/genres_original/blues/blues.00050.wav
./Data/genres_original/blues/blues.00044.wav
./Data/genres_original/blues/blues.00078.wav
./Data/genres_original/blues/blues.00079.wav
./Data/genres_original/blues/blues.00045.wav
./Data/genres_original/blues/blues.00051.wav
./Data/genres_original/blues/blues.00086.wav
./Data/genres_original/blues/blues.00092.wav
./Data/genres_original/blues/blues.00084.wav
./Data/genres_original/blues/blues.00090.wav
./Data/genres_original/blues/blues.00047.wav
./Data/genres_original/blues/blues.00053.wav
./Data/genres_original/blues/blues.00052.wav
./Data/genres_original/blues/blues.00046.wav
./Data/genres_original/blues/blues.00091.wav
./Data/genres_original/blues/blues.00085.wav
./Data/genres_original/blues/blues.00081.wav
./Data/genres_original/blues/blues.00095.wav
./Data/genres_original/blues/blues.00042.wav
./Data/genres_original/blues/blues.00056.wav
./Data/gen

(8000, 2000, 8000, 2000)

## 7 - Create the 3-second neural network

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking

model = Sequential([
	Masking(mask_value=0.0, input_shape=(1300, 13)),
	LSTM(128, return_sequences=True),
	Dropout(0.2),
	LSTM(64, return_sequences=False),
	Dropout(0.2),
	Dense(64, activation='relu'),
	Dense(len(GENRES), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [12]:
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 1s/step - accuracy: 0.3462 - loss: 1.8410 - val_accuracy: 0.5185 - val_loss: 1.3340
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 1s/step - accuracy: 0.5079 - loss: 1.3479 - val_accuracy: 0.5695 - val_loss: 1.2250
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 1s/step - accuracy: 0.5580 - loss: 1.2175 - val_accuracy: 0.5905 - val_loss: 1.1487
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 995ms/step - accuracy: 0.6096 - loss: 1.0987 - val_accuracy: 0.6225 - val_loss: 1.0882
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 1s/step - accuracy: 0.6346 - loss: 1.0068 - val_accuracy: 0.6595 - val_loss: 0.9795
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 989ms/step - accuracy: 0.6851 - loss: 0.8954 - val_accuracy: 0.6630 - val_loss: 0.9510
Epoch 7/20
[1m2

## 8 - Grab the accuracy of the 3-second neural network

In [13]:
accuracy = history.history['accuracy']
print(f"Final training accuracy: {accuracy[-1]:.4f}")

Final training accuracy: 0.8861


## 9 - 5-second clips

In [14]:
features = []
labels = []

for genre_idx, genre in enumerate(GENRES):
	genre_path = os.path.join(BASEPATH, genre)
	for file in os.listdir(genre_path):
		if not file.endswith(".wav"):
			continue

		filePath = os.path.join(genre_path, file)
		print(filePath)
		if filePath.endswith(".wav"):
      
			y, sr = librosa.load(filePath, sr=22050, duration=30)
			y = librosa.util.fix_length(y, size=22050 * 30)  # Ensure fixed length for consistency
			segments = np.array_split(y, 5)
   
			for segment in segments:
				mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13)
				mfccT = mfcc.T
				if mfccT.shape[0] < 1300:
					mfccT = np.pad(mfccT, ((0, 1300 - mfccT.shape[0]), (0, 0)), mode='constant')
				else:
					mfccT = mfccT[:1300]
				features.append(mfccT)
				labels.append(genre_idx)
  
X = np.array(features)
y = np.array(labels)

print(X.shape, y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking

model = Sequential([
	Masking(mask_value=0.0, input_shape=(1300, 13)),
	LSTM(128, return_sequences=True),
	Dropout(0.2),
	LSTM(64, return_sequences=False),
	Dropout(0.2),
	Dense(64, activation='relu'),
	Dense(len(GENRES), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

accuracy = history.history['accuracy']
print(f"Final training accuracy: {accuracy[-1]:.4f}")

./Data/genres_original/blues/blues.00093.wav
./Data/genres_original/blues/blues.00087.wav
./Data/genres_original/blues/blues.00050.wav
./Data/genres_original/blues/blues.00044.wav
./Data/genres_original/blues/blues.00078.wav
./Data/genres_original/blues/blues.00079.wav
./Data/genres_original/blues/blues.00045.wav
./Data/genres_original/blues/blues.00051.wav
./Data/genres_original/blues/blues.00086.wav
./Data/genres_original/blues/blues.00092.wav
./Data/genres_original/blues/blues.00084.wav
./Data/genres_original/blues/blues.00090.wav
./Data/genres_original/blues/blues.00047.wav
./Data/genres_original/blues/blues.00053.wav
./Data/genres_original/blues/blues.00052.wav
./Data/genres_original/blues/blues.00046.wav
./Data/genres_original/blues/blues.00091.wav
./Data/genres_original/blues/blues.00085.wav
./Data/genres_original/blues/blues.00081.wav
./Data/genres_original/blues/blues.00095.wav
./Data/genres_original/blues/blues.00042.wav
./Data/genres_original/blues/blues.00056.wav
./Data/gen

Epoch 1/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 973ms/step - accuracy: 0.2696 - loss: 2.0281 - val_accuracy: 0.4720 - val_loss: 1.4807
Epoch 2/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 955ms/step - accuracy: 0.4534 - loss: 1.4935 - val_accuracy: 0.4950 - val_loss: 1.3647
Epoch 3/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 958ms/step - accuracy: 0.5248 - loss: 1.3251 - val_accuracy: 0.5240 - val_loss: 1.2878
Epoch 4/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 1s/step - accuracy: 0.5408 - loss: 1.2539 - val_accuracy: 0.5590 - val_loss: 1.2019
Epoch 5/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 1s/step - accuracy: 0.5862 - loss: 1.1556 - val_accuracy: 0.5680 - val_loss: 1.1838
Epoch 6/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 1s/step - accuracy: 0.5832 - loss: 1.1431 - val_accuracy: 0.5860 - val_loss: 1.1571
Epoch 7/20
[

## 10 - 30 second clips (metal and classical)

In [15]:
BASEPATH = os.path.join("./Data", "genres_original")
GENRES = ["classical", "metal"]

def featureExtraction(filepath, sr=22050, n_mfcc=13, duration=30):
	y, sr = librosa.load(filepath, sr=sr, duration=duration)
	mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
	return mfcc.T

features = []
labels = []

for genre_idx, genre in enumerate(GENRES):
	genre_path = os.path.join(BASEPATH, genre)
	for file in os.listdir(genre_path):
		if not file.endswith(".wav"):
			continue
		filePath = os.path.join(genre_path, file)
		print(filePath)
		if filePath.endswith(".wav"):
			mfcc = featureExtraction(filePath)
			print(f"MFCC for {genre} extracted with shape: {mfcc.shape}")
			if mfcc.shape[0] < 1300:
				mfcc = np.pad(mfcc, ((0, 1300 - mfcc.shape[0]), (0, 0)), mode='constant')
			else:
				mfcc = mfcc[:1300]
			features.append(mfcc)
			labels.append(genre_idx)
  
X = np.array(features)
y = np.array(labels)

print(X.shape, y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

model = Sequential([
	Masking(mask_value=0.0, input_shape=(1300, 13)),
	LSTM(128, return_sequences=True),
	Dropout(0.2),
	LSTM(64, return_sequences=False),
	Dropout(0.2),
	Dense(64, activation='relu'),
	Dense(len(GENRES), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
accuracy = history.history['accuracy']
print(f"Final training accuracy: {accuracy[-1]:.4f}")

./Data/genres_original/classical/classical.00079.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00045.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00051.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00086.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00092.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00093.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00087.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00050.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00044.wav
MFCC for classical extracted with shape: (1292, 13)
./Data/genres_original/classical/classical.00078.wav
MF

Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - accuracy: 0.6058 - loss: 0.6352 - val_accuracy: 0.9750 - val_loss: 0.2707
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.9557 - loss: 0.2597 - val_accuracy: 0.9750 - val_loss: 0.1036
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9416 - loss: 0.1580 - val_accuracy: 1.0000 - val_loss: 0.0383
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9687 - loss: 0.0734 - val_accuracy: 1.0000 - val_loss: 0.0194
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.9966 - loss: 0.0343 - val_accuracy: 1.0000 - val_loss: 0.0066
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9850 - loss: 0.0288 - val_accuracy: 1.0000 - val_loss: 0.0038
Epoch 7/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

## 11 - 30 second accuracy - Disco and Pop

In [16]:
BASEPATH = os.path.join("./Data", "genres_original")
GENRES = ["disco", "pop"]

def featureExtraction(filepath, sr=22050, n_mfcc=13, duration=30):
	y, sr = librosa.load(filepath, sr=sr, duration=duration)
	mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
	return mfcc.T

features = []
labels = []

for genre_idx, genre in enumerate(GENRES):
	genre_path = os.path.join(BASEPATH, genre)
	for file in os.listdir(genre_path):
		if not file.endswith(".wav"):
			continue
		filePath = os.path.join(genre_path, file)
		print(filePath)
		if filePath.endswith(".wav"):
			mfcc = featureExtraction(filePath)
			print(f"MFCC for {genre} extracted with shape: {mfcc.shape}")
			if mfcc.shape[0] < 1300:
				mfcc = np.pad(mfcc, ((0, 1300 - mfcc.shape[0]), (0, 0)), mode='constant')
			else:
				mfcc = mfcc[:1300]
			features.append(mfcc)
			labels.append(genre_idx)
  
X = np.array(features)
y = np.array(labels)

print(X.shape, y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

model = Sequential([
	Masking(mask_value=0.0, input_shape=(1300, 13)),
	LSTM(128, return_sequences=True),
	Dropout(0.2),
	LSTM(64, return_sequences=False),
	Dropout(0.2),
	Dense(64, activation='relu'),
	Dense(len(GENRES), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
accuracy = history.history['accuracy']
print(f"Final training accuracy: {accuracy[-1]:.4f}")

./Data/genres_original/disco/disco.00052.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00046.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00091.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00085.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00084.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00090.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00047.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00053.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00045.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00051.wav
MFCC for disco extracted with shape: (1292, 13)
./Data/genres_original/disco/disco.00079.wav
MFCC for disco extracted 

Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step - accuracy: 0.4954 - loss: 0.6946 - val_accuracy: 0.8250 - val_loss: 0.5544
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.8205 - loss: 0.5133 - val_accuracy: 0.8000 - val_loss: 0.4286
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.8529 - loss: 0.4082 - val_accuracy: 0.8500 - val_loss: 0.3575
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.8666 - loss: 0.3216 - val_accuracy: 0.9000 - val_loss: 0.2822
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.8977 - loss: 0.2503 - val_accuracy: 0.9000 - val_loss: 0.2432
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9856 - loss: 0.1154 - val_accuracy: 0.8750 - val_loss: 0.2128
Epoch 7/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

## 12 - 3 second accuracy (classical and metal)

In [17]:
BASEPATH = os.path.join("./Data", "genres_original")
GENRES = ["classical", "metal"]

features = []
labels = []

for genre_idx, genre in enumerate(GENRES):
	genre_path = os.path.join(BASEPATH, genre)
	for file in os.listdir(genre_path):
		if not file.endswith(".wav"):
			continue

		filePath = os.path.join(genre_path, file)
		print(filePath)
		if filePath.endswith(".wav"):
      
			y, sr = librosa.load(filePath, sr=22050, duration=30)
			y = librosa.util.fix_length(y, size=22050 * 30)  # Ensure fixed length for consistency
			segments = np.array_split(y, 10)
   
			for segment in segments:
				mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13)
				mfccT = mfcc.T
				if mfccT.shape[0] < 1300:
					mfccT = np.pad(mfccT, ((0, 1300 - mfccT.shape[0]), (0, 0)), mode='constant')
				else:
					mfccT = mfccT[:1300]
				features.append(mfccT)
				labels.append(genre_idx)
  
X = np.array(features)
y = np.array(labels)

print(X.shape, y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

model = Sequential([
	Masking(mask_value=0.0, input_shape=(1300, 13)),
	LSTM(128, return_sequences=True),
	Dropout(0.2),
	LSTM(64, return_sequences=False),
	Dropout(0.2),
	Dense(64, activation='relu'),
	Dense(len(GENRES), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
accuracy = history.history['accuracy']
print(f"Final training accuracy: {accuracy[-1]:.4f}")

./Data/genres_original/classical/classical.00079.wav
./Data/genres_original/classical/classical.00045.wav
./Data/genres_original/classical/classical.00051.wav
./Data/genres_original/classical/classical.00086.wav
./Data/genres_original/classical/classical.00092.wav
./Data/genres_original/classical/classical.00093.wav
./Data/genres_original/classical/classical.00087.wav
./Data/genres_original/classical/classical.00050.wav
./Data/genres_original/classical/classical.00044.wav
./Data/genres_original/classical/classical.00078.wav
./Data/genres_original/classical/classical.00052.wav
./Data/genres_original/classical/classical.00046.wav
./Data/genres_original/classical/classical.00091.wav
./Data/genres_original/classical/classical.00085.wav
./Data/genres_original/classical/classical.00084.wav
./Data/genres_original/classical/classical.00090.wav
./Data/genres_original/classical/classical.00047.wav
./Data/genres_original/classical/classical.00053.wav
./Data/genres_original/classical/classical.000

Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 1s/step - accuracy: 0.9195 - loss: 0.2700 - val_accuracy: 0.9875 - val_loss: 0.0391
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 1s/step - accuracy: 0.9979 - loss: 0.0138 - val_accuracy: 0.9875 - val_loss: 0.0503
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 995ms/step - accuracy: 0.9867 - loss: 0.0426 - val_accuracy: 0.9950 - val_loss: 0.0352
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 993ms/step - accuracy: 0.9968 - loss: 0.0108 - val_accuracy: 0.9925 - val_loss: 0.0402
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1s/step - accuracy: 0.9998 - loss: 0.0022 - val_accuracy: 0.9950 - val_loss: 0.0408
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1000ms/step - accuracy: 0.9991 - loss: 0.0071 - val_accuracy: 0.9950 - val_loss: 0.0389
Epoch 7/20
[1m50/50[0m [32m

## 13 - 3 second accuracy (disco and pop)

In [18]:
BASEPATH = os.path.join("./Data", "genres_original")
GENRES = ["disco", "pop"]

features = []
labels = []

for genre_idx, genre in enumerate(GENRES):
	genre_path = os.path.join(BASEPATH, genre)
	for file in os.listdir(genre_path):
		if not file.endswith(".wav"):
			continue

		filePath = os.path.join(genre_path, file)
		print(filePath)
		if filePath.endswith(".wav"):
      
			y, sr = librosa.load(filePath, sr=22050, duration=30)
			y = librosa.util.fix_length(y, size=22050 * 30)  # Ensure fixed length for consistency
			segments = np.array_split(y, 10)
   
			for segment in segments:
				mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13)
				mfccT = mfcc.T
				if mfccT.shape[0] < 1300:
					mfccT = np.pad(mfccT, ((0, 1300 - mfccT.shape[0]), (0, 0)), mode='constant')
				else:
					mfccT = mfccT[:1300]
				features.append(mfccT)
				labels.append(genre_idx)
  
X = np.array(features)
y = np.array(labels)

print(X.shape, y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

model = Sequential([
	Masking(mask_value=0.0, input_shape=(1300, 13)),
	LSTM(128, return_sequences=True),
	Dropout(0.2),
	LSTM(64, return_sequences=False),
	Dropout(0.2),
	Dense(64, activation='relu'),
	Dense(len(GENRES), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
accuracy = history.history['accuracy']
print(f"Final training accuracy: {accuracy[-1]:.4f}")

./Data/genres_original/disco/disco.00052.wav
./Data/genres_original/disco/disco.00046.wav
./Data/genres_original/disco/disco.00091.wav
./Data/genres_original/disco/disco.00085.wav
./Data/genres_original/disco/disco.00084.wav
./Data/genres_original/disco/disco.00090.wav
./Data/genres_original/disco/disco.00047.wav
./Data/genres_original/disco/disco.00053.wav
./Data/genres_original/disco/disco.00045.wav
./Data/genres_original/disco/disco.00051.wav
./Data/genres_original/disco/disco.00079.wav
./Data/genres_original/disco/disco.00086.wav
./Data/genres_original/disco/disco.00092.wav
./Data/genres_original/disco/disco.00093.wav
./Data/genres_original/disco/disco.00087.wav
./Data/genres_original/disco/disco.00078.wav
./Data/genres_original/disco/disco.00050.wav
./Data/genres_original/disco/disco.00044.wav
./Data/genres_original/disco/disco.00068.wav
./Data/genres_original/disco/disco.00040.wav
./Data/genres_original/disco/disco.00054.wav
./Data/genres_original/disco/disco.00083.wav
./Data/gen

Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 996ms/step - accuracy: 0.7590 - loss: 0.4854 - val_accuracy: 0.9025 - val_loss: 0.2444
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 986ms/step - accuracy: 0.9393 - loss: 0.1637 - val_accuracy: 0.9125 - val_loss: 0.2281
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 989ms/step - accuracy: 0.9635 - loss: 0.1139 - val_accuracy: 0.9225 - val_loss: 0.2068
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 987ms/step - accuracy: 0.9734 - loss: 0.0902 - val_accuracy: 0.9425 - val_loss: 0.1790
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 994ms/step - accuracy: 0.9834 - loss: 0.0637 - val_accuracy: 0.9250 - val_loss: 0.2069
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 984ms/step - accuracy: 0.9581 - loss: 0.1163 - val_accuracy: 0.9475 - val_loss: 0.1869
Epoch 7/20
[1m50/50[