In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input
import librosa as lb
import pandas as pd
import librosa.display
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelBinarizer, StandardScaler


In [2]:
def extract_features(main_dir, list_of_features):
    features_list = []
    labels = []

    for subfolder in os.listdir(main_dir):
        subfolder_path = os.path.join(main_dir, subfolder)
        if os.path.isdir(subfolder_path):
            # Iterate over each audio file in the subfolder
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.wav'):
                    file_path = os.path.join(subfolder_path, filename)
                    
                    y, sr = librosa.load(file_path)

                    features = []
                    if 'mfcc' in list_of_features:
                        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                        features.extend(mfcc.mean(axis=1))

                    if 'chroma' in list_of_features:
                        stft = np.abs(librosa.stft(y))
                        chroma = librosa.feature.chroma_stft(S=stft, sr=sr)
                        features.extend(chroma.mean(axis=1))

                    if 'melspectrogram' in list_of_features:
                        mel_spect = librosa.feature.melspectrogram(y=y, sr=sr)
                        mel_spect_db = librosa.power_to_db(mel_spect, ref=np.max)
                        features.extend(mel_spect_db.mean(axis=1))

                    features_list.append(features)
                    labels.append(filename[7])  

    return features_list, labels


In [3]:
def load_data(test_size1=0.2):
    filename = r'C:\Users\pashu\OneDrive\Desktop\speech_recong_project\Audio_Speech_Actors_01-24'
    filename1 = r'C:\Users\pashu\OneDrive\Desktop\speech_recong_project\Audio_Song_Actors_01-24'
    listf = ['mfcc', 'chroma', 'melspectrogram']
    
    features, labels = extract_features(main_dir=filename, list_of_features=listf)
    features1, labels1 = extract_features(main_dir=filename1, list_of_features=listf)
    
    features_array = np.array(features, dtype=np.float32)
    labels_array = np.array(labels)
    features_array1 = np.array(features1, dtype=np.float32)
    labels_array1 = np.array(labels1)
    
    features_array2 = np.concatenate((features_array, features_array1), axis=0)
    labels_array2 = np.concatenate((labels_array, labels_array1), axis=0)
    
    scaler = StandardScaler()
    features_array2 = scaler.fit_transform(features_array2)
    
    lb = LabelBinarizer()
    labels_one_hot = lb.fit_transform(labels_array2)
    
    X_train, X_test, y_train, y_test = train_test_split(features_array2, labels_one_hot, test_size=test_size1, random_state=42)
    return X_train, X_test, y_train, y_test, lb



In [4]:
X_train, X_test, y_train, y_test, lb = load_data()

In [5]:
model = tf.keras.Sequential([
    Dense(300, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(y_train.shape[1], activation='softmax')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  
model.fit(X_train, y_train, epochs=200, batch_size=256, validation_split=0.1)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f'Test Accuracy: {test_accuracy*100:.4f}')

Epoch 1/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 58ms/step - accuracy: 0.1984 - loss: 2.1205 - val_accuracy: 0.2690 - val_loss: 1.8023
Epoch 2/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.3603 - loss: 1.6601 - val_accuracy: 0.3604 - val_loss: 1.6762
Epoch 3/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4682 - loss: 1.4510 - val_accuracy: 0.4061 - val_loss: 1.5373
Epoch 4/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5312 - loss: 1.3072 - val_accuracy: 0.4365 - val_loss: 1.4403
Epoch 5/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5802 - loss: 1.2407 - val_accuracy: 0.4619 - val_loss: 1.3586
Epoch 6/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6130 - loss: 1.1797 - val_accuracy: 0.5127 - val_loss: 1.3013
Epoch 7/200
[1m7/7[0m [32m━━━━━━━━━━━

In [7]:
print(f'Test Accuracy: {test_accuracy*100:.4f}')

Test Accuracy: 75.7638


In [8]:
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
print(classification_report(y_test_labels, y_pred_labels))

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
              precision    recall  f1-score   support

           0       0.82      0.74      0.78        38
           1       0.78      0.80      0.79        81
           2       0.84      0.66      0.74        73
           3       0.67      0.83      0.74        71
           4       0.83      0.83      0.83        69
           5       0.83      0.74      0.78        80
           6       0.62      0.64      0.63        45
           7       0.64      0.79      0.71        34

    accuracy                           0.76       491
   macro avg       0.75      0.75      0.75       491
weighted avg       0.77      0.76      0.76       491

