In [2]:
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Function to extract MFCC features from an audio file
def extract_features(audio_path, n_mfcc=13, sr=22050):
    """Return the time-averaged MFCC vector of an audio file.

    Args:
        audio_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients to compute.
        sr: Sample rate passed to ``librosa.load``. Defaults to 22050, the
            value that was previously hard-coded here.

    Returns:
        The result of averaging the MFCC matrix over its second axis
        (one value per coefficient), or ``None`` if loading or feature
        extraction failed for any reason.
    """
    try:
        # Load the audio file
        signal, sample_rate = librosa.load(audio_path, sr=sr)
        # Extract MFCC features
        mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
        # Collapse the per-frame coefficients into a single fixed-length vector
        mfcc_scaled = np.mean(mfcc.T, axis=0)
        return mfcc_scaled
    except Exception as e:
        # Deliberately broad: callers skip files that yield None, so a single
        # unreadable file must not abort a whole dataset scan. The cause is
        # reported so that failures can still be diagnosed.
        print(
            f"Error encountered while parsing file: {audio_path} "
            f"({type(e).__name__}: {e})"
        )
        return None

# Function to prepare the dataset by extracting features and labels
def prepare_dataset(dataset_path, n_mfcc=13):
    """Build train/test feature and label arrays from a folder-per-class dataset.

    Every sub-directory of ``dataset_path`` is treated as one class and its
    name is used as the label of each file inside it. Files for which
    ``extract_features`` returns ``None`` are skipped.

    Args:
        dataset_path: Directory containing one sub-directory per class.
        n_mfcc: Number of MFCC coefficients extracted per file.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, label_encoder)`` where
        the labels are integer-encoded and ``label_encoder`` is the fitted
        ``LabelEncoder`` that maps them back to class names.

    Raises:
        ValueError: If no file under ``dataset_path`` produced features.
    """
    features = []
    labels = []

    # os.listdir order is arbitrary; sorting keeps the sample order, and
    # therefore the seeded train/test split below, identical across runs
    # and machines.
    for class_name in sorted(os.listdir(dataset_path)):
        class_dir = os.path.join(dataset_path, class_name)
        if not os.path.isdir(class_dir):
            continue
        # Traverse each audio file in the class directory
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            mfcc = extract_features(file_path, n_mfcc=n_mfcc)
            if mfcc is not None:
                features.append(mfcc)
                labels.append(class_name)  # Use class name as label

    # Fail early with an actionable message instead of an opaque error from
    # the encoder or the splitter further down.
    if not features:
        raise ValueError(
            f"No usable audio files found under {dataset_path!r}; expected "
            "one sub-directory per class containing readable audio files"
        )

    # Convert features and labels to numpy arrays
    X = np.array(features)
    y = np.array(labels)

    # Encode string labels into integers
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test, label_encoder

# Load the dataset and prepare features and labels
dataset_path = 'dataset'  # Update this with the actual dataset path
X_train, X_test, y_train, y_test, label_encoder = prepare_dataset(dataset_path)

# Size the network from the data instead of hard-coding it, so the model
# stays valid if prepare_dataset is called with a different n_mfcc.
num_features = X_train.shape[1]
num_classes = len(label_encoder.classes_)

# Define the voice classification model using TensorFlow
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(num_features,)),  # One input per MFCC coefficient
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')  # Number of output classes
])

# Compile the model; the sparse loss matches the integer-encoded labels
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Display the model architecture
model.summary()

# Train the model
# NOTE(review): the validation metrics are computed on the same split that
# is evaluated as the test set below.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc}")

# Save the trained model
model.save('voice_classification_model.h5')

# Optional: Convert predictions back to original class names
def predict_voice_class(audio_path):
    """Classify one audio file with the trained model.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        The class name decoded by ``label_encoder`` for the highest-scoring
        model output, or the string ``"Error in feature extraction"`` when
        ``extract_features`` returns ``None``.
    """
    features = extract_features(audio_path)
    if features is None:
        return "Error in feature extraction"

    # The model expects a batch, so present the vector as a single row.
    batch = features.reshape(1, -1)
    scores = model.predict(batch)
    # Index of the highest score, mapped back to the original label.
    best_index = np.argmax(scores)
    decoded = label_encoder.inverse_transform([best_index])
    return decoded[0]

# Example: classify one recording with predict_voice_class and print the
# result. The printed value is either a class name or the error string that
# predict_voice_class returns when feature extraction fails.
audio_file_path = 'dataset/female/03-01-01-01-01-01-02.wav'
predicted_class = predict_voice_class(audio_file_path)
print(f"Predicted voice class: {predicted_class}")


Epoch 1/50
3/3 ━━━━━━━━━━━━━━━━━━━━ 2s 108ms/step - accuracy: 0.5169 - loss: 17.5406 - val_accuracy: 0.5417 - val_loss: 7.5726
Epoch 2/50
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step - accuracy: 0.5065 - loss: 9.6006 - val_accuracy: 0.5417 - val_loss: 7.8647
Epoch 3/50
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 23ms/step - accuracy: 0.5052 - loss: 6.2879 - val_accuracy: 0.4583 - val_loss: 8.9247
Epoch 4/50
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step - accuracy: 0.5091 - loss: 7.5658 - val_accuracy: 0.4583 - val_loss: 5.8246
Epoch 5/50
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 20ms/step - accuracy: 0.4180 - loss: 4.3321 - val_accuracy: 0.5417 - val_loss: 4.8448
Epoch 6/50
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step - accuracy: 0.4948 - loss: 5.2548 - val_accuracy: 0.5417 - val_loss: 1.9504
Epoch 7/50
3/3 ━━━━━━━━━━━━━━━━ ... [output truncated]



Test accuracy: 0.9583333134651184
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Predicted voice class: female
